/*
 * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
 *
 * Uses a block device as cache for other block devices; optimized for SSDs.
 * All allocation is done in buckets, which should match the erase block size
 * of the device.
 *
 * Buckets containing cached data are kept on a heap sorted by priority;
 * bucket priority is increased on cache hit, and periodically all the buckets
 * on the heap have their priority scaled down. This currently is just used as
 * an LRU but in the future should allow for more intelligent heuristics.
 *
 * Buckets have an 8 bit counter; freeing is accomplished by incrementing the
 * counter. Garbage collection is used to remove stale pointers.
 *
 * Indexing is done via a btree; nodes are not necessarily fully sorted, rather
 * as keys are inserted we only sort the pages that have not yet been written.
 * When garbage collection is run, we resort the entire node.
 *
 * All configuration is done via sysfs; see Documentation/bcache.txt.
 */

#include "bcache.h"
#include "btree.h"
#include "debug.h"
#include "request.h"
#include "writeback.h"

#include <linux/slab.h>
#include <linux/bitops.h>
#include <linux/hash.h>
#include <linux/prefetch.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <trace/events/bcache.h>

/*
 * Todo:
 * register_bcache: Return errors out to userspace correctly
 *
 * Writeback: don't undirty key until after a cache flush
 *
 * Create an iterator for key pointers
 *
 * On btree write error, mark bucket such that it won't be freed from the cache
 *
 * Journalling:
 *	Check for bad keys in replay
 *	Propagate barriers
 *	Refcount journal entries in journal_replay
 *
 * Garbage collection:
 *	Finish incremental gc
 *	Gc should free old UUIDs, data for invalid UUIDs
 *
 * Provide a way to list backing device UUIDs we have data cached for, and
 * probably how long it's been since we've seen them, and a way to invalidate
 * dirty data for devices that will never be attached again
 *
 * Keep 1 min/5 min/15 min statistics of how busy a block device has been, so
 * that based on that and how much dirty data we have we can keep writeback
 * from being starved
 *
 * Add a tracepoint or somesuch to watch for writeback starvation
 *
 * When btree depth > 1 and splitting an interior node, we have to make sure
 * alloc_bucket() cannot fail. This should be true but is not completely
 * obvious.
 *
 * Make sure all allocations get charged to the root cgroup
 *
 * Plugging?
 *
 * If data write is less than hard sector size of ssd, round up offset in open
 * bucket to the next whole sector
 *
 * Also lookup by cgroup in get_open_bucket()
 *
 * Superblock needs to be fleshed out for multiple cache devices
 *
 * Add a sysfs tunable for the number of writeback IOs in flight
 *
 * Add a sysfs tunable for the number of open data buckets
 *
 * IO tracking: Can we track when one process is doing io on behalf of another?
 * IO tracking: Don't use just an average, weigh more recent stuff higher
 *
 * Test module load/unload
 */

static const char * const op_types[] = {
	"insert", "replace"
};

static const char *op_type(struct btree_op *op)
{
	return op_types[op->type];
}

#define MAX_NEED_GC		64
#define MAX_SAVE_PRIO		72

#define PTR_DIRTY_BIT		(((uint64_t) 1 << 36))

#define PTR_HASH(c, k)							\
	(((k)->ptr[0] >> c->bucket_bits) | PTR_GEN(k, 0))

struct workqueue_struct *bch_gc_wq;
static struct workqueue_struct *btree_io_wq;

void bch_btree_op_init_stack(struct btree_op *op)
{
	memset(op, 0, sizeof(struct btree_op));
	closure_init_stack(&op->cl);
	op->lock = -1;
	bch_keylist_init(&op->keys);
}

/* Btree key manipulation */

static void bkey_put(struct cache_set *c, struct bkey *k, int level)
{
	if ((level && KEY_OFFSET(k)) || !level)
		__bkey_put(c, k);
}

/* Btree IO */
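
/*
 * The on disk checksum is seeded with the node's bucket pointer and covers
 * everything in the bset after the 64 bit csum field itself - hence the
 * "+ 8" below.
 */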
static uint64_t btree_csum_set(struct btree *b, struct bset *i)
{
	uint64_t crc = b->key.ptr[0];
	void *data = (void *) i + 8, *end = end(i);

	crc = bch_crc64_update(crc, data, end - data);
	return crc ^ 0xffffffffffffffffULL;
}

void bch_btree_node_read_done(struct btree *b)
{
	const char *err = "bad btree header";
	struct bset *i = b->sets[0].data;
	struct btree_iter *iter;

	iter = mempool_alloc(b->c->fill_iter, GFP_NOWAIT);
	iter->size = b->c->sb.bucket_size / b->c->sb.block_size;
	iter->used = 0;

	if (!i->seq)
		goto err;

	for (;
	     b->written < btree_blocks(b) && i->seq == b->sets[0].data->seq;
	     i = write_block(b)) {
		err = "unsupported bset version";
		if (i->version > BCACHE_BSET_VERSION)
			goto err;

		err = "bad btree header";
		if (b->written + set_blocks(i, b->c) > btree_blocks(b))
			goto err;

		err = "bad magic";
		if (i->magic != bset_magic(b->c))
			goto err;

		err = "bad checksum";
		switch (i->version) {
		case 0:
			if (i->csum != csum_set(i))
				goto err;
			break;
		case BCACHE_BSET_VERSION:
			if (i->csum != btree_csum_set(b, i))
				goto err;
			break;
		}

		err = "empty set";
		if (i != b->sets[0].data && !i->keys)
			goto err;

		bch_btree_iter_push(iter, i->start, end(i));

		b->written += set_blocks(i, b->c);
	}

	err = "corrupted btree";
	for (i = write_block(b);
	     index(i, b) < btree_blocks(b);
	     i = ((void *) i) + block_bytes(b->c))
		if (i->seq == b->sets[0].data->seq)
			goto err;

	bch_btree_sort_and_fix_extents(b, iter);

	i = b->sets[0].data;
	err = "short btree key";
	if (b->sets[0].size &&
	    bkey_cmp(&b->key, &b->sets[0].end) < 0)
		goto err;

	if (b->written < btree_blocks(b))
		bch_bset_init_next(b);
out:
	mempool_free(iter, b->c->fill_iter);
	return;
err:
	set_btree_node_io_error(b);
	bch_cache_set_error(b->c, "%s at bucket %zu, block %zu, %u keys",
			    err, PTR_BUCKET_NR(b->c, &b->key, 0),
			    index(i, b), i->keys);
	goto out;
}

static void btree_node_read_endio(struct bio *bio, int error)
{
	struct closure *cl = bio->bi_private;
	closure_put(cl);
}

void bch_btree_node_read(struct btree *b)
{
	uint64_t start_time = local_clock();
	struct closure cl;
	struct bio *bio;

	trace_bcache_btree_read(b);

	closure_init_stack(&cl);

	bio = bch_bbio_alloc(b->c);
	bio->bi_rw	= REQ_META|READ_SYNC;
	bio->bi_size	= KEY_SIZE(&b->key) << 9;
	bio->bi_end_io	= btree_node_read_endio;
	bio->bi_private	= &cl;

	bch_bio_map(bio, b->sets[0].data);

	bch_submit_bbio(bio, b->c, &b->key, 0);
	closure_sync(&cl);

	if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
		set_btree_node_io_error(b);

	bch_bbio_free(bio, b->c);

	if (btree_node_io_error(b))
		goto err;

	bch_btree_node_read_done(b);

	spin_lock(&b->c->btree_read_time_lock);
	bch_time_stats_update(&b->c->btree_read_time, start_time);
	spin_unlock(&b->c->btree_read_time_lock);

	return;
err:
	bch_cache_set_error(b->c, "io error reading bucket %lu",
			    PTR_BUCKET_NR(b->c, &b->key, 0));
}

static void btree_complete_write(struct btree *b, struct btree_write *w)
{
	if (w->prio_blocked &&
	    !atomic_sub_return(w->prio_blocked, &b->c->prio_blocked))
		wake_up_allocators(b->c);

	if (w->journal) {
		atomic_dec_bug(w->journal);
		__closure_wake_up(&b->c->journal.wait);
	}

	w->prio_blocked	= 0;
	w->journal	= NULL;
}

static void __btree_node_write_done(struct closure *cl)
{
	struct btree *b = container_of(cl, struct btree, io.cl);
	struct btree_write *w = btree_prev_write(b);

	bch_bbio_free(b->bio, b->c);
	b->bio = NULL;
	btree_complete_write(b, w);

	if (btree_node_dirty(b))
		queue_delayed_work(btree_io_wq, &b->work,
				   msecs_to_jiffies(30000));

	closure_return(cl);
}

static void btree_node_write_done(struct closure *cl)
{
	struct btree *b = container_of(cl, struct btree, io.cl);
	struct bio_vec *bv;
	int n;

	__bio_for_each_segment(bv, b->bio, n, 0)
		__free_page(bv->bv_page);

	__btree_node_write_done(cl);
}

static void btree_node_write_endio(struct bio *bio, int error)
{
	struct closure *cl = bio->bi_private;
	struct btree *b = container_of(cl, struct btree, io.cl);

	if (error)
		set_btree_node_io_error(b);

	bch_bbio_count_io_errors(b->c, bio, error, "writing btree");
	closure_put(cl);
}
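
/*
 * Write out the bset we've been inserting into: if we can allocate bounce
 * pages we copy the keys into them and let the write complete
 * asynchronously, otherwise we map the bset memory directly into the bio
 * and wait for the write synchronously.
 */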
static void do_btree_node_write(struct btree *b)
{
	struct closure *cl = &b->io.cl;
	struct bset *i = b->sets[b->nsets].data;
	BKEY_PADDED(key) k;

	i->version	= BCACHE_BSET_VERSION;
	i->csum		= btree_csum_set(b, i);

	BUG_ON(b->bio);
	b->bio = bch_bbio_alloc(b->c);

	b->bio->bi_end_io	= btree_node_write_endio;
	b->bio->bi_private	= &b->io.cl;
	b->bio->bi_rw		= REQ_META|WRITE_SYNC;
	b->bio->bi_size		= set_blocks(i, b->c) * block_bytes(b->c);
	bch_bio_map(b->bio, i);

	bkey_copy(&k.key, &b->key);
	SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + bset_offset(b, i));

	if (!bch_bio_alloc_pages(b->bio, GFP_NOIO)) {
		int j;
		struct bio_vec *bv;
		void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1));

		bio_for_each_segment(bv, b->bio, j)
			memcpy(page_address(bv->bv_page),
			       base + j * PAGE_SIZE, PAGE_SIZE);

		bch_submit_bbio(b->bio, b->c, &k.key, 0);

		continue_at(cl, btree_node_write_done, NULL);
	} else {
		b->bio->bi_vcnt = 0;
		bch_bio_map(b->bio, i);

		bch_submit_bbio(b->bio, b->c, &k.key, 0);

		closure_sync(cl);
		__btree_node_write_done(cl);
	}
}

void bch_btree_node_write(struct btree *b, struct closure *parent)
{
	struct bset *i = b->sets[b->nsets].data;

	trace_bcache_btree_write(b);

	BUG_ON(current->bio_list);
	BUG_ON(b->written >= btree_blocks(b));
	BUG_ON(b->written && !i->keys);
	BUG_ON(b->sets->data->seq != i->seq);
	bch_check_key_order(b, i);

	cancel_delayed_work(&b->work);

	/* If caller isn't waiting for write, parent refcount is cache set */
	closure_lock(&b->io, parent ?: &b->c->cl);

	clear_bit(BTREE_NODE_dirty,	 &b->flags);
	change_bit(BTREE_NODE_write_idx, &b->flags);

	do_btree_node_write(b);

	b->written += set_blocks(i, b->c);
	atomic_long_add(set_blocks(i, b->c) * b->c->sb.block_size,
			&PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written);

	bch_btree_sort_lazy(b);

	if (b->written < btree_blocks(b))
		bch_bset_init_next(b);
}

static void btree_node_write_work(struct work_struct *w)
{
	struct btree *b = container_of(to_delayed_work(w), struct btree, work);

	rw_lock(true, b, b->level);

	if (btree_node_dirty(b))
		bch_btree_node_write(b, NULL);
	rw_unlock(true, b);
}

static void bch_btree_leaf_dirty(struct btree *b, struct btree_op *op)
{
	struct bset *i = b->sets[b->nsets].data;
	struct btree_write *w = btree_current_write(b);

	BUG_ON(!b->written);
	BUG_ON(!i->keys);

	if (!btree_node_dirty(b))
		queue_delayed_work(btree_io_wq, &b->work, 30 * HZ);

	set_btree_node_dirty(b);

	if (op && op->journal) {
		if (w->journal &&
		    journal_pin_cmp(b->c, w, op)) {
			atomic_dec_bug(w->journal);
			w->journal = NULL;
		}

		if (!w->journal) {
			w->journal = op->journal;
			atomic_inc(w->journal);
		}
	}

	/* Force write if set is too big */
	if (set_bytes(i) > PAGE_SIZE - 48 &&
	    !current->bio_list)
		bch_btree_node_write(b, NULL);
}

/*
 * Btree in memory cache - allocation/freeing
 * mca -> memory cache
 */

static void mca_reinit(struct btree *b)
{
	unsigned i;

	b->flags	= 0;
	b->written	= 0;
	b->nsets	= 0;

	for (i = 0; i < MAX_BSETS; i++)
		b->sets[i].size = 0;
	/*
	 * Second loop starts at 1 because b->sets[0]->data is the memory we
	 * allocated
	 */
	for (i = 1; i < MAX_BSETS; i++)
		b->sets[i].data = NULL;
}

#define mca_reserve(c)	(((c->root && c->root->level)		\
			  ? c->root->level : 1) * 8 + 16)
#define mca_can_free(c)						\
	max_t(int, 0, c->bucket_cache_used - mca_reserve(c))

static void mca_data_free(struct btree *b)
{
	struct bset_tree *t = b->sets;
	BUG_ON(!closure_is_unlocked(&b->io.cl));

	if (bset_prev_bytes(b) < PAGE_SIZE)
		kfree(t->prev);
	else
		free_pages((unsigned long) t->prev,
			   get_order(bset_prev_bytes(b)));

	if (bset_tree_bytes(b) < PAGE_SIZE)
		kfree(t->tree);
	else
		free_pages((unsigned long) t->tree,
			   get_order(bset_tree_bytes(b)));

	free_pages((unsigned long) t->data, b->page_order);

	t->prev = NULL;
	t->tree = NULL;
	t->data = NULL;
	list_move(&b->list, &b->c->btree_cache_freed);
	b->c->bucket_cache_used--;
}

static void mca_bucket_free(struct btree *b)
{
	BUG_ON(btree_node_dirty(b));

	b->key.ptr[0] = 0;
	hlist_del_init_rcu(&b->hash);
	list_move(&b->list, &b->c->btree_cache_freeable);
}

static unsigned btree_order(struct bkey *k)
{
	return ilog2(KEY_SIZE(k) / PAGE_SECTORS ?: 1);
}

static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp)
{
	struct bset_tree *t = b->sets;
	BUG_ON(t->data);

	b->page_order = max_t(unsigned,
			      ilog2(b->c->btree_pages),
			      btree_order(k));

	t->data = (void *) __get_free_pages(gfp, b->page_order);
	if (!t->data)
		goto err;

	t->tree = bset_tree_bytes(b) < PAGE_SIZE
		? kmalloc(bset_tree_bytes(b), gfp)
		: (void *) __get_free_pages(gfp, get_order(bset_tree_bytes(b)));
	if (!t->tree)
		goto err;

	t->prev = bset_prev_bytes(b) < PAGE_SIZE
		? kmalloc(bset_prev_bytes(b), gfp)
		: (void *) __get_free_pages(gfp, get_order(bset_prev_bytes(b)));
	if (!t->prev)
		goto err;

	list_move(&b->list, &b->c->btree_cache);
	b->c->bucket_cache_used++;
	return;
err:
	mca_data_free(b);
}

static struct btree *mca_bucket_alloc(struct cache_set *c,
				      struct bkey *k, gfp_t gfp)
{
	struct btree *b = kzalloc(sizeof(struct btree), gfp);
	if (!b)
		return NULL;

	init_rwsem(&b->lock);
	lockdep_set_novalidate_class(&b->lock);
	INIT_LIST_HEAD(&b->list);
	INIT_DELAYED_WORK(&b->work, btree_node_write_work);
	b->c = c;
	closure_init_unlocked(&b->io);

	mca_data_alloc(b, k, gfp);
	return b;
}
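
/*
 * Try to reclaim a cached btree node: we need its write lock without
 * blocking, the node must be big enough for the caller, it gets written out
 * if it's dirty (when the caller passed a closure to wait on), and we give
 * up if it's still dirty, still has IO in flight or has delayed write work
 * pending.
 */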
static int mca_reap(struct btree *b, struct closure *cl, unsigned min_order)
{
	lockdep_assert_held(&b->c->bucket_lock);

	if (!down_write_trylock(&b->lock))
		return -ENOMEM;

	if (b->page_order < min_order) {
		rw_unlock(true, b);
		return -ENOMEM;
	}

	BUG_ON(btree_node_dirty(b) && !b->sets[0].data);

	if (cl && btree_node_dirty(b))
		bch_btree_node_write(b, NULL);

	if (cl)
		closure_wait_event_async(&b->io.wait, cl,
			 atomic_read(&b->io.cl.remaining) == -1);

	if (btree_node_dirty(b) ||
	    !closure_is_unlocked(&b->io.cl) ||
	    work_pending(&b->work.work)) {
		rw_unlock(true, b);
		return -EAGAIN;
	}

	return 0;
}
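
/*
 * Shrinker callback: the counts the shrinker sees are in pages, so scale
 * nr_to_scan by the node size, reap from the freeable list first, then
 * rotate through the main cache evicting clean nodes that haven't been
 * accessed since the last scan.
 */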
static int bch_mca_shrink(struct shrinker *shrink, struct shrink_control *sc)
{
	struct cache_set *c = container_of(shrink, struct cache_set, shrink);
	struct btree *b, *t;
	unsigned long i, nr = sc->nr_to_scan;

	if (c->shrinker_disabled)
		return 0;

	if (c->try_harder)
		return 0;

	/*
	 * If nr == 0, we're supposed to return the number of items we have
	 * cached. Not allowed to return -1.
	 */
	if (!nr)
		return mca_can_free(c) * c->btree_pages;

	/* Return -1 if we can't do anything right now */
	if (sc->gfp_mask & __GFP_WAIT)
		mutex_lock(&c->bucket_lock);
	else if (!mutex_trylock(&c->bucket_lock))
		return -1;

	nr /= c->btree_pages;
	nr = min_t(unsigned long, nr, mca_can_free(c));

	i = 0;
	list_for_each_entry_safe(b, t, &c->btree_cache_freeable, list) {
		if (!nr)
			break;

		if (++i > 3 &&
		    !mca_reap(b, NULL, 0)) {
			mca_data_free(b);
			rw_unlock(true, b);
			--nr;
		}
	}

	/*
	 * Can happen right when we first start up, before we've read in any
	 * btree nodes
	 */
	if (list_empty(&c->btree_cache))
		goto out;

	for (i = 0; nr && i < c->bucket_cache_used; i++) {
		b = list_first_entry(&c->btree_cache, struct btree, list);
		list_rotate_left(&c->btree_cache);

		if (!b->accessed &&
		    !mca_reap(b, NULL, 0)) {
			mca_bucket_free(b);
			mca_data_free(b);
			rw_unlock(true, b);
			--nr;
		} else
			b->accessed = 0;
	}
out:
	nr = mca_can_free(c) * c->btree_pages;
	mutex_unlock(&c->bucket_lock);
	return nr;
}

void bch_btree_cache_free(struct cache_set *c)
{
	struct btree *b;
	struct closure cl;
	closure_init_stack(&cl);

	if (c->shrink.list.next)
		unregister_shrinker(&c->shrink);

	mutex_lock(&c->bucket_lock);

#ifdef CONFIG_BCACHE_DEBUG
	if (c->verify_data)
		list_move(&c->verify_data->list, &c->btree_cache);
#endif

	list_splice(&c->btree_cache_freeable,
		    &c->btree_cache);

	while (!list_empty(&c->btree_cache)) {
		b = list_first_entry(&c->btree_cache, struct btree, list);

		if (btree_node_dirty(b))
			btree_complete_write(b, btree_current_write(b));
		clear_bit(BTREE_NODE_dirty, &b->flags);

		mca_data_free(b);
	}

	while (!list_empty(&c->btree_cache_freed)) {
		b = list_first_entry(&c->btree_cache_freed,
				     struct btree, list);
		list_del(&b->list);
		cancel_delayed_work_sync(&b->work);
		kfree(b);
	}

	mutex_unlock(&c->bucket_lock);
}

int bch_btree_cache_alloc(struct cache_set *c)
{
	unsigned i;

	/* XXX: doesn't check for errors */

	closure_init_unlocked(&c->gc);

	for (i = 0; i < mca_reserve(c); i++)
		mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL);

	list_splice_init(&c->btree_cache,
			 &c->btree_cache_freeable);

#ifdef CONFIG_BCACHE_DEBUG
	mutex_init(&c->verify_lock);

	c->verify_data = mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL);

	if (c->verify_data &&
	    c->verify_data->sets[0].data)
		list_del_init(&c->verify_data->list);
	else
		c->verify_data = NULL;
#endif

	c->shrink.shrink = bch_mca_shrink;
	c->shrink.seeks = 4;
	c->shrink.batch = c->btree_pages * 2;
	register_shrinker(&c->shrink);

	return 0;
}

/* Btree in memory cache - hash table */

static struct hlist_head *mca_hash(struct cache_set *c, struct bkey *k)
{
	return &c->bucket_hash[hash_32(PTR_HASH(c, k), BUCKET_HASH_BITS)];
}

static struct btree *mca_find(struct cache_set *c, struct bkey *k)
{
	struct btree *b;

	rcu_read_lock();
	hlist_for_each_entry_rcu(b, mca_hash(c, k), hash)
		if (PTR_HASH(c, &b->key) == PTR_HASH(c, k))
			goto out;
	b = NULL;
out:
	rcu_read_unlock();
	return b;
}

static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k,
				     int level, struct closure *cl)
{
	int ret = -ENOMEM;
	struct btree *i;

	trace_bcache_btree_cache_cannibalize(c);

	if (!cl)
		return ERR_PTR(-ENOMEM);

	/*
	 * Trying to free up some memory - i.e. reuse some btree nodes - may
	 * require initiating IO to flush the dirty part of the node. If we're
	 * running under generic_make_request(), that IO will never finish and
	 * we would deadlock. Returning -EAGAIN causes the cache lookup code to
	 * punt to workqueue and retry.
	 */
	if (current->bio_list)
		return ERR_PTR(-EAGAIN);

	if (c->try_harder && c->try_harder != cl) {
		closure_wait_event_async(&c->try_wait, cl, !c->try_harder);
		return ERR_PTR(-EAGAIN);
	}

	c->try_harder = cl;
	c->try_harder_start = local_clock();
retry:
	list_for_each_entry_reverse(i, &c->btree_cache, list) {
		int r = mca_reap(i, cl, btree_order(k));
		if (!r)
			return i;
		if (r != -ENOMEM)
			ret = r;
	}

	if (ret == -EAGAIN &&
	    closure_blocking(cl)) {
		mutex_unlock(&c->bucket_lock);
		closure_sync(cl);
		mutex_lock(&c->bucket_lock);
		goto retry;
	}

	return ERR_PTR(ret);
}

/*
 * We can only have one thread cannibalizing other cached btree nodes at a time,
 * or we'll deadlock. We use an open coded mutex to ensure that, which a
 * cannibalize_bucket() will take. This means every time we unlock the root of
 * the btree, we need to release this lock if we have it held.
 */
void bch_cannibalize_unlock(struct cache_set *c, struct closure *cl)
{
	if (c->try_harder == cl) {
		bch_time_stats_update(&c->try_harder_time, c->try_harder_start);
		c->try_harder = NULL;
		__closure_wake_up(&c->try_wait);
	}
}
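
/*
 * Allocate an in memory node for the btree node k points to: reuse a node
 * from the freeable list if one is big enough, otherwise recycle a struct
 * btree from the freed list (reallocating its data), otherwise allocate a
 * new one - falling back to cannibalizing another cached node if memory
 * allocation fails.
 */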
static struct btree *mca_alloc(struct cache_set *c, struct bkey *k,
			       int level, struct closure *cl)
{
	struct btree *b;

	lockdep_assert_held(&c->bucket_lock);

	if (mca_find(c, k))
		return NULL;

	/* btree_free() doesn't free memory; it sticks the node on the end of
	 * the list. Check if there's any freed nodes there:
	 */
	list_for_each_entry(b, &c->btree_cache_freeable, list)
		if (!mca_reap(b, NULL, btree_order(k)))
			goto out;

	/* We never free struct btree itself, just the memory that holds the on
	 * disk node. Check the freed list before allocating a new one:
	 */
	list_for_each_entry(b, &c->btree_cache_freed, list)
		if (!mca_reap(b, NULL, 0)) {
			mca_data_alloc(b, k, __GFP_NOWARN|GFP_NOIO);
			if (!b->sets[0].data)
				goto err;
			else
				goto out;
		}

	b = mca_bucket_alloc(c, k, __GFP_NOWARN|GFP_NOIO);
	if (!b)
		goto err;

	BUG_ON(!down_write_trylock(&b->lock));
	if (!b->sets->data)
		goto err;
out:
	BUG_ON(!closure_is_unlocked(&b->io.cl));

	bkey_copy(&b->key, k);
	list_move(&b->list, &c->btree_cache);
	hlist_del_init_rcu(&b->hash);
	hlist_add_head_rcu(&b->hash, mca_hash(c, k));

	lock_set_subclass(&b->lock.dep_map, level + 1, _THIS_IP_);
	b->level	= level;

	mca_reinit(b);

	return b;
err:
	if (b)
		rw_unlock(true, b);

	b = mca_cannibalize(c, k, level, cl);
	if (!IS_ERR(b))
		goto out;

	return b;
}

/**
 * bch_btree_node_get - find a btree node in the cache and lock it, reading it
 * in from disk if necessary.
 *
 * If IO is necessary, it uses the closure embedded in struct btree_op to wait;
 * if that closure is in non blocking mode, will return -EAGAIN.
 *
 * The btree node will have either a read or a write lock held, depending on
 * level and op->lock.
 */
struct btree *bch_btree_node_get(struct cache_set *c, struct bkey *k,
				 int level, struct btree_op *op)
{
	int i = 0;
	bool write = level <= op->lock;
	struct btree *b;

	BUG_ON(level < 0);
retry:
	b = mca_find(c, k);

	if (!b) {
		if (current->bio_list)
			return ERR_PTR(-EAGAIN);

		mutex_lock(&c->bucket_lock);
		b = mca_alloc(c, k, level, &op->cl);
		mutex_unlock(&c->bucket_lock);

		if (!b)
			goto retry;
		if (IS_ERR(b))
			return b;

		bch_btree_node_read(b);

		if (!write)
			downgrade_write(&b->lock);
	} else {
		rw_lock(write, b, level);
		if (PTR_HASH(c, &b->key) != PTR_HASH(c, k)) {
			rw_unlock(write, b);
			goto retry;
		}
		BUG_ON(b->level != level);
	}

	b->accessed = 1;

	for (; i <= b->nsets && b->sets[i].size; i++) {
		prefetch(b->sets[i].tree);
		prefetch(b->sets[i].data);
	}

	for (; i <= b->nsets; i++)
		prefetch(b->sets[i].data);

	if (btree_node_io_error(b)) {
		rw_unlock(write, b);
		return ERR_PTR(-EIO);
	}

	BUG_ON(!b->written);

	return b;
}

static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level)
{
	struct btree *b;

	mutex_lock(&c->bucket_lock);
	b = mca_alloc(c, k, level, NULL);
	mutex_unlock(&c->bucket_lock);

	if (!IS_ERR_OR_NULL(b)) {
		bch_btree_node_read(b);
		rw_unlock(true, b);
	}
}

/* Btree alloc */
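
/*
 * Freeing a btree node: drop any pending write state (journal pin,
 * prio_blocked), bump the gen on each bucket the node's key points to so
 * stale pointers to it are ignored, then give the bucket back to the
 * allocator.
 */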
static void btree_node_free(struct btree *b, struct btree_op *op)
{
	unsigned i;

	trace_bcache_btree_node_free(b);

	/*
	 * The BUG_ON() in btree_node_get() implies that we must have a write
	 * lock on parent to free or even invalidate a node
	 */
	BUG_ON(op->lock <= b->level);
	BUG_ON(b == b->c->root);

	if (btree_node_dirty(b))
		btree_complete_write(b, btree_current_write(b));
	clear_bit(BTREE_NODE_dirty, &b->flags);

	cancel_delayed_work(&b->work);

	mutex_lock(&b->c->bucket_lock);

	for (i = 0; i < KEY_PTRS(&b->key); i++) {
		BUG_ON(atomic_read(&PTR_BUCKET(b->c, &b->key, i)->pin));

		bch_inc_gen(PTR_CACHE(b->c, &b->key, i),
			    PTR_BUCKET(b->c, &b->key, i));
	}

	bch_bucket_free(b->c, &b->key);
	mca_bucket_free(b);
	mutex_unlock(&b->c->bucket_lock);
}

struct btree *bch_btree_node_alloc(struct cache_set *c, int level,
				   struct closure *cl)
{
	BKEY_PADDED(key) k;
	struct btree *b = ERR_PTR(-EAGAIN);

	mutex_lock(&c->bucket_lock);
retry:
	if (__bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, cl))
		goto err;

	SET_KEY_SIZE(&k.key, c->btree_pages * PAGE_SECTORS);

	b = mca_alloc(c, &k.key, level, cl);
	if (IS_ERR(b))
		goto err_free;

	if (!b) {
		cache_bug(c,
			"Tried to allocate bucket that was in btree cache");
		__bkey_put(c, &k.key);
		goto retry;
	}

	b->accessed = 1;
	bch_bset_init_next(b);

	mutex_unlock(&c->bucket_lock);

	trace_bcache_btree_node_alloc(b);
	return b;
err_free:
	bch_bucket_free(c, &k.key);
	__bkey_put(c, &k.key);
err:
	mutex_unlock(&c->bucket_lock);

	trace_bcache_btree_node_alloc_fail(b);
	return b;
}

static struct btree *btree_node_alloc_replacement(struct btree *b,
						  struct closure *cl)
{
	struct btree *n = bch_btree_node_alloc(b->c, b->level, cl);
	if (!IS_ERR_OR_NULL(n))
		bch_btree_sort_into(b, n);

	return n;
}

/* Garbage collection */

uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k)
{
	uint8_t stale = 0;
	unsigned i;
	struct bucket *g;

	/*
	 * ptr_invalid() can't return true for the keys that mark btree nodes as
	 * freed, but since ptr_bad() returns true we'll never actually use them
	 * for anything and thus we don't want to mark their pointers here
	 */
	if (!bkey_cmp(k, &ZERO_KEY))
		return stale;

	for (i = 0; i < KEY_PTRS(k); i++) {
		if (!ptr_available(c, k, i))
			continue;

		g = PTR_BUCKET(c, k, i);

		if (gen_after(g->gc_gen, PTR_GEN(k, i)))
			g->gc_gen = PTR_GEN(k, i);

		if (ptr_stale(c, k, i)) {
			stale = max(stale, ptr_stale(c, k, i));
			continue;
		}

		cache_bug_on(GC_MARK(g) &&
			     (GC_MARK(g) == GC_MARK_METADATA) != (level != 0),
			     c, "inconsistent ptrs: mark = %llu, level = %i",
			     GC_MARK(g), level);

		if (level)
			SET_GC_MARK(g, GC_MARK_METADATA);
		else if (KEY_DIRTY(k))
			SET_GC_MARK(g, GC_MARK_DIRTY);

		/* guard against overflow */
		SET_GC_SECTORS_USED(g, min_t(unsigned,
					     GC_SECTORS_USED(g) + KEY_SIZE(k),
					     (1 << 14) - 1));

		BUG_ON(!GC_SECTORS_USED(g));
	}

	return stale;
}

#define btree_mark_key(b, k)	__bch_btree_mark_key(b->c, b->level, k)

static int btree_gc_mark_node(struct btree *b, unsigned *keys,
			      struct gc_stat *gc)
{
	uint8_t stale = 0;
	unsigned last_dev = -1;
	struct bcache_device *d = NULL;
	struct bkey *k;
	struct btree_iter iter;
	struct bset_tree *t;

	gc->nodes++;

	for_each_key_filter(b, k, &iter, bch_ptr_invalid) {
		if (last_dev != KEY_INODE(k)) {
			last_dev = KEY_INODE(k);

			d = KEY_INODE(k) < b->c->nr_uuids
				? b->c->devices[last_dev]
				: NULL;
		}

		stale = max(stale, btree_mark_key(b, k));

		if (bch_ptr_bad(b, k))
			continue;

		*keys += bkey_u64s(k);

		gc->key_bytes += bkey_u64s(k);
		gc->nkeys++;

		gc->data += KEY_SIZE(k);
		if (KEY_DIRTY(k))
			gc->dirty += KEY_SIZE(k);
	}

	for (t = b->sets; t <= &b->sets[b->nsets]; t++)
		btree_bug_on(t->size &&
			     bset_written(b, t) &&
			     bkey_cmp(&b->key, &t->end) < 0,
			     b, "found short btree key in gc");

	return stale;
}

static struct btree *btree_gc_alloc(struct btree *b, struct bkey *k,
				    struct btree_op *op)
{
	/*
	 * We block priorities from being written for the duration of garbage
	 * collection, so we can't sleep in btree_alloc() ->
	 * bch_bucket_alloc_set(), or we'd risk deadlock - so we don't pass it
	 * our closure.
	 */
	struct btree *n = btree_node_alloc_replacement(b, NULL);

	if (!IS_ERR_OR_NULL(n)) {
		swap(b, n);
		__bkey_put(b->c, &b->key);

		memcpy(k->ptr, b->key.ptr,
		       sizeof(uint64_t) * KEY_PTRS(&b->key));

		btree_node_free(n, op);
		up_write(&n->lock);
	}

	return b;
}

/*
 * Leaving this at 2 until we've got incremental garbage collection done; it
 * could be higher (and has been tested with 4) except that garbage collection
 * could take much longer, adversely affecting latency.
 */
#define GC_MERGE_NODES	2U

struct gc_merge_info {
	struct btree	*b;
	struct bkey	*k;
	unsigned	keys;
};
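
/*
 * Try to coalesce up to GC_MERGE_NODES adjacent nodes during gc: if all
 * their keys would fit in one fewer node, merge the keys forward so the
 * node at r[0] ends up empty, then free it.
 */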
static void btree_gc_coalesce(struct btree *b, struct btree_op *op,
			      struct gc_stat *gc, struct gc_merge_info *r)
{
	unsigned nodes = 0, keys = 0, blocks;
	int i;

	while (nodes < GC_MERGE_NODES && r[nodes].b)
		keys += r[nodes++].keys;

	blocks = btree_default_blocks(b->c) * 2 / 3;

	if (nodes < 2 ||
	    __set_blocks(b->sets[0].data, keys, b->c) > blocks * (nodes - 1))
		return;

	for (i = nodes - 1; i >= 0; --i) {
		if (r[i].b->written)
			r[i].b = btree_gc_alloc(r[i].b, r[i].k, op);

		if (r[i].b->written)
			return;
	}

	for (i = nodes - 1; i > 0; --i) {
		struct bset *n1 = r[i].b->sets->data;
		struct bset *n2 = r[i - 1].b->sets->data;
		struct bkey *k, *last = NULL;

		keys = 0;

		if (i == 1) {
			/*
			 * Last node we're not getting rid of - we're getting
			 * rid of the node at r[0]. Have to try and fit all of
			 * the remaining keys into this node; we can't ensure
			 * they will always fit due to rounding and variable
			 * length keys (shouldn't be possible in practice,
			 * though)
			 */
			if (__set_blocks(n1, n1->keys + r->keys,
					 b->c) > btree_blocks(r[i].b))
				return;

			keys = n2->keys;
			last = &r->b->key;
		} else
			for (k = n2->start;
			     k < end(n2);
			     k = bkey_next(k)) {
				if (__set_blocks(n1, n1->keys + keys +
						 bkey_u64s(k), b->c) > blocks)
					break;

				last = k;
				keys += bkey_u64s(k);
			}

		BUG_ON(__set_blocks(n1, n1->keys + keys,
				    b->c) > btree_blocks(r[i].b));

		if (last) {
			bkey_copy_key(&r[i].b->key, last);
			bkey_copy_key(r[i].k, last);
		}

		memcpy(end(n1),
		       n2->start,
		       (void *) node(n2, keys) - (void *) n2->start);

		n1->keys += keys;

		memmove(n2->start,
			node(n2, keys),
			(void *) end(n2) - (void *) node(n2, keys));

		n2->keys -= keys;

		r[i].keys	= n1->keys;
		r[i - 1].keys	= n2->keys;
	}

	btree_node_free(r->b, op);
	up_write(&r->b->lock);

	trace_bcache_btree_gc_coalesce(nodes);

	gc->nodes--;
	nodes--;

	memmove(&r[0], &r[1], sizeof(struct gc_merge_info) * nodes);
	memset(&r[nodes], 0, sizeof(struct gc_merge_info));
}

static int btree_gc_recurse(struct btree *b, struct btree_op *op,
			    struct closure *writes, struct gc_stat *gc)
{
	void write(struct btree *r)
	{
		if (!r->written)
			bch_btree_node_write(r, &op->cl);
		else if (btree_node_dirty(r))
			bch_btree_node_write(r, writes);

		up_write(&r->lock);
	}

	int ret = 0, stale;
	unsigned i;
	struct gc_merge_info r[GC_MERGE_NODES];

	memset(r, 0, sizeof(r));

	while ((r->k = bch_next_recurse_key(b, &b->c->gc_done))) {
		r->b = bch_btree_node_get(b->c, r->k, b->level - 1, op);

		if (IS_ERR(r->b)) {
			ret = PTR_ERR(r->b);
			break;
		}

		r->keys	= 0;
		stale = btree_gc_mark_node(r->b, &r->keys, gc);

		if (!b->written &&
		    (r->b->level || stale > 10 ||
		     b->c->gc_always_rewrite))
			r->b = btree_gc_alloc(r->b, r->k, op);

		if (r->b->level)
			ret = btree_gc_recurse(r->b, op, writes, gc);

		if (ret) {
			write(r->b);
			break;
		}

		bkey_copy_key(&b->c->gc_done, r->k);

		if (!b->written)
			btree_gc_coalesce(b, op, gc, r);

		if (r[GC_MERGE_NODES - 1].b)
			write(r[GC_MERGE_NODES - 1].b);

		memmove(&r[1], &r[0],
			sizeof(struct gc_merge_info) * (GC_MERGE_NODES - 1));

		/* When we've got incremental GC working, we'll want to do
		 * if (should_resched())
		 *	return -EAGAIN;
		 */
		cond_resched();
#if 0
		if (need_resched()) {
			ret = -EAGAIN;
			break;
		}
#endif
	}

	for (i = 1; i < GC_MERGE_NODES && r[i].b; i++)
		write(r[i].b);

	/* Might have freed some children, must remove their keys */
	if (!b->written)
		bch_btree_sort(b);

	return ret;
}

static int bch_btree_gc_root(struct btree *b, struct btree_op *op,
			     struct closure *writes, struct gc_stat *gc)
{
	struct btree *n = NULL;
	unsigned keys = 0;
	int ret = 0, stale = btree_gc_mark_node(b, &keys, gc);

	if (b->level || stale > 10)
		n = btree_node_alloc_replacement(b, NULL);

	if (!IS_ERR_OR_NULL(n))
		swap(b, n);

	if (b->level)
		ret = btree_gc_recurse(b, op, writes, gc);

	if (!b->written || btree_node_dirty(b)) {
		bch_btree_node_write(b, n ? &op->cl : NULL);
	}

	if (!IS_ERR_OR_NULL(n)) {
		closure_sync(&op->cl);
		bch_btree_set_root(b);
		btree_node_free(n, op);
		rw_unlock(true, b);
	}

	return ret;
}
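
/*
 * Start a gc pass: snapshot each bucket's gen into gc_gen and provisionally
 * mark unpinned buckets reclaimable; the real marks are recomputed as the
 * btree is walked.
 */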
static void btree_gc_start(struct cache_set *c)
{
	struct cache *ca;
	struct bucket *b;
	unsigned i;

	if (!c->gc_mark_valid)
		return;

	mutex_lock(&c->bucket_lock);

	c->gc_mark_valid = 0;
	c->gc_done = ZERO_KEY;

	for_each_cache(ca, c, i)
		for_each_bucket(b, ca) {
			b->gc_gen = b->gen;
			if (!atomic_read(&b->pin))
				SET_GC_MARK(b, GC_MARK_RECLAIMABLE);
		}

	mutex_unlock(&c->bucket_lock);
}

size_t bch_btree_gc_finish(struct cache_set *c)
{
	size_t available = 0;
	struct bucket *b;
	struct cache *ca;
	unsigned i;

	mutex_lock(&c->bucket_lock);

	set_gc_sectors(c);
	c->gc_mark_valid = 1;
	c->need_gc	= 0;

	if (c->root)
		for (i = 0; i < KEY_PTRS(&c->root->key); i++)
			SET_GC_MARK(PTR_BUCKET(c, &c->root->key, i),
				    GC_MARK_METADATA);

	for (i = 0; i < KEY_PTRS(&c->uuid_bucket); i++)
		SET_GC_MARK(PTR_BUCKET(c, &c->uuid_bucket, i),
			    GC_MARK_METADATA);

	for_each_cache(ca, c, i) {
		uint64_t *i;

		ca->invalidate_needs_gc = 0;

		for (i = ca->sb.d; i < ca->sb.d + ca->sb.keys; i++)
			SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA);

		for (i = ca->prio_buckets;
		     i < ca->prio_buckets + prio_buckets(ca) * 2; i++)
			SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA);

		for_each_bucket(b, ca) {
			b->last_gc	= b->gc_gen;
			c->need_gc	= max(c->need_gc, bucket_gc_gen(b));

			if (!atomic_read(&b->pin) &&
			    GC_MARK(b) == GC_MARK_RECLAIMABLE) {
				available++;
				if (!GC_SECTORS_USED(b))
					bch_bucket_add_unused(ca, b);
			}
		}
	}

	mutex_unlock(&c->bucket_lock);
	return available;
}

static void bch_btree_gc(struct closure *cl)
{
	struct cache_set *c = container_of(cl, struct cache_set, gc.cl);
	int ret;
	unsigned long available;
	struct gc_stat stats;
	struct closure writes;
	struct btree_op op;
	uint64_t start_time = local_clock();

	trace_bcache_gc_start(c);

	memset(&stats, 0, sizeof(struct gc_stat));
	closure_init_stack(&writes);
	bch_btree_op_init_stack(&op);
	op.lock = SHRT_MAX;

	btree_gc_start(c);

	atomic_inc(&c->prio_blocked);

	ret = btree_root(gc_root, c, &op, &writes, &stats);
	closure_sync(&op.cl);
	closure_sync(&writes);

	if (ret) {
		pr_warn("gc failed!");
		continue_at(cl, bch_btree_gc, bch_gc_wq);
	}

	/* Possibly wait for new UUIDs or whatever to hit disk */
	bch_journal_meta(c, &op.cl);
	closure_sync(&op.cl);

	available = bch_btree_gc_finish(c);

	atomic_dec(&c->prio_blocked);
	wake_up_allocators(c);

	bch_time_stats_update(&c->btree_gc_time, start_time);

	stats.key_bytes *= sizeof(uint64_t);
	stats.dirty	<<= 9;
	stats.data	<<= 9;
	stats.in_use	= (c->nbuckets - available) * 100 / c->nbuckets;
	memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat));

	trace_bcache_gc_end(c);

	continue_at(cl, bch_moving_gc, bch_gc_wq);
}

void bch_queue_gc(struct cache_set *c)
{
	closure_trylock_call(&c->gc.cl, bch_btree_gc, bch_gc_wq, &c->cl);
}

/* Initial partial gc */
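
/*
 * At startup we walk the entire btree once, fixing up bucket gens and prios
 * from the pointers we find, marking every key as we go and prefetching the
 * next child node one step ahead of the recursion.
 */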
static int bch_btree_check_recurse(struct btree *b, struct btree_op *op,
				   unsigned long **seen)
{
	int ret;
	unsigned i;
	struct bkey *k;
	struct bucket *g;
	struct btree_iter iter;

	for_each_key_filter(b, k, &iter, bch_ptr_invalid) {
		for (i = 0; i < KEY_PTRS(k); i++) {
			if (!ptr_available(b->c, k, i))
				continue;

			g = PTR_BUCKET(b->c, k, i);

			if (!__test_and_set_bit(PTR_BUCKET_NR(b->c, k, i),
						seen[PTR_DEV(k, i)]) ||
			    !ptr_stale(b->c, k, i)) {
				g->gen = PTR_GEN(k, i);

				if (b->level)
					g->prio = BTREE_PRIO;
				else if (g->prio == BTREE_PRIO)
					g->prio = INITIAL_PRIO;
			}
		}

		btree_mark_key(b, k);
	}

	if (b->level) {
		k = bch_next_recurse_key(b, &ZERO_KEY);

		while (k) {
			struct bkey *p = bch_next_recurse_key(b, k);
			if (p)
				btree_node_prefetch(b->c, p, b->level - 1);

			ret = btree(check_recurse, k, b, op, seen);
			if (ret)
				return ret;

			k = p;
		}
	}

	return 0;
}

int bch_btree_check(struct cache_set *c, struct btree_op *op)
{
	int ret = -ENOMEM;
	unsigned i;
	unsigned long *seen[MAX_CACHES_PER_SET];

	memset(seen, 0, sizeof(seen));

	for (i = 0; c->cache[i]; i++) {
		size_t n = DIV_ROUND_UP(c->cache[i]->sb.nbuckets, 8);
		seen[i] = kmalloc(n, GFP_KERNEL);
		if (!seen[i])
			goto err;

		/* Disables the seen array until prio_read() uses it too */
		memset(seen[i], 0xFF, n);
	}

	ret = btree_root(check_recurse, c, op, seen);
err:
	for (i = 0; i < MAX_CACHES_PER_SET; i++)
		kfree(seen[i]);
	return ret;
}

/* Btree insertion */
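
/*
 * Insert a key at 'where' by shifting the rest of the set up to make room,
 * then fix up the lookup table.
 */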
static void shift_keys(struct btree *b, struct bkey *where, struct bkey *insert)
{
	struct bset *i = b->sets[b->nsets].data;

	memmove((uint64_t *) where + bkey_u64s(insert),
		where,
		(void *) end(i) - (void *) where);

	i->keys += bkey_u64s(insert);
	bkey_copy(where, insert);
	bch_bset_fix_lookup_table(b, where);
}
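
/*
 * Handle overlaps between the extent being inserted and existing keys:
 * existing keys are trimmed or split around the new one, dirty sector
 * counts are adjusted, and for BTREE_REPLACE operations we verify that the
 * keys we're overwriting still match op->replace - returning true if the
 * insert should be dropped as a collision.
 */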
  1255. static bool fix_overlapping_extents(struct btree *b,
  1256. struct bkey *insert,
  1257. struct btree_iter *iter,
  1258. struct btree_op *op)
  1259. {
  1260. void subtract_dirty(struct bkey *k, uint64_t offset, int sectors)
  1261. {
  1262. if (KEY_DIRTY(k))
  1263. bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k),
  1264. offset, -sectors);
  1265. }
  1266. uint64_t old_offset;
  1267. unsigned old_size, sectors_found = 0;
  1268. while (1) {
  1269. struct bkey *k = bch_btree_iter_next(iter);
  1270. if (!k ||
  1271. bkey_cmp(&START_KEY(k), insert) >= 0)
  1272. break;
  1273. if (bkey_cmp(k, &START_KEY(insert)) <= 0)
  1274. continue;
  1275. old_offset = KEY_START(k);
  1276. old_size = KEY_SIZE(k);
  1277. /*
  1278. * We might overlap with 0 size extents; we can't skip these
  1279. * because if they're in the set we're inserting to we have to
  1280. * adjust them so they don't overlap with the key we're
  1281. * inserting. But we don't want to check them for BTREE_REPLACE
  1282. * operations.
  1283. */
  1284. if (op->type == BTREE_REPLACE &&
  1285. KEY_SIZE(k)) {
  1286. /*
  1287. * k might have been split since we inserted/found the
  1288. * key we're replacing
  1289. */
  1290. unsigned i;
  1291. uint64_t offset = KEY_START(k) -
  1292. KEY_START(&op->replace);
  1293. /* But it must be a subset of the replace key */
  1294. if (KEY_START(k) < KEY_START(&op->replace) ||
  1295. KEY_OFFSET(k) > KEY_OFFSET(&op->replace))
  1296. goto check_failed;
  1297. /* We didn't find a key that we were supposed to */
  1298. if (KEY_START(k) > KEY_START(insert) + sectors_found)
  1299. goto check_failed;
  1300. if (KEY_PTRS(&op->replace) != KEY_PTRS(k))
  1301. goto check_failed;
  1302. /* skip past gen */
  1303. offset <<= 8;
  1304. BUG_ON(!KEY_PTRS(&op->replace));
  1305. for (i = 0; i < KEY_PTRS(&op->replace); i++)
  1306. if (k->ptr[i] != op->replace.ptr[i] + offset)
  1307. goto check_failed;
  1308. sectors_found = KEY_OFFSET(k) - KEY_START(insert);
  1309. }
  1310. if (bkey_cmp(insert, k) < 0 &&
  1311. bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0) {
  1312. /*
  1313. * We overlapped in the middle of an existing key: that
  1314. * means we have to split the old key. But we have to do
  1315. * slightly different things depending on whether the
  1316. * old key has been written out yet.
  1317. */
  1318. struct bkey *top;
  1319. subtract_dirty(k, KEY_START(insert), KEY_SIZE(insert));
  1320. if (bkey_written(b, k)) {
  1321. /*
  1322. * We insert a new key to cover the top of the
  1323. * old key, and the old key is modified in place
  1324. * to represent the bottom split.
  1325. *
  1326. * It's completely arbitrary whether the new key
  1327. * is the top or the bottom, but it has to match
  1328. * up with what btree_sort_fixup() does - it
  1329. * doesn't check for this kind of overlap, it
  1330. * depends on us inserting a new key for the top
  1331. * here.
  1332. */
  1333. top = bch_bset_search(b, &b->sets[b->nsets],
  1334. insert);
  1335. shift_keys(b, top, k);
  1336. } else {
  1337. BKEY_PADDED(key) temp;
  1338. bkey_copy(&temp.key, k);
  1339. shift_keys(b, k, &temp.key);
  1340. top = bkey_next(k);
  1341. }
  1342. bch_cut_front(insert, top);
  1343. bch_cut_back(&START_KEY(insert), k);
  1344. bch_bset_fix_invalidated_key(b, k);
  1345. return false;
  1346. }
  1347. if (bkey_cmp(insert, k) < 0) {
  1348. bch_cut_front(insert, k);
  1349. } else {
  1350. if (bkey_written(b, k) &&
  1351. bkey_cmp(&START_KEY(insert), &START_KEY(k)) <= 0) {
  1352. /*
  1353. * Completely overwrote, so we don't have to
  1354. * invalidate the binary search tree
  1355. */
  1356. bch_cut_front(k, k);
  1357. } else {
  1358. __bch_cut_back(&START_KEY(insert), k);
  1359. bch_bset_fix_invalidated_key(b, k);
  1360. }
  1361. }
  1362. subtract_dirty(k, old_offset, old_size - KEY_SIZE(k));
  1363. }
  1364. check_failed:
  1365. if (op->type == BTREE_REPLACE) {
  1366. if (!sectors_found) {
  1367. op->insert_collision = true;
  1368. return true;
  1369. } else if (sectors_found < KEY_SIZE(insert)) {
  1370. SET_KEY_OFFSET(insert, KEY_OFFSET(insert) -
  1371. (KEY_SIZE(insert) - sectors_found));
  1372. SET_KEY_SIZE(insert, sectors_found);
  1373. }
  1374. }
  1375. return false;
  1376. }
static bool btree_insert_key(struct btree *b, struct btree_op *op,
                             struct bkey *k)
{
        struct bset *i = b->sets[b->nsets].data;
        struct bkey *m, *prev;
        unsigned status = BTREE_INSERT_STATUS_INSERT;

        BUG_ON(bkey_cmp(k, &b->key) > 0);
        BUG_ON(b->level && !KEY_PTRS(k));
        BUG_ON(!b->level && !KEY_OFFSET(k));

        if (!b->level) {
                struct btree_iter iter;
                struct bkey search = KEY(KEY_INODE(k), KEY_START(k), 0);

                /*
                 * bset_search() returns the first key that is strictly greater
                 * than the search key - but for back merging, we want to find
                 * the first key that is greater than or equal to KEY_START(k) -
                 * unless KEY_START(k) is 0.
                 */
                if (KEY_OFFSET(&search))
                        SET_KEY_OFFSET(&search, KEY_OFFSET(&search) - 1);

                prev = NULL;
                m = bch_btree_iter_init(b, &iter, &search);

                if (fix_overlapping_extents(b, k, &iter, op))
                        return false;

                while (m != end(i) &&
                       bkey_cmp(k, &START_KEY(m)) > 0)
                        prev = m, m = bkey_next(m);

                if (key_merging_disabled(b->c))
                        goto insert;

                /* prev is in the tree, if we merge we're done */
                status = BTREE_INSERT_STATUS_BACK_MERGE;
                if (prev &&
                    bch_bkey_try_merge(b, prev, k))
                        goto merged;

                status = BTREE_INSERT_STATUS_OVERWROTE;
                if (m != end(i) &&
                    KEY_PTRS(m) == KEY_PTRS(k) && !KEY_SIZE(m))
                        goto copy;

                status = BTREE_INSERT_STATUS_FRONT_MERGE;
                if (m != end(i) &&
                    bch_bkey_try_merge(b, k, m))
                        goto copy;
        } else
                m = bch_bset_search(b, &b->sets[b->nsets], k);

insert: shift_keys(b, m, k);
copy:   bkey_copy(m, k);
merged:
        if (KEY_DIRTY(k))
                bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k),
                                             KEY_START(k), KEY_SIZE(k));

        bch_check_keys(b, "%u for %s", status, op_type(op));

        if (b->level && !KEY_OFFSET(k))
                btree_current_write(b)->prio_blocked++;

        trace_bcache_btree_insert_key(b, k, op->type, status);

        return true;
}

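/*
 * Drain op->keys into node @b one key at a time.  Returns true if at least
 * one key was actually inserted - the caller uses this to decide whether the
 * node needs to be marked dirty / written out.
 */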
bool bch_btree_insert_keys(struct btree *b, struct btree_op *op)
{
        bool ret = false;
        struct bkey *k;
        unsigned oldsize = bch_count_data(b);

        while ((k = bch_keylist_pop(&op->keys))) {
                bkey_put(b->c, k, b->level);
                ret |= btree_insert_key(b, op, k);
        }

        BUG_ON(bch_count_data(b) < oldsize);

        return ret;
}

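/*
 * Insert a placeholder "check" key covering @bio into @b: the read lock is
 * upgraded to a write lock, and we bail out if the node's pointer or sequence
 * number changed while the lock was dropped, or if the node would need to
 * split.  The key gets PTR_CHECK_DEV and random pointer contents and is also
 * saved in op->replace - presumably so a later insert can be made conditional
 * on this placeholder still being present (that logic lives outside this
 * file).
 */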
bool bch_btree_insert_check_key(struct btree *b, struct btree_op *op,
                                struct bio *bio)
{
        bool ret = false;
        uint64_t btree_ptr = b->key.ptr[0];
        unsigned long seq = b->seq;
        BKEY_PADDED(k) tmp;

        rw_unlock(false, b);
        rw_lock(true, b, b->level);

        if (b->key.ptr[0] != btree_ptr ||
            b->seq != seq + 1 ||
            should_split(b))
                goto out;

        op->replace = KEY(op->inode, bio_end(bio), bio_sectors(bio));

        SET_KEY_PTRS(&op->replace, 1);
        get_random_bytes(&op->replace.ptr[0], sizeof(uint64_t));

        SET_PTR_DEV(&op->replace, 0, PTR_CHECK_DEV);

        bkey_copy(&tmp.k, &op->replace);

        BUG_ON(op->type != BTREE_INSERT);
        BUG_ON(!btree_insert_key(b, op, &tmp.k));
        ret = true;
out:
        downgrade_write(&b->lock);
        return ret;
}

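/*
 * Split (or merely compact) node @b: a replacement node n1 is allocated and
 * the pending keys are inserted into it; if the result would fill more than
 * ~4/5 of the node, it is split roughly 3/5 - 2/5 into n1 and n2, and if @b
 * was the root a new root n3 one level up is allocated as well.  Pointers to
 * the new node(s) are appended to op->keys so the parent picks them up on the
 * way back up (in the root case the keylist is emptied and the new root is
 * installed directly).
 */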
static int btree_split(struct btree *b, struct btree_op *op)
{
        bool split, root = b == b->c->root;
        struct btree *n1, *n2 = NULL, *n3 = NULL;
        uint64_t start_time = local_clock();

        if (b->level)
                set_closure_blocking(&op->cl);

        n1 = btree_node_alloc_replacement(b, &op->cl);
        if (IS_ERR(n1))
                goto err;

        split = set_blocks(n1->sets[0].data, n1->c) > (btree_blocks(b) * 4) / 5;

        if (split) {
                unsigned keys = 0;

                trace_bcache_btree_node_split(b, n1->sets[0].data->keys);

                n2 = bch_btree_node_alloc(b->c, b->level, &op->cl);
                if (IS_ERR(n2))
                        goto err_free1;

                if (root) {
                        n3 = bch_btree_node_alloc(b->c, b->level + 1, &op->cl);
                        if (IS_ERR(n3))
                                goto err_free2;
                }

                bch_btree_insert_keys(n1, op);

                /* Has to be a linear search because we don't have an auxiliary
                 * search tree yet
                 */

                while (keys < (n1->sets[0].data->keys * 3) / 5)
                        keys += bkey_u64s(node(n1->sets[0].data, keys));

                bkey_copy_key(&n1->key, node(n1->sets[0].data, keys));
                keys += bkey_u64s(node(n1->sets[0].data, keys));

                n2->sets[0].data->keys = n1->sets[0].data->keys - keys;
                n1->sets[0].data->keys = keys;

                memcpy(n2->sets[0].data->start,
                       end(n1->sets[0].data),
                       n2->sets[0].data->keys * sizeof(uint64_t));

                bkey_copy_key(&n2->key, &b->key);

                bch_keylist_add(&op->keys, &n2->key);
                bch_btree_node_write(n2, &op->cl);
                rw_unlock(true, n2);
        } else {
                trace_bcache_btree_node_compact(b, n1->sets[0].data->keys);

                bch_btree_insert_keys(n1, op);
        }

        bch_keylist_add(&op->keys, &n1->key);
        bch_btree_node_write(n1, &op->cl);

        if (n3) {
                bkey_copy_key(&n3->key, &MAX_KEY);
                bch_btree_insert_keys(n3, op);
                bch_btree_node_write(n3, &op->cl);

                closure_sync(&op->cl);
                bch_btree_set_root(n3);
                rw_unlock(true, n3);
        } else if (root) {
                op->keys.top = op->keys.bottom;
                closure_sync(&op->cl);
                bch_btree_set_root(n1);
        } else {
                unsigned i;

                bkey_copy(op->keys.top, &b->key);
                bkey_copy_key(op->keys.top, &ZERO_KEY);

                for (i = 0; i < KEY_PTRS(&b->key); i++) {
                        uint8_t g = PTR_BUCKET(b->c, &b->key, i)->gen + 1;

                        SET_PTR_GEN(op->keys.top, i, g);
                }

                bch_keylist_push(&op->keys);
                closure_sync(&op->cl);
                atomic_inc(&b->c->prio_blocked);
        }

        rw_unlock(true, n1);
        btree_node_free(b, op);

        bch_time_stats_update(&b->c->btree_split_time, start_time);

        return 0;
err_free2:
        __bkey_put(n2->c, &n2->key);
        btree_node_free(n2, op);
        rw_unlock(true, n2);
err_free1:
        __bkey_put(n1->c, &n1->key);
        btree_node_free(n1, op);
        rw_unlock(true, n1);
err:
        if (n3 == ERR_PTR(-EAGAIN) ||
            n2 == ERR_PTR(-EAGAIN) ||
            n1 == ERR_PTR(-EAGAIN))
                return -EAGAIN;

        pr_warn("couldn't split");
        return -ENOMEM;
}

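/*
 * Recursive insert: on interior nodes, find the child that the first pending
 * key starts in; if the key extends past that child, the remainder is pushed
 * onto @stack_keys to be retried from the top, and the key is trimmed to fit.
 * After recursing, any keys left in op->keys (either the original insert at a
 * leaf, or pointers produced by a child split) are inserted into this node,
 * splitting it first if should_split() says it is too full.
 */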
static int bch_btree_insert_recurse(struct btree *b, struct btree_op *op,
                                    struct keylist *stack_keys)
{
        if (b->level) {
                int ret;
                struct bkey *insert = op->keys.bottom;
                struct bkey *k = bch_next_recurse_key(b, &START_KEY(insert));

                if (!k) {
                        btree_bug(b, "no key to recurse on at level %i/%i",
                                  b->level, b->c->root->level);

                        op->keys.top = op->keys.bottom;
                        return -EIO;
                }

                if (bkey_cmp(insert, k) > 0) {
                        unsigned i;

                        if (op->type == BTREE_REPLACE) {
                                __bkey_put(b->c, insert);
                                op->keys.top = op->keys.bottom;
                                op->insert_collision = true;
                                return 0;
                        }

                        for (i = 0; i < KEY_PTRS(insert); i++)
                                atomic_inc(&PTR_BUCKET(b->c, insert, i)->pin);

                        bkey_copy(stack_keys->top, insert);

                        bch_cut_back(k, insert);
                        bch_cut_front(k, stack_keys->top);

                        bch_keylist_push(stack_keys);
                }

                ret = btree(insert_recurse, k, b, op, stack_keys);
                if (ret)
                        return ret;
        }

        if (!bch_keylist_empty(&op->keys)) {
                if (should_split(b)) {
                        if (op->lock <= b->c->root->level) {
                                BUG_ON(b->level);
                                op->lock = b->c->root->level + 1;
                                return -EINTR;
                        }
                        return btree_split(b, op);
                }

                BUG_ON(write_block(b) != b->sets[b->nsets].data);

                if (bch_btree_insert_keys(b, op)) {
                        if (!b->level)
                                bch_btree_leaf_dirty(b, op);
                        else
                                bch_btree_node_write(b, &op->cl);
                }
        }

        return 0;
}

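/*
 * Top-level insert: op->keys is moved onto the local stack_keys list and fed
 * back into the recursive insert one key at a time, restarting from the root
 * with op->lock = 0 for each.  -EAGAIN (an allocation needs to wait) just
 * syncs on the closure and retries; any other error drops the remaining keys.
 */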
int bch_btree_insert(struct btree_op *op, struct cache_set *c)
{
        int ret = 0;
        struct keylist stack_keys;

        /*
         * Don't want to block with the btree locked unless we have to,
         * otherwise we get deadlocks with try_harder and between split/gc
         */
        clear_closure_blocking(&op->cl);

        BUG_ON(bch_keylist_empty(&op->keys));
        bch_keylist_copy(&stack_keys, &op->keys);
        bch_keylist_init(&op->keys);

        while (!bch_keylist_empty(&stack_keys) ||
               !bch_keylist_empty(&op->keys)) {
                if (bch_keylist_empty(&op->keys)) {
                        bch_keylist_add(&op->keys,
                                        bch_keylist_pop(&stack_keys));
                        op->lock = 0;
                }

                ret = btree_root(insert_recurse, c, op, &stack_keys);

                if (ret == -EAGAIN) {
                        ret = 0;
                        closure_sync(&op->cl);
                } else if (ret) {
                        struct bkey *k;

                        pr_err("error %i trying to insert key for %s",
                               ret, op_type(op));

                        while ((k = bch_keylist_pop(&stack_keys) ?:
                                    bch_keylist_pop(&op->keys)))
                                bkey_put(c, k, 0);
                }
        }

        bch_keylist_free(&stack_keys);

        if (op->journal)
                atomic_dec_bug(op->journal);
        op->journal = NULL;
        return ret;
}

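/*
 * Make @b the cache set's root node.  The node must already be written and
 * its buckets must have BTREE_PRIO; it is removed from the btree node LRU
 * list, c->root is updated, and a journal meta write is issued (which, as far
 * as this file shows, is what persists the new root location).
 */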
void bch_btree_set_root(struct btree *b)
{
        unsigned i;

        trace_bcache_btree_set_root(b);

        BUG_ON(!b->written);

        for (i = 0; i < KEY_PTRS(&b->key); i++)
                BUG_ON(PTR_BUCKET(b->c, &b->key, i)->prio != BTREE_PRIO);

        mutex_lock(&b->c->bucket_lock);
        list_del_init(&b->list);
        mutex_unlock(&b->c->bucket_lock);

        b->c->root = b;
        __bkey_put(b->c, &b->key);

        bch_journal_meta(b->c, NULL);
}

/* Cache lookup */

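/*
 * Submit reads from the backing device for the portion of the bio that lies
 * before key @k (i.e. sectors the cache holds no data for), by repeatedly
 * calling the cached device's cache_miss hook.
 */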
static int submit_partial_cache_miss(struct btree *b, struct btree_op *op,
                                     struct bkey *k)
{
        struct search *s = container_of(op, struct search, op);
        struct bio *bio = &s->bio.bio;
        int ret = 0;

        while (!ret &&
               !op->lookup_done) {
                unsigned sectors = INT_MAX;

                if (KEY_INODE(k) == op->inode) {
                        if (KEY_START(k) <= bio->bi_sector)
                                break;

                        sectors = min_t(uint64_t, sectors,
                                        KEY_START(k) - bio->bi_sector);
                }

                ret = s->d->cache_miss(b, s, bio, sectors);
        }

        return ret;
}

/*
 * Read from a single key, handling the initial cache miss if the key starts in
 * the middle of the bio
 */
static int submit_partial_cache_hit(struct btree *b, struct btree_op *op,
                                    struct bkey *k)
{
        struct search *s = container_of(op, struct search, op);
        struct bio *bio = &s->bio.bio;
        unsigned ptr;
        struct bio *n;

        int ret = submit_partial_cache_miss(b, op, k);
        if (ret || op->lookup_done)
                return ret;

        /* XXX: figure out best pointer - for multiple cache devices */
        ptr = 0;

        PTR_BUCKET(b->c, k, ptr)->prio = INITIAL_PRIO;

        while (!op->lookup_done &&
               KEY_INODE(k) == op->inode &&
               bio->bi_sector < KEY_OFFSET(k)) {
                struct bkey *bio_key;
                sector_t sector = PTR_OFFSET(k, ptr) +
                        (bio->bi_sector - KEY_START(k));
                unsigned sectors = min_t(uint64_t, INT_MAX,
                                         KEY_OFFSET(k) - bio->bi_sector);

                n = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split);
                if (!n)
                        return -EAGAIN;

                if (n == bio)
                        op->lookup_done = true;

                bio_key = &container_of(n, struct bbio, bio)->key;

                /*
                 * The bucket we're reading from might be reused while our bio
                 * is in flight, and we could then end up reading the wrong
                 * data.
                 *
                 * We guard against this by checking (in cache_read_endio()) if
                 * the pointer is stale again; if so, we treat it as an error
                 * and reread from the backing device (but we don't pass that
                 * error up anywhere).
                 */

                bch_bkey_copy_single_ptr(bio_key, k, ptr);
                SET_PTR_OFFSET(bio_key, 0, sector);

                n->bi_end_io = bch_cache_read_endio;
                n->bi_private = &s->cl;

                __bch_submit_bbio(n, b->c);
        }

        return 0;
}

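/*
 * Walk the keys of @b that cover the bio, recursing into child nodes on
 * interior levels and submitting cache hit/miss IO at the leaves, until the
 * whole bio has been handled (op->lookup_done) or an error occurs.
 */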
int bch_btree_search_recurse(struct btree *b, struct btree_op *op)
{
        struct search *s = container_of(op, struct search, op);
        struct bio *bio = &s->bio.bio;

        int ret = 0;
        struct bkey *k;
        struct btree_iter iter;
        bch_btree_iter_init(b, &iter, &KEY(op->inode, bio->bi_sector, 0));

        do {
                k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad);
                if (!k) {
                        /*
                         * b->key would be exactly what we want, except that
                         * pointers to btree nodes have nonzero size - we
                         * wouldn't go far enough
                         */

                        ret = submit_partial_cache_miss(b, op,
                                        &KEY(KEY_INODE(&b->key),
                                             KEY_OFFSET(&b->key), 0));
                        break;
                }

                ret = b->level
                        ? btree(search_recurse, k, b, op)
                        : submit_partial_cache_hit(b, op, k);
        } while (!ret &&
                 !op->lookup_done);

        return ret;
}

/* Keybuf code */

static inline int keybuf_cmp(struct keybuf_key *l, struct keybuf_key *r)
{
        /* Overlapping keys compare equal */
        if (bkey_cmp(&l->key, &START_KEY(&r->key)) <= 0)
                return -1;
        if (bkey_cmp(&START_KEY(&l->key), &r->key) >= 0)
                return 1;
        return 0;
}

static inline int keybuf_nonoverlapping_cmp(struct keybuf_key *l,
                                            struct keybuf_key *r)
{
        return clamp_t(int64_t, bkey_cmp(&l->key, &r->key), -1, 1);
}

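/*
 * Scan the btree from buf->last_scanned, copying keys that satisfy @pred into
 * the keybuf's rbtree until the freelist runs out, the scan reaches @end, or
 * there are no more keys.  A key that overlaps one already in the buffer
 * compares equal under keybuf_cmp(), so RB_INSERT fails and the entry is
 * freed again.
 */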
static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op,
                                   struct keybuf *buf, struct bkey *end,
                                   keybuf_pred_fn *pred)
{
        struct btree_iter iter;
        bch_btree_iter_init(b, &iter, &buf->last_scanned);

        while (!array_freelist_empty(&buf->freelist)) {
                struct bkey *k = bch_btree_iter_next_filter(&iter, b,
                                                            bch_ptr_bad);

                if (!b->level) {
                        if (!k) {
                                buf->last_scanned = b->key;
                                break;
                        }

                        buf->last_scanned = *k;
                        if (bkey_cmp(&buf->last_scanned, end) >= 0)
                                break;

                        if (pred(buf, k)) {
                                struct keybuf_key *w;

                                spin_lock(&buf->lock);

                                w = array_alloc(&buf->freelist);

                                w->private = NULL;
                                bkey_copy(&w->key, k);

                                if (RB_INSERT(&buf->keys, w, node, keybuf_cmp))
                                        array_free(&buf->freelist, w);

                                spin_unlock(&buf->lock);
                        }
                } else {
                        if (!k)
                                break;

                        btree(refill_keybuf, k, b, op, buf, end, pred);
                        /*
                         * Might get an error here, but can't really do anything
                         * and it'll get logged elsewhere. Just read what we
                         * can.
                         */

                        if (bkey_cmp(&buf->last_scanned, end) >= 0)
                                break;

                        cond_resched();
                }
        }

        return 0;
}

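/*
 * Refill @buf from the whole btree and update buf->start/buf->end to the
 * range of keys now held, so bch_keybuf_check_overlapping() below can cheaply
 * reject ranges that cannot overlap anything in the buffer.
 */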
void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf,
                       struct bkey *end, keybuf_pred_fn *pred)
{
        struct bkey start = buf->last_scanned;
        struct btree_op op;
        bch_btree_op_init_stack(&op);

        cond_resched();

        btree_root(refill_keybuf, c, &op, buf, end, pred);
        closure_sync(&op.cl);

        pr_debug("found %s keys from %llu:%llu to %llu:%llu",
                 RB_EMPTY_ROOT(&buf->keys) ? "no" :
                 array_freelist_empty(&buf->freelist) ? "some" : "a few",
                 KEY_INODE(&start), KEY_OFFSET(&start),
                 KEY_INODE(&buf->last_scanned), KEY_OFFSET(&buf->last_scanned));

        spin_lock(&buf->lock);

        if (!RB_EMPTY_ROOT(&buf->keys)) {
                struct keybuf_key *w;
                w = RB_FIRST(&buf->keys, struct keybuf_key, node);
                buf->start = START_KEY(&w->key);

                w = RB_LAST(&buf->keys, struct keybuf_key, node);
                buf->end = w->key;
        } else {
                buf->start = MAX_KEY;
                buf->end = MAX_KEY;
        }

        spin_unlock(&buf->lock);
}

static void __bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w)
{
        rb_erase(&w->node, &buf->keys);
        array_free(&buf->freelist, w);
}

void bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w)
{
        spin_lock(&buf->lock);
        __bch_keybuf_del(buf, w);
        spin_unlock(&buf->lock);
}

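/*
 * Returns true if any key in [start, end) is present in the buffer and in use
 * (w->private set); overlapping keys that are not in use are simply removed.
 */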
bool bch_keybuf_check_overlapping(struct keybuf *buf, struct bkey *start,
                                  struct bkey *end)
{
        bool ret = false;
        struct keybuf_key *p, *w, s;
        s.key = *start;

        if (bkey_cmp(end, &buf->start) <= 0 ||
            bkey_cmp(start, &buf->end) >= 0)
                return false;

        spin_lock(&buf->lock);
        w = RB_GREATER(&buf->keys, s, node, keybuf_nonoverlapping_cmp);

        while (w && bkey_cmp(&START_KEY(&w->key), end) < 0) {
                p = w;
                w = RB_NEXT(w, node);

                if (p->private)
                        ret = true;
                else
                        __bch_keybuf_del(buf, p);
        }

        spin_unlock(&buf->lock);
        return ret;
}

struct keybuf_key *bch_keybuf_next(struct keybuf *buf)
{
        struct keybuf_key *w;
        spin_lock(&buf->lock);

        w = RB_FIRST(&buf->keys, struct keybuf_key, node);

        while (w && w->private)
                w = RB_NEXT(w, node);

        if (w)
                w->private = ERR_PTR(-EINTR);

        spin_unlock(&buf->lock);
        return w;
}

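/*
 * Like bch_keybuf_next(), but refills the buffer from the btree and retries
 * whenever it comes up empty, until the scan has passed @end.
 */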
struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c,
                                          struct keybuf *buf,
                                          struct bkey *end,
                                          keybuf_pred_fn *pred)
{
        struct keybuf_key *ret;

        while (1) {
                ret = bch_keybuf_next(buf);
                if (ret)
                        break;

                if (bkey_cmp(&buf->last_scanned, end) >= 0) {
                        pr_debug("scan finished");
                        break;
                }

                bch_refill_keybuf(c, buf, end, pred);
        }

        return ret;
}

void bch_keybuf_init(struct keybuf *buf)
{
        buf->last_scanned = MAX_KEY;
        buf->keys = RB_ROOT;

        spin_lock_init(&buf->lock);
        array_allocator_init(&buf->freelist);
}

void bch_btree_exit(void)
{
        if (btree_io_wq)
                destroy_workqueue(btree_io_wq);
        if (bch_gc_wq)
                destroy_workqueue(bch_gc_wq);
}

int __init bch_btree_init(void)
{
        if (!(bch_gc_wq = create_singlethread_workqueue("bch_btree_gc")) ||
            !(btree_io_wq = create_singlethread_workqueue("bch_btree_io")))
                return -ENOMEM;

        return 0;
}