xfs_buf.c

  1. /*
  2. * Copyright (c) 2000-2006 Silicon Graphics, Inc.
  3. * All Rights Reserved.
  4. *
  5. * This program is free software; you can redistribute it and/or
  6. * modify it under the terms of the GNU General Public License as
  7. * published by the Free Software Foundation.
  8. *
  9. * This program is distributed in the hope that it would be useful,
  10. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  12. * GNU General Public License for more details.
  13. *
  14. * You should have received a copy of the GNU General Public License
  15. * along with this program; if not, write the Free Software Foundation,
  16. * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
  17. */
  18. #include "xfs.h"
  19. #include <linux/stddef.h>
  20. #include <linux/errno.h>
  21. #include <linux/gfp.h>
  22. #include <linux/pagemap.h>
  23. #include <linux/init.h>
  24. #include <linux/vmalloc.h>
  25. #include <linux/bio.h>
  26. #include <linux/sysctl.h>
  27. #include <linux/proc_fs.h>
  28. #include <linux/workqueue.h>
  29. #include <linux/percpu.h>
  30. #include <linux/blkdev.h>
  31. #include <linux/hash.h>
  32. #include <linux/kthread.h>
  33. #include <linux/migrate.h>
  34. #include <linux/backing-dev.h>
  35. #include <linux/freezer.h>
  36. #include "xfs_sb.h"
  37. #include "xfs_trans_resv.h"
  38. #include "xfs_log.h"
  39. #include "xfs_ag.h"
  40. #include "xfs_mount.h"
  41. #include "xfs_trace.h"
  42. static kmem_zone_t *xfs_buf_zone;
  43. static struct workqueue_struct *xfslogd_workqueue;
  44. #ifdef XFS_BUF_LOCK_TRACKING
  45. # define XB_SET_OWNER(bp) ((bp)->b_last_holder = current->pid)
  46. # define XB_CLEAR_OWNER(bp) ((bp)->b_last_holder = -1)
  47. # define XB_GET_OWNER(bp) ((bp)->b_last_holder)
  48. #else
  49. # define XB_SET_OWNER(bp) do { } while (0)
  50. # define XB_CLEAR_OWNER(bp) do { } while (0)
  51. # define XB_GET_OWNER(bp) do { } while (0)
  52. #endif
  53. #define xb_to_gfp(flags) \
  54. ((((flags) & XBF_READ_AHEAD) ? __GFP_NORETRY : GFP_NOFS) | __GFP_NOWARN)
  55. static inline int
  56. xfs_buf_is_vmapped(
  57. struct xfs_buf *bp)
  58. {
  59. /*
  60. * Return true if the buffer is vmapped.
  61. *
  62. * b_addr is null if the buffer is not mapped, but the code is clever
  63. * enough to avoid mapping single-page buffers, so the check has to
  64. * cover both b_addr and bp->b_page_count > 1.
  65. */
  66. return bp->b_addr && bp->b_page_count > 1;
  67. }
  68. static inline int
  69. xfs_buf_vmap_len(
  70. struct xfs_buf *bp)
  71. {
  72. return (bp->b_page_count * PAGE_SIZE) - bp->b_offset;
  73. }
  74. /*
  75. * xfs_buf_lru_add - add a buffer to the LRU.
  76. *
  77. * The LRU takes a new reference to the buffer so that it will only be freed
  78. * once the shrinker takes the buffer off the LRU.
  79. */
  80. static void
  81. xfs_buf_lru_add(
  82. struct xfs_buf *bp)
  83. {
  84. if (list_lru_add(&bp->b_target->bt_lru, &bp->b_lru)) {
  85. bp->b_lru_flags &= ~_XBF_LRU_DISPOSE;
  86. atomic_inc(&bp->b_hold);
  87. }
  88. }
  89. /*
  90. * xfs_buf_lru_del - remove a buffer from the LRU
  91. *
  92. * The unlocked check is safe here because it only occurs when there are no
  93. * b_lru_ref counts left on the buffer under the pag->pag_buf_lock. It is there
  94. * to optimise the shrinker removing the buffer from the LRU and calling
  95. * xfs_buf_free().
  96. */
  97. static void
  98. xfs_buf_lru_del(
  99. struct xfs_buf *bp)
  100. {
  101. list_lru_del(&bp->b_target->bt_lru, &bp->b_lru);
  102. }
  103. /*
  104. * When we mark a buffer stale, we remove the buffer from the LRU and clear the
  105. * b_lru_ref count so that the buffer is freed immediately when the buffer
  106. * reference count falls to zero. If the buffer is already on the LRU, we need
  107. * to remove the reference that LRU holds on the buffer.
  108. *
  109. * This prevents build-up of stale buffers on the LRU.
  110. */
  111. void
  112. xfs_buf_stale(
  113. struct xfs_buf *bp)
  114. {
  115. ASSERT(xfs_buf_islocked(bp));
  116. bp->b_flags |= XBF_STALE;
  117. /*
  118. * Clear the delwri status so that a delwri queue walker will not
  119. * flush this buffer to disk now that it is stale. The delwri queue has
  120. * a reference to the buffer, so this is safe to do.
  121. */
  122. bp->b_flags &= ~_XBF_DELWRI_Q;
  123. atomic_set(&(bp)->b_lru_ref, 0);
  124. if (!(bp->b_lru_flags & _XBF_LRU_DISPOSE) &&
  125. (list_lru_del(&bp->b_target->bt_lru, &bp->b_lru)))
  126. atomic_dec(&bp->b_hold);
  127. ASSERT(atomic_read(&bp->b_hold) >= 1);
  128. }
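/*
 * Example sketch: a caller that already holds the buffer locked can
 * invalidate it so it is torn down as soon as the last reference goes away.
 * The helper name example_invalidate_buf() is hypothetical; the pattern is
 * just "mark stale, then release" using the functions in this file.
 */
static void
example_invalidate_buf(
	struct xfs_buf	*bp)
{
	ASSERT(xfs_buf_islocked(bp));	/* xfs_buf_stale() asserts this too */
	xfs_buf_stale(bp);		/* take it off the LRU, zero b_lru_ref */
	xfs_buf_relse(bp);		/* unlock and drop our hold */
}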
  129. static int
  130. xfs_buf_get_maps(
  131. struct xfs_buf *bp,
  132. int map_count)
  133. {
  134. ASSERT(bp->b_maps == NULL);
  135. bp->b_map_count = map_count;
  136. if (map_count == 1) {
  137. bp->b_maps = &bp->__b_map;
  138. return 0;
  139. }
  140. bp->b_maps = kmem_zalloc(map_count * sizeof(struct xfs_buf_map),
  141. KM_NOFS);
  142. if (!bp->b_maps)
  143. return ENOMEM;
  144. return 0;
  145. }
  146. /*
  147. * Frees b_pages if it was allocated.
  148. */
  149. static void
  150. xfs_buf_free_maps(
  151. struct xfs_buf *bp)
  152. {
  153. if (bp->b_maps != &bp->__b_map) {
  154. kmem_free(bp->b_maps);
  155. bp->b_maps = NULL;
  156. }
  157. }
  158. struct xfs_buf *
  159. _xfs_buf_alloc(
  160. struct xfs_buftarg *target,
  161. struct xfs_buf_map *map,
  162. int nmaps,
  163. xfs_buf_flags_t flags)
  164. {
  165. struct xfs_buf *bp;
  166. int error;
  167. int i;
  168. bp = kmem_zone_zalloc(xfs_buf_zone, KM_NOFS);
  169. if (unlikely(!bp))
  170. return NULL;
  171. /*
  172. * We don't want certain flags to appear in b_flags unless they are
  173. * specifically set by later operations on the buffer.
  174. */
  175. flags &= ~(XBF_UNMAPPED | XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD);
  176. atomic_set(&bp->b_hold, 1);
  177. atomic_set(&bp->b_lru_ref, 1);
  178. init_completion(&bp->b_iowait);
  179. INIT_LIST_HEAD(&bp->b_lru);
  180. INIT_LIST_HEAD(&bp->b_list);
  181. RB_CLEAR_NODE(&bp->b_rbnode);
  182. sema_init(&bp->b_sema, 0); /* held, no waiters */
  183. XB_SET_OWNER(bp);
  184. bp->b_target = target;
  185. bp->b_flags = flags;
  186. /*
  187. * Set length and io_length to the same value initially.
  188. * I/O routines should use io_length, which will be the same in
  189. * most cases but may be reset (e.g. XFS recovery).
  190. */
  191. error = xfs_buf_get_maps(bp, nmaps);
  192. if (error) {
  193. kmem_zone_free(xfs_buf_zone, bp);
  194. return NULL;
  195. }
  196. bp->b_bn = map[0].bm_bn;
  197. bp->b_length = 0;
  198. for (i = 0; i < nmaps; i++) {
  199. bp->b_maps[i].bm_bn = map[i].bm_bn;
  200. bp->b_maps[i].bm_len = map[i].bm_len;
  201. bp->b_length += map[i].bm_len;
  202. }
  203. bp->b_io_length = bp->b_length;
  204. atomic_set(&bp->b_pin_count, 0);
  205. init_waitqueue_head(&bp->b_waiters);
  206. XFS_STATS_INC(xb_create);
  207. trace_xfs_buf_init(bp, _RET_IP_);
  208. return bp;
  209. }
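/*
 * Example sketch: a discontiguous buffer is described to _xfs_buf_alloc() (or,
 * more commonly, to xfs_buf_get_map()/xfs_buf_read_map()) as an array of
 * xfs_buf_map entries, and b_length ends up as the sum of the bm_len values.
 * The helper and the block numbers below are purely illustrative.
 */
static struct xfs_buf *
example_alloc_two_extent_buf(
	struct xfs_buftarg	*target,
	xfs_daddr_t		blk1,
	xfs_daddr_t		blk2)
{
	struct xfs_buf_map	map[2] = {
		{ .bm_bn = blk1, .bm_len = 8 },		/* 8 BBs at blk1 */
		{ .bm_bn = blk2, .bm_len = 8 },		/* 8 BBs at blk2 */
	};

	/* returns a buffer with b_map_count == 2 and b_length == 16 */
	return _xfs_buf_alloc(target, map, 2, 0);
}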
  210. /*
  211. * Allocate a page array capable of holding a specified number
  212. * of pages, and point the page buf at it.
  213. */
  214. STATIC int
  215. _xfs_buf_get_pages(
  216. xfs_buf_t *bp,
  217. int page_count,
  218. xfs_buf_flags_t flags)
  219. {
  220. /* Make sure that we have a page list */
  221. if (bp->b_pages == NULL) {
  222. bp->b_page_count = page_count;
  223. if (page_count <= XB_PAGES) {
  224. bp->b_pages = bp->b_page_array;
  225. } else {
  226. bp->b_pages = kmem_alloc(sizeof(struct page *) *
  227. page_count, KM_NOFS);
  228. if (bp->b_pages == NULL)
  229. return -ENOMEM;
  230. }
  231. memset(bp->b_pages, 0, sizeof(struct page *) * page_count);
  232. }
  233. return 0;
  234. }
  235. /*
  236. * Frees b_pages if it was allocated.
  237. */
  238. STATIC void
  239. _xfs_buf_free_pages(
  240. xfs_buf_t *bp)
  241. {
  242. if (bp->b_pages != bp->b_page_array) {
  243. kmem_free(bp->b_pages);
  244. bp->b_pages = NULL;
  245. }
  246. }
  247. /*
  248. * Releases the specified buffer.
  249. *
  250. * The modification state of any associated pages is left unchanged.
  251. * The buffer must not be on any hash - use xfs_buf_rele instead for
  252. * hashed and refcounted buffers
  253. */
  254. void
  255. xfs_buf_free(
  256. xfs_buf_t *bp)
  257. {
  258. trace_xfs_buf_free(bp, _RET_IP_);
  259. ASSERT(list_empty(&bp->b_lru));
  260. if (bp->b_flags & _XBF_PAGES) {
  261. uint i;
  262. if (xfs_buf_is_vmapped(bp))
  263. vm_unmap_ram(bp->b_addr - bp->b_offset,
  264. bp->b_page_count);
  265. for (i = 0; i < bp->b_page_count; i++) {
  266. struct page *page = bp->b_pages[i];
  267. __free_page(page);
  268. }
  269. } else if (bp->b_flags & _XBF_KMEM)
  270. kmem_free(bp->b_addr);
  271. _xfs_buf_free_pages(bp);
  272. xfs_buf_free_maps(bp);
  273. kmem_zone_free(xfs_buf_zone, bp);
  274. }
  275. /*
  276. * Allocates all the pages for the buffer in question and builds its page list.
  277. */
  278. STATIC int
  279. xfs_buf_allocate_memory(
  280. xfs_buf_t *bp,
  281. uint flags)
  282. {
  283. size_t size;
  284. size_t nbytes, offset;
  285. gfp_t gfp_mask = xb_to_gfp(flags);
  286. unsigned short page_count, i;
  287. xfs_off_t start, end;
  288. int error;
  289. /*
  290. * for buffers that are contained within a single page, just allocate
  291. * the memory from the heap - there's no need for the complexity of
  292. * page arrays to keep allocation down to order 0.
  293. */
  294. size = BBTOB(bp->b_length);
  295. if (size < PAGE_SIZE) {
  296. bp->b_addr = kmem_alloc(size, KM_NOFS);
  297. if (!bp->b_addr) {
  298. /* low memory - use alloc_page loop instead */
  299. goto use_alloc_page;
  300. }
  301. if (((unsigned long)(bp->b_addr + size - 1) & PAGE_MASK) !=
  302. ((unsigned long)bp->b_addr & PAGE_MASK)) {
  303. /* b_addr spans two pages - use alloc_page instead */
  304. kmem_free(bp->b_addr);
  305. bp->b_addr = NULL;
  306. goto use_alloc_page;
  307. }
  308. bp->b_offset = offset_in_page(bp->b_addr);
  309. bp->b_pages = bp->b_page_array;
  310. bp->b_pages[0] = virt_to_page(bp->b_addr);
  311. bp->b_page_count = 1;
  312. bp->b_flags |= _XBF_KMEM;
  313. return 0;
  314. }
  315. use_alloc_page:
  316. start = BBTOB(bp->b_maps[0].bm_bn) >> PAGE_SHIFT;
  317. end = (BBTOB(bp->b_maps[0].bm_bn + bp->b_length) + PAGE_SIZE - 1)
  318. >> PAGE_SHIFT;
  319. page_count = end - start;
  320. error = _xfs_buf_get_pages(bp, page_count, flags);
  321. if (unlikely(error))
  322. return error;
  323. offset = bp->b_offset;
  324. bp->b_flags |= _XBF_PAGES;
  325. for (i = 0; i < bp->b_page_count; i++) {
  326. struct page *page;
  327. uint retries = 0;
  328. retry:
  329. page = alloc_page(gfp_mask);
  330. if (unlikely(page == NULL)) {
  331. if (flags & XBF_READ_AHEAD) {
  332. bp->b_page_count = i;
  333. error = ENOMEM;
  334. goto out_free_pages;
  335. }
  336. /*
  337. * This could deadlock.
  338. *
  339. * But until all the XFS lowlevel code is revamped to
  340. * handle buffer allocation failures we can't do much.
  341. */
  342. if (!(++retries % 100))
  343. xfs_err(NULL,
  344. "possible memory allocation deadlock in %s (mode:0x%x)",
  345. __func__, gfp_mask);
  346. XFS_STATS_INC(xb_page_retries);
  347. congestion_wait(BLK_RW_ASYNC, HZ/50);
  348. goto retry;
  349. }
  350. XFS_STATS_INC(xb_page_found);
  351. nbytes = min_t(size_t, size, PAGE_SIZE - offset);
  352. size -= nbytes;
  353. bp->b_pages[i] = page;
  354. offset = 0;
  355. }
  356. return 0;
  357. out_free_pages:
  358. for (i = 0; i < bp->b_page_count; i++)
  359. __free_page(bp->b_pages[i]);
  360. return error;
  361. }
  362. /*
  363. * Map buffer into kernel address-space if necessary.
  364. */
  365. STATIC int
  366. _xfs_buf_map_pages(
  367. xfs_buf_t *bp,
  368. uint flags)
  369. {
  370. ASSERT(bp->b_flags & _XBF_PAGES);
  371. if (bp->b_page_count == 1) {
  372. /* A single page buffer is always mappable */
  373. bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset;
  374. } else if (flags & XBF_UNMAPPED) {
  375. bp->b_addr = NULL;
  376. } else {
  377. int retried = 0;
  378. do {
  379. bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
  380. -1, PAGE_KERNEL);
  381. if (bp->b_addr)
  382. break;
  383. vm_unmap_aliases();
  384. } while (retried++ <= 1);
  385. if (!bp->b_addr)
  386. return -ENOMEM;
  387. bp->b_addr += bp->b_offset;
  388. }
  389. return 0;
  390. }
  391. /*
  392. * Finding and Reading Buffers
  393. */
  394. /*
  395. * Looks up, and creates if absent, a lockable buffer for
  396. * a given disk address range of a buffer target. The buffer is returned
  397. * locked. No I/O is implied by this call.
  398. */
  399. xfs_buf_t *
  400. _xfs_buf_find(
  401. struct xfs_buftarg *btp,
  402. struct xfs_buf_map *map,
  403. int nmaps,
  404. xfs_buf_flags_t flags,
  405. xfs_buf_t *new_bp)
  406. {
  407. size_t numbytes;
  408. struct xfs_perag *pag;
  409. struct rb_node **rbp;
  410. struct rb_node *parent;
  411. xfs_buf_t *bp;
  412. xfs_daddr_t blkno = map[0].bm_bn;
  413. xfs_daddr_t eofs;
  414. int numblks = 0;
  415. int i;
  416. for (i = 0; i < nmaps; i++)
  417. numblks += map[i].bm_len;
  418. numbytes = BBTOB(numblks);
  419. /* Check for IOs smaller than the sector size / not sector aligned */
  420. ASSERT(!(numbytes < (1 << btp->bt_sshift)));
  421. ASSERT(!(BBTOB(blkno) & (xfs_off_t)btp->bt_smask));
  422. /*
  423. * Corrupted block numbers can get through to here, unfortunately, so we
  424. * have to check that the buffer falls within the filesystem bounds.
  425. */
  426. eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks);
  427. if (blkno >= eofs) {
  428. /*
  429. * XXX (dgc): we should really be returning EFSCORRUPTED here,
  430. * but none of the higher level infrastructure supports
  431. * returning a specific error on buffer lookup failures.
  432. */
  433. xfs_alert(btp->bt_mount,
  434. "%s: Block out of range: block 0x%llx, EOFS 0x%llx ",
  435. __func__, blkno, eofs);
  436. WARN_ON(1);
  437. return NULL;
  438. }
  439. /* get tree root */
  440. pag = xfs_perag_get(btp->bt_mount,
  441. xfs_daddr_to_agno(btp->bt_mount, blkno));
  442. /* walk tree */
  443. spin_lock(&pag->pag_buf_lock);
  444. rbp = &pag->pag_buf_tree.rb_node;
  445. parent = NULL;
  446. bp = NULL;
  447. while (*rbp) {
  448. parent = *rbp;
  449. bp = rb_entry(parent, struct xfs_buf, b_rbnode);
  450. if (blkno < bp->b_bn)
  451. rbp = &(*rbp)->rb_left;
  452. else if (blkno > bp->b_bn)
  453. rbp = &(*rbp)->rb_right;
  454. else {
  455. /*
  456. * found a block number match. If the range doesn't
  457. * match, the only way this is allowed is if the buffer
  458. * in the cache is stale and the transaction that made
  459. * it stale has not yet committed. i.e. we are
  460. * reallocating a busy extent. Skip this buffer and
  461. * continue searching to the right for an exact match.
  462. */
  463. if (bp->b_length != numblks) {
  464. ASSERT(bp->b_flags & XBF_STALE);
  465. rbp = &(*rbp)->rb_right;
  466. continue;
  467. }
  468. atomic_inc(&bp->b_hold);
  469. goto found;
  470. }
  471. }
  472. /* No match found */
  473. if (new_bp) {
  474. rb_link_node(&new_bp->b_rbnode, parent, rbp);
  475. rb_insert_color(&new_bp->b_rbnode, &pag->pag_buf_tree);
  476. /* the buffer keeps the perag reference until it is freed */
  477. new_bp->b_pag = pag;
  478. spin_unlock(&pag->pag_buf_lock);
  479. } else {
  480. XFS_STATS_INC(xb_miss_locked);
  481. spin_unlock(&pag->pag_buf_lock);
  482. xfs_perag_put(pag);
  483. }
  484. return new_bp;
  485. found:
  486. spin_unlock(&pag->pag_buf_lock);
  487. xfs_perag_put(pag);
  488. if (!xfs_buf_trylock(bp)) {
  489. if (flags & XBF_TRYLOCK) {
  490. xfs_buf_rele(bp);
  491. XFS_STATS_INC(xb_busy_locked);
  492. return NULL;
  493. }
  494. xfs_buf_lock(bp);
  495. XFS_STATS_INC(xb_get_locked_waited);
  496. }
  497. /*
  498. * if the buffer is stale, clear all the external state associated with
  499. * it. We need to keep flags such as how we allocated the buffer memory
  500. * intact here.
  501. */
  502. if (bp->b_flags & XBF_STALE) {
  503. ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
  504. ASSERT(bp->b_iodone == NULL);
  505. bp->b_flags &= _XBF_KMEM | _XBF_PAGES;
  506. bp->b_ops = NULL;
  507. }
  508. trace_xfs_buf_find(bp, flags, _RET_IP_);
  509. XFS_STATS_INC(xb_get_locked);
  510. return bp;
  511. }
  512. /*
  513. * Assembles a buffer covering the specified range. The code is optimised for
  514. * cache hits, as metadata intensive workloads will see 3 orders of magnitude
  515. * more hits than misses.
  516. */
  517. struct xfs_buf *
  518. xfs_buf_get_map(
  519. struct xfs_buftarg *target,
  520. struct xfs_buf_map *map,
  521. int nmaps,
  522. xfs_buf_flags_t flags)
  523. {
  524. struct xfs_buf *bp;
  525. struct xfs_buf *new_bp;
  526. int error = 0;
  527. bp = _xfs_buf_find(target, map, nmaps, flags, NULL);
  528. if (likely(bp))
  529. goto found;
  530. new_bp = _xfs_buf_alloc(target, map, nmaps, flags);
  531. if (unlikely(!new_bp))
  532. return NULL;
  533. error = xfs_buf_allocate_memory(new_bp, flags);
  534. if (error) {
  535. xfs_buf_free(new_bp);
  536. return NULL;
  537. }
  538. bp = _xfs_buf_find(target, map, nmaps, flags, new_bp);
  539. if (!bp) {
  540. xfs_buf_free(new_bp);
  541. return NULL;
  542. }
  543. if (bp != new_bp)
  544. xfs_buf_free(new_bp);
  545. found:
  546. if (!bp->b_addr) {
  547. error = _xfs_buf_map_pages(bp, flags);
  548. if (unlikely(error)) {
  549. xfs_warn(target->bt_mount,
  550. "%s: failed to map pages\n", __func__);
  551. xfs_buf_relse(bp);
  552. return NULL;
  553. }
  554. }
  555. XFS_STATS_INC(xb_get);
  556. trace_xfs_buf_get(bp, flags, _RET_IP_);
  557. return bp;
  558. }
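/*
 * Example sketch: the common single-extent lookup. A caller builds a one-entry
 * map with DEFINE_SINGLE_BUF_MAP(), gets back a locked buffer and releases it
 * when done. The helper name and its blkno/numblks parameters are illustrative.
 */
static int
example_get_and_release(
	struct xfs_buftarg	*target,
	xfs_daddr_t		blkno,
	int			numblks)
{
	struct xfs_buf		*bp;
	DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);

	bp = xfs_buf_get_map(target, &map, 1, 0);
	if (!bp)
		return ENOMEM;

	/* ... use bp->b_addr (mapped unless XBF_UNMAPPED was requested) ... */

	xfs_buf_relse(bp);	/* unlock and drop the reference */
	return 0;
}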
  559. STATIC int
  560. _xfs_buf_read(
  561. xfs_buf_t *bp,
  562. xfs_buf_flags_t flags)
  563. {
  564. ASSERT(!(flags & XBF_WRITE));
  565. ASSERT(bp->b_maps[0].bm_bn != XFS_BUF_DADDR_NULL);
  566. bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD);
  567. bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD);
  568. xfs_buf_iorequest(bp);
  569. if (flags & XBF_ASYNC)
  570. return 0;
  571. return xfs_buf_iowait(bp);
  572. }
  573. xfs_buf_t *
  574. xfs_buf_read_map(
  575. struct xfs_buftarg *target,
  576. struct xfs_buf_map *map,
  577. int nmaps,
  578. xfs_buf_flags_t flags,
  579. const struct xfs_buf_ops *ops)
  580. {
  581. struct xfs_buf *bp;
  582. flags |= XBF_READ;
  583. bp = xfs_buf_get_map(target, map, nmaps, flags);
  584. if (bp) {
  585. trace_xfs_buf_read(bp, flags, _RET_IP_);
  586. if (!XFS_BUF_ISDONE(bp)) {
  587. XFS_STATS_INC(xb_get_read);
  588. bp->b_ops = ops;
  589. _xfs_buf_read(bp, flags);
  590. } else if (flags & XBF_ASYNC) {
  591. /*
  592. * Read ahead call which is already satisfied,
  593. * drop the buffer
  594. */
  595. xfs_buf_relse(bp);
  596. return NULL;
  597. } else {
  598. /* We do not want read in the flags */
  599. bp->b_flags &= ~XBF_READ;
  600. }
  601. }
  602. return bp;
  603. }
  604. /*
  605. * If we are not low on memory then do the readahead in a deadlock
  606. * safe manner.
  607. */
  608. void
  609. xfs_buf_readahead_map(
  610. struct xfs_buftarg *target,
  611. struct xfs_buf_map *map,
  612. int nmaps,
  613. const struct xfs_buf_ops *ops)
  614. {
  615. if (bdi_read_congested(target->bt_bdi))
  616. return;
  617. xfs_buf_read_map(target, map, nmaps,
  618. XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD, ops);
  619. }
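/*
 * Example sketch: readahead is asynchronous and best-effort; a caller issues
 * it early and later performs a normal blocking read of the same range, which
 * finds the buffer already cached if the readahead completed. The ops pointer
 * carries the verifier, exactly as in xfs_buf_read_map() above. The helper
 * name is hypothetical.
 */
static struct xfs_buf *
example_readahead_then_read(
	struct xfs_buftarg		*target,
	xfs_daddr_t			blkno,
	int				numblks,
	const struct xfs_buf_ops	*ops)
{
	DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);

	xfs_buf_readahead_map(target, &map, 1, ops);	/* may do nothing */

	/* ... other work while the read (possibly) proceeds ... */

	return xfs_buf_read_map(target, &map, 1, 0, ops);
}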
  620. /*
  621. * Read an uncached buffer from disk. Allocates and returns a locked
  622. * buffer containing the disk contents or nothing.
  623. */
  624. struct xfs_buf *
  625. xfs_buf_read_uncached(
  626. struct xfs_buftarg *target,
  627. xfs_daddr_t daddr,
  628. size_t numblks,
  629. int flags,
  630. const struct xfs_buf_ops *ops)
  631. {
  632. struct xfs_buf *bp;
  633. bp = xfs_buf_get_uncached(target, numblks, flags);
  634. if (!bp)
  635. return NULL;
  636. /* set up the buffer for a read IO */
  637. ASSERT(bp->b_map_count == 1);
  638. bp->b_bn = daddr;
  639. bp->b_maps[0].bm_bn = daddr;
  640. bp->b_flags |= XBF_READ;
  641. bp->b_ops = ops;
  642. xfsbdstrat(target->bt_mount, bp);
  643. xfs_buf_iowait(bp);
  644. return bp;
  645. }
  646. /*
  647. * Return a buffer allocated as an empty buffer and associated to external
  648. * memory via xfs_buf_associate_memory() back to its empty state.
  649. */
  650. void
  651. xfs_buf_set_empty(
  652. struct xfs_buf *bp,
  653. size_t numblks)
  654. {
  655. if (bp->b_pages)
  656. _xfs_buf_free_pages(bp);
  657. bp->b_pages = NULL;
  658. bp->b_page_count = 0;
  659. bp->b_addr = NULL;
  660. bp->b_length = numblks;
  661. bp->b_io_length = numblks;
  662. ASSERT(bp->b_map_count == 1);
  663. bp->b_bn = XFS_BUF_DADDR_NULL;
  664. bp->b_maps[0].bm_bn = XFS_BUF_DADDR_NULL;
  665. bp->b_maps[0].bm_len = bp->b_length;
  666. }
  667. static inline struct page *
  668. mem_to_page(
  669. void *addr)
  670. {
  671. if ((!is_vmalloc_addr(addr))) {
  672. return virt_to_page(addr);
  673. } else {
  674. return vmalloc_to_page(addr);
  675. }
  676. }
  677. int
  678. xfs_buf_associate_memory(
  679. xfs_buf_t *bp,
  680. void *mem,
  681. size_t len)
  682. {
  683. int rval;
  684. int i = 0;
  685. unsigned long pageaddr;
  686. unsigned long offset;
  687. size_t buflen;
  688. int page_count;
  689. pageaddr = (unsigned long)mem & PAGE_MASK;
  690. offset = (unsigned long)mem - pageaddr;
  691. buflen = PAGE_ALIGN(len + offset);
  692. page_count = buflen >> PAGE_SHIFT;
  693. /* Free any previous set of page pointers */
  694. if (bp->b_pages)
  695. _xfs_buf_free_pages(bp);
  696. bp->b_pages = NULL;
  697. bp->b_addr = mem;
  698. rval = _xfs_buf_get_pages(bp, page_count, 0);
  699. if (rval)
  700. return rval;
  701. bp->b_offset = offset;
  702. for (i = 0; i < bp->b_page_count; i++) {
  703. bp->b_pages[i] = mem_to_page((void *)pageaddr);
  704. pageaddr += PAGE_SIZE;
  705. }
  706. bp->b_io_length = BTOBB(len);
  707. bp->b_length = BTOBB(buflen);
  708. return 0;
  709. }
  710. xfs_buf_t *
  711. xfs_buf_get_uncached(
  712. struct xfs_buftarg *target,
  713. size_t numblks,
  714. int flags)
  715. {
  716. unsigned long page_count;
  717. int error, i;
  718. struct xfs_buf *bp;
  719. DEFINE_SINGLE_BUF_MAP(map, XFS_BUF_DADDR_NULL, numblks);
  720. bp = _xfs_buf_alloc(target, &map, 1, 0);
  721. if (unlikely(bp == NULL))
  722. goto fail;
  723. page_count = PAGE_ALIGN(numblks << BBSHIFT) >> PAGE_SHIFT;
  724. error = _xfs_buf_get_pages(bp, page_count, 0);
  725. if (error)
  726. goto fail_free_buf;
  727. for (i = 0; i < page_count; i++) {
  728. bp->b_pages[i] = alloc_page(xb_to_gfp(flags));
  729. if (!bp->b_pages[i])
  730. goto fail_free_mem;
  731. }
  732. bp->b_flags |= _XBF_PAGES;
  733. error = _xfs_buf_map_pages(bp, 0);
  734. if (unlikely(error)) {
  735. xfs_warn(target->bt_mount,
  736. "%s: failed to map pages\n", __func__);
  737. goto fail_free_mem;
  738. }
  739. trace_xfs_buf_get_uncached(bp, _RET_IP_);
  740. return bp;
  741. fail_free_mem:
  742. while (--i >= 0)
  743. __free_page(bp->b_pages[i]);
  744. _xfs_buf_free_pages(bp);
  745. fail_free_buf:
  746. xfs_buf_free_maps(bp);
  747. kmem_zone_free(xfs_buf_zone, bp);
  748. fail:
  749. return NULL;
  750. }
  751. /*
  752. * Increment reference count on buffer, to hold the buffer concurrently
  753. * with another thread which may release (free) the buffer asynchronously.
  754. * Must hold the buffer already to call this function.
  755. */
  756. void
  757. xfs_buf_hold(
  758. xfs_buf_t *bp)
  759. {
  760. trace_xfs_buf_hold(bp, _RET_IP_);
  761. atomic_inc(&bp->b_hold);
  762. }
  763. /*
  764. * Releases a hold on the specified buffer. If the
  765. * hold count is 1, calls xfs_buf_free.
  766. */
  767. void
  768. xfs_buf_rele(
  769. xfs_buf_t *bp)
  770. {
  771. struct xfs_perag *pag = bp->b_pag;
  772. trace_xfs_buf_rele(bp, _RET_IP_);
  773. if (!pag) {
  774. ASSERT(list_empty(&bp->b_lru));
  775. ASSERT(RB_EMPTY_NODE(&bp->b_rbnode));
  776. if (atomic_dec_and_test(&bp->b_hold))
  777. xfs_buf_free(bp);
  778. return;
  779. }
  780. ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode));
  781. ASSERT(atomic_read(&bp->b_hold) > 0);
  782. if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) {
  783. if (!(bp->b_flags & XBF_STALE) &&
  784. atomic_read(&bp->b_lru_ref)) {
  785. xfs_buf_lru_add(bp);
  786. spin_unlock(&pag->pag_buf_lock);
  787. } else {
  788. xfs_buf_lru_del(bp);
  789. ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
  790. rb_erase(&bp->b_rbnode, &pag->pag_buf_tree);
  791. spin_unlock(&pag->pag_buf_lock);
  792. xfs_perag_put(pag);
  793. xfs_buf_free(bp);
  794. }
  795. }
  796. }
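/*
 * Example sketch: xfs_buf_hold()/xfs_buf_rele() bracket any period where a
 * buffer pointer outlives the reference the caller already owns, e.g. when
 * stashing it in a longer-lived structure. The static pointer below is a
 * hypothetical stand-in for such a structure.
 */
static struct xfs_buf *example_cached_bp;	/* hypothetical long-lived pointer */

static void
example_cache_reference(
	struct xfs_buf	*bp)
{
	xfs_buf_hold(bp);		/* the stored pointer owns a reference */
	example_cached_bp = bp;
}

static void
example_drop_reference(void)
{
	xfs_buf_rele(example_cached_bp);
	example_cached_bp = NULL;
}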
  797. /*
  798. * Lock a buffer object, if it is not already locked.
  799. *
  800. * If we come across a stale, pinned, locked buffer, we know that we are
  801. * being asked to lock a buffer that has been reallocated. Because it is
  802. * pinned, we know that the log has not been pushed to disk and hence it
  803. * will still be locked. Rather than continuing to have trylock attempts
  804. * fail until someone else pushes the log, push it ourselves before
  805. * returning. This means that the xfsaild will not get stuck trying
  806. * to push on stale inode buffers.
  807. */
  808. int
  809. xfs_buf_trylock(
  810. struct xfs_buf *bp)
  811. {
  812. int locked;
  813. locked = down_trylock(&bp->b_sema) == 0;
  814. if (locked)
  815. XB_SET_OWNER(bp);
  816. trace_xfs_buf_trylock(bp, _RET_IP_);
  817. return locked;
  818. }
  819. /*
  820. * Lock a buffer object.
  821. *
  822. * If we come across a stale, pinned, locked buffer, we know that we
  823. * are being asked to lock a buffer that has been reallocated. Because
  824. * it is pinned, we know that the log has not been pushed to disk and
  825. * hence it will still be locked. Rather than sleeping until someone
  826. * else pushes the log, push it ourselves before trying to get the lock.
  827. */
  828. void
  829. xfs_buf_lock(
  830. struct xfs_buf *bp)
  831. {
  832. trace_xfs_buf_lock(bp, _RET_IP_);
  833. if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
  834. xfs_log_force(bp->b_target->bt_mount, 0);
  835. down(&bp->b_sema);
  836. XB_SET_OWNER(bp);
  837. trace_xfs_buf_lock_done(bp, _RET_IP_);
  838. }
  839. void
  840. xfs_buf_unlock(
  841. struct xfs_buf *bp)
  842. {
  843. XB_CLEAR_OWNER(bp);
  844. up(&bp->b_sema);
  845. trace_xfs_buf_unlock(bp, _RET_IP_);
  846. }
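/*
 * Example sketch: the usual way the trylock/lock pair above gets used -
 * opportunistic callers back off on contention, everyone else blocks, and
 * xfs_buf_lock() takes care of pushing the log for stale, pinned buffers.
 * The helper name and its can_block parameter are hypothetical.
 */
static int
example_lock_buf(
	struct xfs_buf	*bp,
	bool		can_block)
{
	if (xfs_buf_trylock(bp))
		return 0;
	if (!can_block)
		return EAGAIN;		/* positive errno, as elsewhere in this file */
	xfs_buf_lock(bp);
	return 0;
}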
  847. STATIC void
  848. xfs_buf_wait_unpin(
  849. xfs_buf_t *bp)
  850. {
  851. DECLARE_WAITQUEUE (wait, current);
  852. if (atomic_read(&bp->b_pin_count) == 0)
  853. return;
  854. add_wait_queue(&bp->b_waiters, &wait);
  855. for (;;) {
  856. set_current_state(TASK_UNINTERRUPTIBLE);
  857. if (atomic_read(&bp->b_pin_count) == 0)
  858. break;
  859. io_schedule();
  860. }
  861. remove_wait_queue(&bp->b_waiters, &wait);
  862. set_current_state(TASK_RUNNING);
  863. }
  864. /*
  865. * Buffer Utility Routines
  866. */
  867. STATIC void
  868. xfs_buf_iodone_work(
  869. struct work_struct *work)
  870. {
  871. struct xfs_buf *bp =
  872. container_of(work, xfs_buf_t, b_iodone_work);
  873. bool read = !!(bp->b_flags & XBF_READ);
  874. bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);
  875. /* only validate buffers that were read without errors */
  876. if (read && bp->b_ops && !bp->b_error && (bp->b_flags & XBF_DONE))
  877. bp->b_ops->verify_read(bp);
  878. if (bp->b_iodone)
  879. (*(bp->b_iodone))(bp);
  880. else if (bp->b_flags & XBF_ASYNC)
  881. xfs_buf_relse(bp);
  882. else {
  883. ASSERT(read && bp->b_ops);
  884. complete(&bp->b_iowait);
  885. }
  886. }
  887. void
  888. xfs_buf_ioend(
  889. struct xfs_buf *bp,
  890. int schedule)
  891. {
  892. bool read = !!(bp->b_flags & XBF_READ);
  893. trace_xfs_buf_iodone(bp, _RET_IP_);
  894. if (bp->b_error == 0)
  895. bp->b_flags |= XBF_DONE;
  896. if (bp->b_iodone || (read && bp->b_ops) || (bp->b_flags & XBF_ASYNC)) {
  897. if (schedule) {
  898. INIT_WORK(&bp->b_iodone_work, xfs_buf_iodone_work);
  899. queue_work(xfslogd_workqueue, &bp->b_iodone_work);
  900. } else {
  901. xfs_buf_iodone_work(&bp->b_iodone_work);
  902. }
  903. } else {
  904. bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);
  905. complete(&bp->b_iowait);
  906. }
  907. }
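/*
 * Example sketch: a completion callback attached through b_iodone. It is
 * invoked from xfs_buf_iodone_work()/xfs_buf_ioend() above; the usual
 * convention is to clear b_iodone and call xfs_buf_ioend() again so the
 * "no callback" branch releases or completes the buffer. The callback itself
 * is hypothetical, not one of the real XFS iodone handlers.
 */
static void
example_buf_iodone(
	struct xfs_buf	*bp)
{
	if (bp->b_error)
		xfs_buf_ioerror_alert(bp, __func__);

	bp->b_iodone = NULL;
	xfs_buf_ioend(bp, 0);
}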
  908. void
  909. xfs_buf_ioerror(
  910. xfs_buf_t *bp,
  911. int error)
  912. {
  913. ASSERT(error >= 0 && error <= 0xffff);
  914. bp->b_error = (unsigned short)error;
  915. trace_xfs_buf_ioerror(bp, error, _RET_IP_);
  916. }
  917. void
  918. xfs_buf_ioerror_alert(
  919. struct xfs_buf *bp,
  920. const char *func)
  921. {
  922. xfs_alert(bp->b_target->bt_mount,
  923. "metadata I/O error: block 0x%llx (\"%s\") error %d numblks %d",
  924. (__uint64_t)XFS_BUF_ADDR(bp), func, bp->b_error, bp->b_length);
  925. }
  926. /*
  927. * Called when we want to stop a buffer from getting written or read.
  928. * We attach the EIO error, muck with its flags, and call xfs_buf_ioend
  929. * so that the proper iodone callbacks get called.
  930. */
  931. STATIC int
  932. xfs_bioerror(
  933. xfs_buf_t *bp)
  934. {
  935. #ifdef XFSERRORDEBUG
  936. ASSERT(XFS_BUF_ISREAD(bp) || bp->b_iodone);
  937. #endif
  938. /*
  939. * No need to wait until the buffer is unpinned, we aren't flushing it.
  940. */
  941. xfs_buf_ioerror(bp, EIO);
  942. /*
  943. * We're calling xfs_buf_ioend, so delete XBF_DONE flag.
  944. */
  945. XFS_BUF_UNREAD(bp);
  946. XFS_BUF_UNDONE(bp);
  947. xfs_buf_stale(bp);
  948. xfs_buf_ioend(bp, 0);
  949. return EIO;
  950. }
  951. /*
  952. * Same as xfs_bioerror, except that we are releasing the buffer
  953. * here ourselves, and avoiding the xfs_buf_ioend call.
  954. * This is meant for userdata errors; metadata bufs come with
  955. * iodone functions attached, so that we can track down errors.
  956. */
  957. STATIC int
  958. xfs_bioerror_relse(
  959. struct xfs_buf *bp)
  960. {
  961. int64_t fl = bp->b_flags;
  962. /*
  963. * No need to wait until the buffer is unpinned.
  964. * We aren't flushing it.
  965. *
  966. * chunkhold expects B_DONE to be set, whether
  967. * we actually finish the I/O or not. We don't want to
  968. * change that interface.
  969. */
  970. XFS_BUF_UNREAD(bp);
  971. XFS_BUF_DONE(bp);
  972. xfs_buf_stale(bp);
  973. bp->b_iodone = NULL;
  974. if (!(fl & XBF_ASYNC)) {
  975. /*
  976. * Mark b_error and B_ERROR _both_.
  977. * Lots of chunkcache code assumes that.
  978. * There's no reason to mark error for
  979. * ASYNC buffers.
  980. */
  981. xfs_buf_ioerror(bp, EIO);
  982. complete(&bp->b_iowait);
  983. } else {
  984. xfs_buf_relse(bp);
  985. }
  986. return EIO;
  987. }
  988. STATIC int
  989. xfs_bdstrat_cb(
  990. struct xfs_buf *bp)
  991. {
  992. if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
  993. trace_xfs_bdstrat_shut(bp, _RET_IP_);
  994. /*
  995. * Metadata write that didn't get logged but
  996. * written delayed anyway. These aren't associated
  997. * with a transaction, and can be ignored.
  998. */
  999. if (!bp->b_iodone && !XFS_BUF_ISREAD(bp))
  1000. return xfs_bioerror_relse(bp);
  1001. else
  1002. return xfs_bioerror(bp);
  1003. }
  1004. xfs_buf_iorequest(bp);
  1005. return 0;
  1006. }
  1007. int
  1008. xfs_bwrite(
  1009. struct xfs_buf *bp)
  1010. {
  1011. int error;
  1012. ASSERT(xfs_buf_islocked(bp));
  1013. bp->b_flags |= XBF_WRITE;
  1014. bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q);
  1015. xfs_bdstrat_cb(bp);
  1016. error = xfs_buf_iowait(bp);
  1017. if (error) {
  1018. xfs_force_shutdown(bp->b_target->bt_mount,
  1019. SHUTDOWN_META_IO_ERROR);
  1020. }
  1021. return error;
  1022. }
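/*
 * Example sketch: a synchronous metadata write built on xfs_bwrite(). The
 * buffer must be locked on entry; xfs_bwrite() waits for the I/O and already
 * shuts the filesystem down on error, so the caller only releases the buffer
 * and propagates the error code. The helper name is hypothetical.
 */
static int
example_modify_and_write(
	struct xfs_buf	*bp)
{
	int		error;

	ASSERT(xfs_buf_islocked(bp));

	/* ... modify the contents via bp->b_addr or xfs_buf_iomove() ... */

	error = xfs_bwrite(bp);
	xfs_buf_relse(bp);
	return error;
}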
  1023. /*
  1024. * Wrapper around bdstrat so that we can stop data from going to disk in case
  1025. * we are shutting down the filesystem. Typically user data goes through this
  1026. * path; one of the exceptions is the superblock.
  1027. */
  1028. void
  1029. xfsbdstrat(
  1030. struct xfs_mount *mp,
  1031. struct xfs_buf *bp)
  1032. {
  1033. if (XFS_FORCED_SHUTDOWN(mp)) {
  1034. trace_xfs_bdstrat_shut(bp, _RET_IP_);
  1035. xfs_bioerror_relse(bp);
  1036. return;
  1037. }
  1038. xfs_buf_iorequest(bp);
  1039. }
  1040. STATIC void
  1041. _xfs_buf_ioend(
  1042. xfs_buf_t *bp,
  1043. int schedule)
  1044. {
  1045. if (atomic_dec_and_test(&bp->b_io_remaining) == 1)
  1046. xfs_buf_ioend(bp, schedule);
  1047. }
  1048. STATIC void
  1049. xfs_buf_bio_end_io(
  1050. struct bio *bio,
  1051. int error)
  1052. {
  1053. xfs_buf_t *bp = (xfs_buf_t *)bio->bi_private;
  1054. /*
  1055. * don't overwrite existing errors - otherwise we can lose errors on
  1056. * buffers that require multiple bios to complete.
  1057. */
  1058. if (!bp->b_error)
  1059. xfs_buf_ioerror(bp, -error);
  1060. if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
  1061. invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));
  1062. _xfs_buf_ioend(bp, 1);
  1063. bio_put(bio);
  1064. }
  1065. static void
  1066. xfs_buf_ioapply_map(
  1067. struct xfs_buf *bp,
  1068. int map,
  1069. int *buf_offset,
  1070. int *count,
  1071. int rw)
  1072. {
  1073. int page_index;
  1074. int total_nr_pages = bp->b_page_count;
  1075. int nr_pages;
  1076. struct bio *bio;
  1077. sector_t sector = bp->b_maps[map].bm_bn;
  1078. int size;
  1079. int offset;
  1080. total_nr_pages = bp->b_page_count;
  1081. /* skip the pages in the buffer before the start offset */
  1082. page_index = 0;
  1083. offset = *buf_offset;
  1084. while (offset >= PAGE_SIZE) {
  1085. page_index++;
  1086. offset -= PAGE_SIZE;
  1087. }
  1088. /*
  1089. * Limit the IO size to the length of the current vector, and update the
  1090. * remaining IO count for the next time around.
  1091. */
  1092. size = min_t(int, BBTOB(bp->b_maps[map].bm_len), *count);
  1093. *count -= size;
  1094. *buf_offset += size;
  1095. next_chunk:
  1096. atomic_inc(&bp->b_io_remaining);
  1097. nr_pages = BIO_MAX_SECTORS >> (PAGE_SHIFT - BBSHIFT);
  1098. if (nr_pages > total_nr_pages)
  1099. nr_pages = total_nr_pages;
  1100. bio = bio_alloc(GFP_NOIO, nr_pages);
  1101. bio->bi_bdev = bp->b_target->bt_bdev;
  1102. bio->bi_sector = sector;
  1103. bio->bi_end_io = xfs_buf_bio_end_io;
  1104. bio->bi_private = bp;
  1105. for (; size && nr_pages; nr_pages--, page_index++) {
  1106. int rbytes, nbytes = PAGE_SIZE - offset;
  1107. if (nbytes > size)
  1108. nbytes = size;
  1109. rbytes = bio_add_page(bio, bp->b_pages[page_index], nbytes,
  1110. offset);
  1111. if (rbytes < nbytes)
  1112. break;
  1113. offset = 0;
  1114. sector += BTOBB(nbytes);
  1115. size -= nbytes;
  1116. total_nr_pages--;
  1117. }
  1118. if (likely(bio->bi_size)) {
  1119. if (xfs_buf_is_vmapped(bp)) {
  1120. flush_kernel_vmap_range(bp->b_addr,
  1121. xfs_buf_vmap_len(bp));
  1122. }
  1123. submit_bio(rw, bio);
  1124. if (size)
  1125. goto next_chunk;
  1126. } else {
  1127. /*
  1128. * This is guaranteed not to be the last io reference count
  1129. * because the caller (xfs_buf_iorequest) holds a count itself.
  1130. */
  1131. atomic_dec(&bp->b_io_remaining);
  1132. xfs_buf_ioerror(bp, EIO);
  1133. bio_put(bio);
  1134. }
  1135. }
  1136. STATIC void
  1137. _xfs_buf_ioapply(
  1138. struct xfs_buf *bp)
  1139. {
  1140. struct blk_plug plug;
  1141. int rw;
  1142. int offset;
  1143. int size;
  1144. int i;
  1145. /*
  1146. * Make sure we capture only current IO errors rather than stale errors
  1147. * left over from previous use of the buffer (e.g. failed readahead).
  1148. */
  1149. bp->b_error = 0;
  1150. if (bp->b_flags & XBF_WRITE) {
  1151. if (bp->b_flags & XBF_SYNCIO)
  1152. rw = WRITE_SYNC;
  1153. else
  1154. rw = WRITE;
  1155. if (bp->b_flags & XBF_FUA)
  1156. rw |= REQ_FUA;
  1157. if (bp->b_flags & XBF_FLUSH)
  1158. rw |= REQ_FLUSH;
  1159. /*
  1160. * Run the write verifier callback function if it exists. If
  1161. * this function fails it will mark the buffer with an error and
  1162. * the IO should not be dispatched.
  1163. */
  1164. if (bp->b_ops) {
  1165. bp->b_ops->verify_write(bp);
  1166. if (bp->b_error) {
  1167. xfs_force_shutdown(bp->b_target->bt_mount,
  1168. SHUTDOWN_CORRUPT_INCORE);
  1169. return;
  1170. }
  1171. }
  1172. } else if (bp->b_flags & XBF_READ_AHEAD) {
  1173. rw = READA;
  1174. } else {
  1175. rw = READ;
  1176. }
  1177. /* we only use the buffer cache for meta-data */
  1178. rw |= REQ_META;
  1179. /*
  1180. * Walk all the vectors issuing IO on them. Set up the initial offset
  1181. * into the buffer and the desired IO size before we start -
  1182. * _xfs_buf_ioapply_vec() will modify them appropriately for each
  1183. * subsequent call.
  1184. */
  1185. offset = bp->b_offset;
  1186. size = BBTOB(bp->b_io_length);
  1187. blk_start_plug(&plug);
  1188. for (i = 0; i < bp->b_map_count; i++) {
  1189. xfs_buf_ioapply_map(bp, i, &offset, &size, rw);
  1190. if (bp->b_error)
  1191. break;
  1192. if (size <= 0)
  1193. break; /* all done */
  1194. }
  1195. blk_finish_plug(&plug);
  1196. }
  1197. void
  1198. xfs_buf_iorequest(
  1199. xfs_buf_t *bp)
  1200. {
  1201. trace_xfs_buf_iorequest(bp, _RET_IP_);
  1202. ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
  1203. if (bp->b_flags & XBF_WRITE)
  1204. xfs_buf_wait_unpin(bp);
  1205. xfs_buf_hold(bp);
  1206. /* Set the count to 1 initially; this will stop an I/O
  1207. * completion callout which happens before we have started
  1208. * all the I/O from calling xfs_buf_ioend too early.
  1209. */
  1210. atomic_set(&bp->b_io_remaining, 1);
  1211. _xfs_buf_ioapply(bp);
  1212. _xfs_buf_ioend(bp, 1);
  1213. xfs_buf_rele(bp);
  1214. }
  1215. /*
  1216. * Waits for I/O to complete on the buffer supplied. It returns immediately if
  1217. * no I/O is pending or there is already a pending error on the buffer. It
  1218. * returns the I/O error code, if any, or 0 if there was no error.
  1219. */
  1220. int
  1221. xfs_buf_iowait(
  1222. xfs_buf_t *bp)
  1223. {
  1224. trace_xfs_buf_iowait(bp, _RET_IP_);
  1225. if (!bp->b_error)
  1226. wait_for_completion(&bp->b_iowait);
  1227. trace_xfs_buf_iowait_done(bp, _RET_IP_);
  1228. return bp->b_error;
  1229. }
  1230. xfs_caddr_t
  1231. xfs_buf_offset(
  1232. xfs_buf_t *bp,
  1233. size_t offset)
  1234. {
  1235. struct page *page;
  1236. if (bp->b_addr)
  1237. return bp->b_addr + offset;
  1238. offset += bp->b_offset;
  1239. page = bp->b_pages[offset >> PAGE_SHIFT];
  1240. return (xfs_caddr_t)page_address(page) + (offset & (PAGE_SIZE-1));
  1241. }
  1242. /*
  1243. * Move data into or out of a buffer.
  1244. */
  1245. void
  1246. xfs_buf_iomove(
  1247. xfs_buf_t *bp, /* buffer to process */
  1248. size_t boff, /* starting buffer offset */
  1249. size_t bsize, /* length to copy */
  1250. void *data, /* data address */
  1251. xfs_buf_rw_t mode) /* read/write/zero flag */
  1252. {
  1253. size_t bend;
  1254. bend = boff + bsize;
  1255. while (boff < bend) {
  1256. struct page *page;
  1257. int page_index, page_offset, csize;
  1258. page_index = (boff + bp->b_offset) >> PAGE_SHIFT;
  1259. page_offset = (boff + bp->b_offset) & ~PAGE_MASK;
  1260. page = bp->b_pages[page_index];
  1261. csize = min_t(size_t, PAGE_SIZE - page_offset,
  1262. BBTOB(bp->b_io_length) - boff);
  1263. ASSERT((csize + page_offset) <= PAGE_SIZE);
  1264. switch (mode) {
  1265. case XBRW_ZERO:
  1266. memset(page_address(page) + page_offset, 0, csize);
  1267. break;
  1268. case XBRW_READ:
  1269. memcpy(data, page_address(page) + page_offset, csize);
  1270. break;
  1271. case XBRW_WRITE:
  1272. memcpy(page_address(page) + page_offset, data, csize);
  1273. }
  1274. boff += csize;
  1275. data += csize;
  1276. }
  1277. }
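/*
 * Example sketch: xfs_buf_iomove() hides the page-by-page layout of a large
 * buffer. Here a caller copies a (hypothetical) record of 'len' bytes in at
 * byte offset 'boff' and zeroes the rest of the I/O range.
 */
static void
example_fill_buffer(
	struct xfs_buf	*bp,
	size_t		boff,
	void		*record,
	size_t		len)
{
	xfs_buf_iomove(bp, boff, len, record, XBRW_WRITE);
	xfs_buf_iomove(bp, boff + len,
		       BBTOB(bp->b_io_length) - (boff + len), NULL, XBRW_ZERO);
}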
  1278. /*
  1279. * Handling of buffer targets (buftargs).
  1280. */
  1281. /*
  1282. * Wait for any bufs with callbacks that have been submitted but have not yet
  1283. * returned. These buffers will have an elevated hold count, so wait on those
  1284. * while freeing all the buffers only held by the LRU.
  1285. */
  1286. static enum lru_status
  1287. xfs_buftarg_wait_rele(
  1288. struct list_head *item,
  1289. spinlock_t *lru_lock,
  1290. void *arg)
  1291. {
  1292. struct xfs_buf *bp = container_of(item, struct xfs_buf, b_lru);
  1293. if (atomic_read(&bp->b_hold) > 1) {
  1294. /* need to wait */
  1295. trace_xfs_buf_wait_buftarg(bp, _RET_IP_);
  1296. spin_unlock(lru_lock);
  1297. delay(100);
  1298. } else {
  1299. /*
  1300. * clear the LRU reference count so the buffer doesn't get
  1301. * ignored in xfs_buf_rele().
  1302. */
  1303. atomic_set(&bp->b_lru_ref, 0);
  1304. spin_unlock(lru_lock);
  1305. xfs_buf_rele(bp);
  1306. }
  1307. spin_lock(lru_lock);
  1308. return LRU_RETRY;
  1309. }
  1310. void
  1311. xfs_wait_buftarg(
  1312. struct xfs_buftarg *btp)
  1313. {
  1314. while (list_lru_count(&btp->bt_lru))
  1315. list_lru_walk(&btp->bt_lru, xfs_buftarg_wait_rele,
  1316. NULL, LONG_MAX);
  1317. }
  1318. static enum lru_status
  1319. xfs_buftarg_isolate(
  1320. struct list_head *item,
  1321. spinlock_t *lru_lock,
  1322. void *arg)
  1323. {
  1324. struct xfs_buf *bp = container_of(item, struct xfs_buf, b_lru);
  1325. struct list_head *dispose = arg;
  1326. /*
  1327. * Decrement the b_lru_ref count unless the value is already
  1328. * zero. If the value is already zero, we need to reclaim the
  1329. * buffer, otherwise it gets another trip through the LRU.
  1330. */
  1331. if (!atomic_add_unless(&bp->b_lru_ref, -1, 0))
  1332. return LRU_ROTATE;
  1333. bp->b_lru_flags |= _XBF_LRU_DISPOSE;
  1334. list_move(item, dispose);
  1335. return LRU_REMOVED;
  1336. }
  1337. static long
  1338. xfs_buftarg_shrink_scan(
  1339. struct shrinker *shrink,
  1340. struct shrink_control *sc)
  1341. {
  1342. struct xfs_buftarg *btp = container_of(shrink,
  1343. struct xfs_buftarg, bt_shrinker);
  1344. LIST_HEAD(dispose);
  1345. long freed;
  1346. unsigned long nr_to_scan = sc->nr_to_scan;
  1347. freed = list_lru_walk_node(&btp->bt_lru, sc->nid, xfs_buftarg_isolate,
  1348. &dispose, &nr_to_scan);
  1349. while (!list_empty(&dispose)) {
  1350. struct xfs_buf *bp;
  1351. bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
  1352. list_del_init(&bp->b_lru);
  1353. xfs_buf_rele(bp);
  1354. }
  1355. return freed;
  1356. }
  1357. static long
  1358. xfs_buftarg_shrink_count(
  1359. struct shrinker *shrink,
  1360. struct shrink_control *sc)
  1361. {
  1362. struct xfs_buftarg *btp = container_of(shrink,
  1363. struct xfs_buftarg, bt_shrinker);
  1364. return list_lru_count_node(&btp->bt_lru, sc->nid);
  1365. }
  1366. void
  1367. xfs_free_buftarg(
  1368. struct xfs_mount *mp,
  1369. struct xfs_buftarg *btp)
  1370. {
  1371. unregister_shrinker(&btp->bt_shrinker);
  1372. if (mp->m_flags & XFS_MOUNT_BARRIER)
  1373. xfs_blkdev_issue_flush(btp);
  1374. kmem_free(btp);
  1375. }
  1376. STATIC int
  1377. xfs_setsize_buftarg_flags(
  1378. xfs_buftarg_t *btp,
  1379. unsigned int blocksize,
  1380. unsigned int sectorsize,
  1381. int verbose)
  1382. {
  1383. btp->bt_bsize = blocksize;
  1384. btp->bt_sshift = ffs(sectorsize) - 1;
  1385. btp->bt_smask = sectorsize - 1;
  1386. if (set_blocksize(btp->bt_bdev, sectorsize)) {
  1387. char name[BDEVNAME_SIZE];
  1388. bdevname(btp->bt_bdev, name);
  1389. xfs_warn(btp->bt_mount,
  1390. "Cannot set_blocksize to %u on device %s\n",
  1391. sectorsize, name);
  1392. return EINVAL;
  1393. }
  1394. return 0;
  1395. }
  1396. /*
  1397. * When allocating the initial buffer target we have not yet
  1398. * read in the superblock, so we don't know what size sectors
  1399. * are being used at this early stage. Play safe.
  1400. */
  1401. STATIC int
  1402. xfs_setsize_buftarg_early(
  1403. xfs_buftarg_t *btp,
  1404. struct block_device *bdev)
  1405. {
  1406. return xfs_setsize_buftarg_flags(btp,
  1407. PAGE_SIZE, bdev_logical_block_size(bdev), 0);
  1408. }
  1409. int
  1410. xfs_setsize_buftarg(
  1411. xfs_buftarg_t *btp,
  1412. unsigned int blocksize,
  1413. unsigned int sectorsize)
  1414. {
  1415. return xfs_setsize_buftarg_flags(btp, blocksize, sectorsize, 1);
  1416. }
  1417. xfs_buftarg_t *
  1418. xfs_alloc_buftarg(
  1419. struct xfs_mount *mp,
  1420. struct block_device *bdev,
  1421. int external,
  1422. const char *fsname)
  1423. {
  1424. xfs_buftarg_t *btp;
  1425. btp = kmem_zalloc(sizeof(*btp), KM_SLEEP | KM_NOFS);
  1426. btp->bt_mount = mp;
  1427. btp->bt_dev = bdev->bd_dev;
  1428. btp->bt_bdev = bdev;
  1429. btp->bt_bdi = blk_get_backing_dev_info(bdev);
  1430. if (!btp->bt_bdi)
  1431. goto error;
  1432. list_lru_init(&btp->bt_lru);
  1433. if (xfs_setsize_buftarg_early(btp, bdev))
  1434. goto error;
  1435. btp->bt_shrinker.count_objects = xfs_buftarg_shrink_count;
  1436. btp->bt_shrinker.scan_objects = xfs_buftarg_shrink_scan;
  1437. btp->bt_shrinker.seeks = DEFAULT_SEEKS;
  1438. btp->bt_shrinker.flags = SHRINKER_NUMA_AWARE;
  1439. register_shrinker(&btp->bt_shrinker);
  1440. return btp;
  1441. error:
  1442. kmem_free(btp);
  1443. return NULL;
  1444. }
  1445. /*
  1446. * Add a buffer to the delayed write list.
  1447. *
  1448. * This queues a buffer for writeout if it hasn't already been queued. Note that
  1449. * neither this routine nor the buffer list submission functions perform
  1450. * any internal synchronization. It is expected that the lists are thread-local
  1451. * to the callers.
  1452. *
  1453. * Returns true if we queued up the buffer, or false if it already had
  1454. * been on the buffer list.
  1455. */
  1456. bool
  1457. xfs_buf_delwri_queue(
  1458. struct xfs_buf *bp,
  1459. struct list_head *list)
  1460. {
  1461. ASSERT(xfs_buf_islocked(bp));
  1462. ASSERT(!(bp->b_flags & XBF_READ));
  1463. /*
  1464. * If the buffer is already marked delwri it already is queued up
  1465. * by someone else for immediate writeout. Just ignore it in that
  1466. * case.
  1467. */
  1468. if (bp->b_flags & _XBF_DELWRI_Q) {
  1469. trace_xfs_buf_delwri_queued(bp, _RET_IP_);
  1470. return false;
  1471. }
  1472. trace_xfs_buf_delwri_queue(bp, _RET_IP_);
  1473. /*
  1474. * If a buffer gets written out synchronously or marked stale while it
  1475. * is on a delwri list we lazily remove it. To do this, the other party
  1476. * clears the _XBF_DELWRI_Q flag but otherwise leaves the buffer alone.
  1477. * It remains referenced and on the list. In a rare corner case it
  1478. * might get re-added to a delwri list after the synchronous writeout, in
  1479. * which case we just need to re-add the flag here.
  1480. */
  1481. bp->b_flags |= _XBF_DELWRI_Q;
  1482. if (list_empty(&bp->b_list)) {
  1483. atomic_inc(&bp->b_hold);
  1484. list_add_tail(&bp->b_list, list);
  1485. }
  1486. return true;
  1487. }
  1488. /*
  1489. * The compare function is more complex than it needs to be because
  1490. * the return value is only 32 bits and we are doing comparisons
  1491. * on 64 bit values.
  1492. */
  1493. static int
  1494. xfs_buf_cmp(
  1495. void *priv,
  1496. struct list_head *a,
  1497. struct list_head *b)
  1498. {
  1499. struct xfs_buf *ap = container_of(a, struct xfs_buf, b_list);
  1500. struct xfs_buf *bp = container_of(b, struct xfs_buf, b_list);
  1501. xfs_daddr_t diff;
  1502. diff = ap->b_maps[0].bm_bn - bp->b_maps[0].bm_bn;
  1503. if (diff < 0)
  1504. return -1;
  1505. if (diff > 0)
  1506. return 1;
  1507. return 0;
  1508. }
  1509. static int
  1510. __xfs_buf_delwri_submit(
  1511. struct list_head *buffer_list,
  1512. struct list_head *io_list,
  1513. bool wait)
  1514. {
  1515. struct blk_plug plug;
  1516. struct xfs_buf *bp, *n;
  1517. int pinned = 0;
  1518. list_for_each_entry_safe(bp, n, buffer_list, b_list) {
  1519. if (!wait) {
  1520. if (xfs_buf_ispinned(bp)) {
  1521. pinned++;
  1522. continue;
  1523. }
  1524. if (!xfs_buf_trylock(bp))
  1525. continue;
  1526. } else {
  1527. xfs_buf_lock(bp);
  1528. }
  1529. /*
  1530. * Someone else might have written the buffer synchronously or
  1531. * marked it stale in the meantime. In that case only the
  1532. * _XBF_DELWRI_Q flag got cleared, and we have to drop the
  1533. * reference and remove it from the list here.
  1534. */
  1535. if (!(bp->b_flags & _XBF_DELWRI_Q)) {
  1536. list_del_init(&bp->b_list);
  1537. xfs_buf_relse(bp);
  1538. continue;
  1539. }
  1540. list_move_tail(&bp->b_list, io_list);
  1541. trace_xfs_buf_delwri_split(bp, _RET_IP_);
  1542. }
  1543. list_sort(NULL, io_list, xfs_buf_cmp);
  1544. blk_start_plug(&plug);
  1545. list_for_each_entry_safe(bp, n, io_list, b_list) {
  1546. bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC);
  1547. bp->b_flags |= XBF_WRITE;
  1548. if (!wait) {
  1549. bp->b_flags |= XBF_ASYNC;
  1550. list_del_init(&bp->b_list);
  1551. }
  1552. xfs_bdstrat_cb(bp);
  1553. }
  1554. blk_finish_plug(&plug);
  1555. return pinned;
  1556. }
  1557. /*
  1558. * Write out a buffer list asynchronously.
  1559. *
  1560. * This will take the @buffer_list, write all non-locked and non-pinned buffers
  1561. * out and not wait for I/O completion on any of the buffers. This interface
  1562. * is only safely usable for callers that can track I/O completion by higher
  1563. * level means, e.g. AIL pushing as the @buffer_list is consumed in this
  1564. * function.
  1565. */
  1566. int
  1567. xfs_buf_delwri_submit_nowait(
  1568. struct list_head *buffer_list)
  1569. {
  1570. LIST_HEAD (io_list);
  1571. return __xfs_buf_delwri_submit(buffer_list, &io_list, false);
  1572. }
  1573. /*
  1574. * Write out a buffer list synchronously.
  1575. *
  1576. * This will take the @buffer_list, write all buffers out and wait for I/O
  1577. * completion on all of the buffers. @buffer_list is consumed by the function,
  1578. * so callers must have some other way of tracking buffers if they require such
  1579. * functionality.
  1580. */
  1581. int
  1582. xfs_buf_delwri_submit(
  1583. struct list_head *buffer_list)
  1584. {
  1585. LIST_HEAD (io_list);
  1586. int error = 0, error2;
  1587. struct xfs_buf *bp;
  1588. __xfs_buf_delwri_submit(buffer_list, &io_list, true);
  1589. /* Wait for IO to complete. */
  1590. while (!list_empty(&io_list)) {
  1591. bp = list_first_entry(&io_list, struct xfs_buf, b_list);
  1592. list_del_init(&bp->b_list);
  1593. error2 = xfs_buf_iowait(bp);
  1594. xfs_buf_relse(bp);
  1595. if (!error)
  1596. error = error2;
  1597. }
  1598. return error;
  1599. }
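/*
 * Example sketch: a delayed write list is local to the caller. Buffers are
 * queued while locked (the queue takes its own hold), released, and the whole
 * list is then flushed in one pass; xfs_buf_delwri_submit() sorts the list by
 * disk address and waits for completion. The bufs[]/nbufs parameters are
 * illustrative.
 */
static int
example_flush_buffers(
	struct xfs_buf	**bufs,
	int		nbufs)
{
	LIST_HEAD	(buffer_list);
	int		i;

	for (i = 0; i < nbufs; i++) {
		xfs_buf_lock(bufs[i]);
		xfs_buf_delwri_queue(bufs[i], &buffer_list);
		xfs_buf_relse(bufs[i]);	/* the list holds its own reference */
	}
	return xfs_buf_delwri_submit(&buffer_list);
}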
  1600. int __init
  1601. xfs_buf_init(void)
  1602. {
  1603. xfs_buf_zone = kmem_zone_init_flags(sizeof(xfs_buf_t), "xfs_buf",
  1604. KM_ZONE_HWALIGN, NULL);
  1605. if (!xfs_buf_zone)
  1606. goto out;
  1607. xfslogd_workqueue = alloc_workqueue("xfslogd",
  1608. WQ_MEM_RECLAIM | WQ_HIGHPRI, 1);
  1609. if (!xfslogd_workqueue)
  1610. goto out_free_buf_zone;
  1611. return 0;
  1612. out_free_buf_zone:
  1613. kmem_zone_destroy(xfs_buf_zone);
  1614. out:
  1615. return -ENOMEM;
  1616. }
  1617. void
  1618. xfs_buf_terminate(void)
  1619. {
  1620. destroy_workqueue(xfslogd_workqueue);
  1621. kmem_zone_destroy(xfs_buf_zone);
  1622. }