/*
 *  linux/fs/ext3/balloc.c
 *
 * Copyright (C) 1992, 1993, 1994, 1995
 * Remy Card (card@masi.ibp.fr)
 * Laboratoire MASI - Institut Blaise Pascal
 * Universite Pierre et Marie Curie (Paris VI)
 *
 * Enhanced block allocation by Stephen Tweedie (sct@redhat.com), 1993
 * Big-endian to little-endian byte-swapping/bitmaps by
 *        David S. Miller (davem@caip.rutgers.edu), 1995
 */

#include <linux/config.h>
#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd.h>
#include <linux/ext3_fs.h>
#include <linux/ext3_jbd.h>
#include <linux/quotaops.h>
#include <linux/buffer_head.h>
/*
 * balloc.c contains the block allocation and deallocation routines.
 */

/*
 * The free blocks are managed by bitmaps.  A file system contains several
 * block groups.  Each group contains 1 bitmap block for blocks, 1 bitmap
 * block for inodes, N blocks for the inode table and data blocks.
 *
 * The file system contains group descriptors which are located after the
 * super block.  Each descriptor contains the number of the bitmap block and
 * the free blocks count in the group.  The descriptors are loaded in memory
 * when a file system is mounted (see ext3_read_super).
 */
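
/*
 * Worked example (illustrative numbers, not taken from this file): with a
 * 4 KiB block size, a group descriptor is 32 bytes, so
 * EXT3_DESC_PER_BLOCK = 4096 / 32 = 128.  The descriptor for block group
 * 300 then lives in descriptor block 300 / 128 = 2, at offset
 * 300 % 128 = 44 -- exactly the shift/mask split that
 * ext3_get_group_desc() below computes.
 */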
#define in_range(b, first, len)	((b) >= (first) && (b) <= (first) + (len) - 1)

struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb,
					     unsigned int block_group,
					     struct buffer_head ** bh)
{
	unsigned long group_desc;
	unsigned long offset;
	struct ext3_group_desc * desc;
	struct ext3_sb_info *sbi = EXT3_SB(sb);

	if (block_group >= sbi->s_groups_count) {
		ext3_error (sb, "ext3_get_group_desc",
			    "block_group >= groups_count - "
			    "block_group = %d, groups_count = %lu",
			    block_group, sbi->s_groups_count);
		return NULL;
	}
	smp_rmb();

	group_desc = block_group >> EXT3_DESC_PER_BLOCK_BITS(sb);
	offset = block_group & (EXT3_DESC_PER_BLOCK(sb) - 1);
	if (!sbi->s_group_desc[group_desc]) {
		ext3_error (sb, "ext3_get_group_desc",
			    "Group descriptor not loaded - "
			    "block_group = %d, group_desc = %lu, desc = %lu",
			    block_group, group_desc, offset);
		return NULL;
	}

	desc = (struct ext3_group_desc *) sbi->s_group_desc[group_desc]->b_data;
	if (bh)
		*bh = sbi->s_group_desc[group_desc];
	return desc + offset;
}
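
/*
 * Typical use (sketch; the caller shown here is hypothetical): look up the
 * descriptor and, optionally, the buffer_head that backs it so the caller
 * can journal modifications to the descriptor block.
 *
 *	struct buffer_head *gd_bh;
 *	struct ext3_group_desc *desc;
 *
 *	desc = ext3_get_group_desc(sb, group_no, &gd_bh);
 *	if (!desc)
 *		return -EIO;	// bad group number or table not loaded
 *
 * Callers that only read fields may pass NULL for the buffer_head, as
 * read_block_bitmap() below does.
 */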
/*
 * Read the bitmap for a given block_group, reading into the specified
 * slot in the superblock's bitmap cache.
 *
 * Return buffer_head on success or NULL in case of failure.
 */
static struct buffer_head *
read_block_bitmap(struct super_block *sb, unsigned int block_group)
{
	struct ext3_group_desc * desc;
	struct buffer_head * bh = NULL;

	desc = ext3_get_group_desc (sb, block_group, NULL);
	if (!desc)
		goto error_out;
	bh = sb_bread(sb, le32_to_cpu(desc->bg_block_bitmap));
	if (!bh)
		ext3_error (sb, "read_block_bitmap",
			    "Cannot read block bitmap - "
			    "block_group = %d, block_bitmap = %u",
			    block_group, le32_to_cpu(desc->bg_block_bitmap));
error_out:
	return bh;
}
/*
 * The reservation window structure operations
 * --------------------------------------------
 * Operations include:
 * dump, find, add, remove, is_empty, find_next_reservable_window, etc.
 *
 * We use a sorted red-black tree for the per-filesystem reservation window
 * list, keyed by window start block.
 *
 * Keeping these small operations in helper functions means that a different
 * search structure could be swapped in later without changing much other
 * code.
 */
#if 0
static void __rsv_window_dump(struct rb_root *root, int verbose,
			      const char *fn)
{
	struct rb_node *n;
	struct ext3_reserve_window_node *rsv, *prev;
	int bad;

restart:
	n = rb_first(root);
	bad = 0;
	prev = NULL;

	printk("Block Allocation Reservation Windows Map (%s):\n", fn);
	while (n) {
		rsv = rb_entry(n, struct ext3_reserve_window_node, rsv_node);
		if (verbose)
			printk("reservation window 0x%p "
			       "start: %d, end: %d\n",
			       rsv, rsv->rsv_start, rsv->rsv_end);
		if (rsv->rsv_start && rsv->rsv_start >= rsv->rsv_end) {
			printk("Bad reservation %p (start >= end)\n",
			       rsv);
			bad = 1;
		}
		if (prev && prev->rsv_end >= rsv->rsv_start) {
			printk("Bad reservation %p (prev->end >= start)\n",
			       rsv);
			bad = 1;
		}
		if (bad) {
			if (!verbose) {
				printk("Restarting reservation walk in verbose mode\n");
				verbose = 1;
				goto restart;
			}
		}
		n = rb_next(n);
		prev = rsv;
	}
	printk("Window map complete.\n");
	if (bad)
		BUG();
}
#define rsv_window_dump(root, verbose) \
	__rsv_window_dump((root), (verbose), __FUNCTION__)
#else
#define rsv_window_dump(root, verbose) do {} while (0)
#endif
static int
goal_in_my_reservation(struct ext3_reserve_window *rsv, int goal,
			unsigned int group, struct super_block * sb)
{
	unsigned long group_first_block, group_last_block;

	group_first_block = le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block) +
				group * EXT3_BLOCKS_PER_GROUP(sb);
	group_last_block = group_first_block + EXT3_BLOCKS_PER_GROUP(sb) - 1;

	if ((rsv->_rsv_start > group_last_block) ||
	    (rsv->_rsv_end < group_first_block))
		return 0;
	if ((goal >= 0) && ((goal + group_first_block < rsv->_rsv_start)
		|| (goal + group_first_block > rsv->_rsv_end)))
		return 0;
	return 1;
}
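
/*
 * Worked example (illustrative numbers): on a 1 KiB-block filesystem,
 * s_first_data_block is 1 and EXT3_BLOCKS_PER_GROUP is 8192, so group 5
 * spans absolute blocks 1 + 5*8192 = 40961 through 49152.  A
 * group-relative goal of 100 then corresponds to absolute block 41061,
 * and the window qualifies only if that block falls inside
 * [_rsv_start, _rsv_end].
 */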
/*
 * Find the reserved window which includes the goal, or the previous one
 * if the goal is not in any window.
 * Returns NULL if there are no windows or if all windows start after the goal.
 */
static struct ext3_reserve_window_node *
search_reserve_window(struct rb_root *root, unsigned long goal)
{
	struct rb_node *n = root->rb_node;
	struct ext3_reserve_window_node *rsv;

	if (!n)
		return NULL;

	do {
		rsv = rb_entry(n, struct ext3_reserve_window_node, rsv_node);

		if (goal < rsv->rsv_start)
			n = n->rb_left;
		else if (goal > rsv->rsv_end)
			n = n->rb_right;
		else
			return rsv;
	} while (n);
	/*
	 * We've fallen off the end of the tree: the goal wasn't inside
	 * any particular node.  OK, the previous node must be to one
	 * side of the interval containing the goal.  If it's the RHS,
	 * we need to back up one.
	 */
	if (rsv->rsv_start > goal) {
		n = rb_prev(&rsv->rsv_node);
		rsv = rb_entry(n, struct ext3_reserve_window_node, rsv_node);
	}
	return rsv;
}
void ext3_rsv_window_add(struct super_block *sb,
		    struct ext3_reserve_window_node *rsv)
{
	struct rb_root *root = &EXT3_SB(sb)->s_rsv_window_root;
	struct rb_node *node = &rsv->rsv_node;
	unsigned int start = rsv->rsv_start;

	struct rb_node ** p = &root->rb_node;
	struct rb_node * parent = NULL;
	struct ext3_reserve_window_node *this;

	while (*p)
	{
		parent = *p;
		this = rb_entry(parent, struct ext3_reserve_window_node, rsv_node);

		if (start < this->rsv_start)
			p = &(*p)->rb_left;
		else if (start > this->rsv_end)
			p = &(*p)->rb_right;
		else
			BUG();
	}

	rb_link_node(node, parent, p);
	rb_insert_color(node, root);
}

static void rsv_window_remove(struct super_block *sb,
			      struct ext3_reserve_window_node *rsv)
{
	rsv->rsv_start = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
	rsv->rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
	rsv->rsv_alloc_hit = 0;
	rb_erase(&rsv->rsv_node, &EXT3_SB(sb)->s_rsv_window_root);
}

static inline int rsv_is_empty(struct ext3_reserve_window *rsv)
{
	/* a valid reservation end block cannot be 0 */
	return (rsv->_rsv_end == EXT3_RESERVE_WINDOW_NOT_ALLOCATED);
}
void ext3_init_block_alloc_info(struct inode *inode)
{
	struct ext3_inode_info *ei = EXT3_I(inode);
	struct ext3_block_alloc_info *block_i = ei->i_block_alloc_info;
	struct super_block *sb = inode->i_sb;

	block_i = kmalloc(sizeof(*block_i), GFP_NOFS);
	if (block_i) {
		struct ext3_reserve_window_node *rsv = &block_i->rsv_window_node;

		rsv->rsv_start = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
		rsv->rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;

		/*
		 * if filesystem is mounted with NORESERVATION, the goal
		 * reservation window size is set to zero to indicate
		 * block reservation is off
		 */
		if (!test_opt(sb, RESERVATION))
			rsv->rsv_goal_size = 0;
		else
			rsv->rsv_goal_size = EXT3_DEFAULT_RESERVE_BLOCKS;
		rsv->rsv_alloc_hit = 0;
		block_i->last_alloc_logical_block = 0;
		block_i->last_alloc_physical_block = 0;
	}
	ei->i_block_alloc_info = block_i;
}
void ext3_discard_reservation(struct inode *inode)
{
	struct ext3_inode_info *ei = EXT3_I(inode);
	struct ext3_block_alloc_info *block_i = ei->i_block_alloc_info;
	struct ext3_reserve_window_node *rsv;
	spinlock_t *rsv_lock = &EXT3_SB(inode->i_sb)->s_rsv_window_lock;

	if (!block_i)
		return;

	rsv = &block_i->rsv_window_node;
	if (!rsv_is_empty(&rsv->rsv_window)) {
		spin_lock(rsv_lock);
		if (!rsv_is_empty(&rsv->rsv_window))
			rsv_window_remove(inode->i_sb, rsv);
		spin_unlock(rsv_lock);
	}
}
/* Free given blocks, update quota and i_blocks field */
void ext3_free_blocks_sb(handle_t *handle, struct super_block *sb,
			 unsigned long block, unsigned long count,
			 int *pdquot_freed_blocks)
{
	struct buffer_head *bitmap_bh = NULL;
	struct buffer_head *gd_bh;
	unsigned long block_group;
	unsigned long bit;
	unsigned long i;
	unsigned long overflow;
	struct ext3_group_desc * desc;
	struct ext3_super_block * es;
	struct ext3_sb_info *sbi;
	int err = 0, ret;
	unsigned group_freed;

	*pdquot_freed_blocks = 0;
	sbi = EXT3_SB(sb);
	es = sbi->s_es;
	if (block < le32_to_cpu(es->s_first_data_block) ||
	    block + count < block ||
	    block + count > le32_to_cpu(es->s_blocks_count)) {
		ext3_error (sb, "ext3_free_blocks",
			    "Freeing blocks not in datazone - "
			    "block = %lu, count = %lu", block, count);
		goto error_return;
	}

	ext3_debug ("freeing block(s) %lu-%lu\n", block, block + count - 1);

do_more:
	overflow = 0;
	block_group = (block - le32_to_cpu(es->s_first_data_block)) /
		      EXT3_BLOCKS_PER_GROUP(sb);
	bit = (block - le32_to_cpu(es->s_first_data_block)) %
		      EXT3_BLOCKS_PER_GROUP(sb);
	/*
	 * Check to see if we are freeing blocks across a group
	 * boundary.
	 */
	if (bit + count > EXT3_BLOCKS_PER_GROUP(sb)) {
		overflow = bit + count - EXT3_BLOCKS_PER_GROUP(sb);
		count -= overflow;
	}
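	/*
	 * Worked example of the overflow split (illustrative numbers): with
	 * 1 KiB blocks, s_first_data_block = 1 and EXT3_BLOCKS_PER_GROUP =
	 * 8192.  Freeing block = 8190, count = 10 gives bit = 8189 and
	 * bit + count = 8199 > 8192, so overflow = 7 and count is trimmed
	 * to 3: this pass frees blocks 8190-8192 in group 0, then the
	 * do_more loop restarts at block 8193 to free the remaining 7
	 * blocks in group 1.
	 */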
	brelse(bitmap_bh);
	bitmap_bh = read_block_bitmap(sb, block_group);
	if (!bitmap_bh)
		goto error_return;
	desc = ext3_get_group_desc (sb, block_group, &gd_bh);
	if (!desc)
		goto error_return;

	if (in_range (le32_to_cpu(desc->bg_block_bitmap), block, count) ||
	    in_range (le32_to_cpu(desc->bg_inode_bitmap), block, count) ||
	    in_range (block, le32_to_cpu(desc->bg_inode_table),
		      sbi->s_itb_per_group) ||
	    in_range (block + count - 1, le32_to_cpu(desc->bg_inode_table),
		      sbi->s_itb_per_group))
		ext3_error (sb, "ext3_free_blocks",
			    "Freeing blocks in system zones - "
			    "Block = %lu, count = %lu",
			    block, count);

	/*
	 * We are about to start releasing blocks in the bitmap,
	 * so we need undo access.
	 */
	/* @@@ check errors */
	BUFFER_TRACE(bitmap_bh, "getting undo access");
	err = ext3_journal_get_undo_access(handle, bitmap_bh);
	if (err)
		goto error_return;

	/*
	 * We are about to modify some metadata.  Call the journal APIs
	 * to unshare ->b_data if a currently-committing transaction is
	 * using it
	 */
	BUFFER_TRACE(gd_bh, "get_write_access");
	err = ext3_journal_get_write_access(handle, gd_bh);
	if (err)
		goto error_return;

	jbd_lock_bh_state(bitmap_bh);

	for (i = 0, group_freed = 0; i < count; i++) {
		/*
		 * An HJ special.  This is expensive...
		 */
#ifdef CONFIG_JBD_DEBUG
		jbd_unlock_bh_state(bitmap_bh);
		{
			struct buffer_head *debug_bh;
			debug_bh = sb_find_get_block(sb, block + i);
			if (debug_bh) {
				BUFFER_TRACE(debug_bh, "Deleted!");
				if (!bh2jh(bitmap_bh)->b_committed_data)
					BUFFER_TRACE(debug_bh,
						"No committed data in bitmap");
				BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap");
				__brelse(debug_bh);
			}
		}
		jbd_lock_bh_state(bitmap_bh);
#endif
		if (need_resched()) {
			jbd_unlock_bh_state(bitmap_bh);
			cond_resched();
			jbd_lock_bh_state(bitmap_bh);
		}
		/* @@@ This prevents newly-allocated data from being
		 * freed and then reallocated within the same
		 * transaction.
		 *
		 * Ideally we would want to allow that to happen, but to
		 * do so requires making journal_forget() capable of
		 * revoking the queued write of a data block, which
		 * implies blocking on the journal lock.  *forget()
		 * cannot block due to truncate races.
		 *
		 * Eventually we can fix this by making journal_forget()
		 * return a status indicating whether or not it was able
		 * to revoke the buffer.  On successful revoke, it is
		 * safe not to set the allocation bit in the committed
		 * bitmap, because we know that there is no outstanding
		 * activity on the buffer any more and so it is safe to
		 * reallocate it.
		 */
		BUFFER_TRACE(bitmap_bh, "set in b_committed_data");
		J_ASSERT_BH(bitmap_bh,
				bh2jh(bitmap_bh)->b_committed_data != NULL);
		ext3_set_bit_atomic(sb_bgl_lock(sbi, block_group), bit + i,
				bh2jh(bitmap_bh)->b_committed_data);

		/*
		 * We clear the bit in the bitmap after setting the committed
		 * data bit, because this is the reverse order to that which
		 * the allocator uses.
		 */
		BUFFER_TRACE(bitmap_bh, "clear bit");
		if (!ext3_clear_bit_atomic(sb_bgl_lock(sbi, block_group),
						bit + i, bitmap_bh->b_data)) {
			jbd_unlock_bh_state(bitmap_bh);
			ext3_error(sb, __FUNCTION__,
				"bit already cleared for block %lu", block + i);
			jbd_lock_bh_state(bitmap_bh);
			BUFFER_TRACE(bitmap_bh, "bit already cleared");
		} else {
			group_freed++;
		}
	}
	jbd_unlock_bh_state(bitmap_bh);

	spin_lock(sb_bgl_lock(sbi, block_group));
	desc->bg_free_blocks_count =
		cpu_to_le16(le16_to_cpu(desc->bg_free_blocks_count) +
			group_freed);
	spin_unlock(sb_bgl_lock(sbi, block_group));
	percpu_counter_mod(&sbi->s_freeblocks_counter, count);

	/* We dirtied the bitmap block */
	BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
	err = ext3_journal_dirty_metadata(handle, bitmap_bh);

	/* And the group descriptor block */
	BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
	ret = ext3_journal_dirty_metadata(handle, gd_bh);
	if (!err) err = ret;
	*pdquot_freed_blocks += group_freed;

	if (overflow && !err) {
		block += count;
		count = overflow;
		goto do_more;
	}
	sb->s_dirt = 1;
error_return:
	brelse(bitmap_bh);
	ext3_std_error(sb, err);
	return;
}
/* Free given blocks, update quota and i_blocks field */
void ext3_free_blocks(handle_t *handle, struct inode *inode,
			unsigned long block, unsigned long count)
{
	struct super_block * sb;
	int dquot_freed_blocks;

	sb = inode->i_sb;
	if (!sb) {
		printk ("ext3_free_blocks: nonexistent device");
		return;
	}
	ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks);
	if (dquot_freed_blocks)
		DQUOT_FREE_BLOCK(inode, dquot_freed_blocks);
	return;
}
/*
 * For ext3 allocations, we must not reuse any blocks which are
 * allocated in the bitmap buffer's "last committed data" copy.  This
 * prevents deletes from freeing up the page for reuse until we have
 * committed the delete transaction.
 *
 * If we didn't do this, then deleting something and reallocating it as
 * data would allow the old block to be overwritten before the
 * transaction committed (because we force data to disk before commit).
 * This would lead to corruption if we crashed between overwriting the
 * data and committing the delete.
 *
 * @@@ We may want to make this allocation behaviour conditional on
 * data-writes at some point, and disable it for metadata allocations or
 * sync-data inodes.
 */
static int ext3_test_allocatable(int nr, struct buffer_head *bh)
{
	int ret;
	struct journal_head *jh = bh2jh(bh);

	if (ext3_test_bit(nr, bh->b_data))
		return 0;

	jbd_lock_bh_state(bh);
	if (!jh->b_committed_data)
		ret = 1;
	else
		ret = !ext3_test_bit(nr, jh->b_committed_data);
	jbd_unlock_bh_state(bh);
	return ret;
}
static int
bitmap_search_next_usable_block(int start, struct buffer_head *bh,
					int maxblocks)
{
	int next;
	struct journal_head *jh = bh2jh(bh);

	/*
	 * The bitmap search --- search forward alternately through the actual
	 * bitmap and the last-committed copy until we find a bit free in
	 * both
	 */
	while (start < maxblocks) {
		next = ext3_find_next_zero_bit(bh->b_data, maxblocks, start);
		if (next >= maxblocks)
			return -1;
		if (ext3_test_allocatable(next, bh))
			return next;
		jbd_lock_bh_state(bh);
		if (jh->b_committed_data)
			start = ext3_find_next_zero_bit(jh->b_committed_data,
							maxblocks, next);
		jbd_unlock_bh_state(bh);
	}
	return -1;
}
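
/*
 * Small worked example of the alternating search (illustrative bit
 * patterns, LSB = bit 0):
 *   on-disk bitmap      b_data           = ...0110  (bits 1, 2 in use)
 *   last-committed copy b_committed_data = ...0101  (bits 0, 2 in use)
 * Starting at 0: find_next_zero_bit(b_data) -> 0, but bit 0 is still set
 * in the committed copy, so we skip ahead via the committed bitmap (its
 * next zero is bit 1) and retry; the next zero in b_data at or after 1
 * is bit 3, which is clear in both copies, so 3 is returned.
 */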
/*
 * Find an allocatable block in a bitmap.  We honour both the bitmap and
 * its last-committed copy (if that exists), and perform the "most
 * appropriate allocation" algorithm of looking for a free block near
 * the initial goal; then for a free byte somewhere in the bitmap; then
 * for any free bit in the bitmap.
 */
static int
find_next_usable_block(int start, struct buffer_head *bh, int maxblocks)
{
	int here, next;
	char *p, *r;

	if (start > 0) {
		/*
		 * The goal was occupied; search forward for a free
		 * block within the next XX blocks.
		 *
		 * end_goal is more or less random, but it has to be
		 * less than EXT3_BLOCKS_PER_GROUP.  Aligning up to the
		 * next 64-bit boundary is simple..
		 */
		int end_goal = (start + 63) & ~63;
		if (end_goal > maxblocks)
			end_goal = maxblocks;
		here = ext3_find_next_zero_bit(bh->b_data, end_goal, start);
		if (here < end_goal && ext3_test_allocatable(here, bh))
			return here;
		ext3_debug("Bit not found near goal\n");
	}

	here = start;
	if (here < 0)
		here = 0;

	p = ((char *)bh->b_data) + (here >> 3);
	r = memscan(p, 0, (maxblocks - here + 7) >> 3);
	next = (r - ((char *)bh->b_data)) << 3;

	if (next < maxblocks && next >= start && ext3_test_allocatable(next, bh))
		return next;

	/*
	 * The bitmap search --- search forward alternately through the actual
	 * bitmap and the last-committed copy until we find a bit free in
	 * both
	 */
	here = bitmap_search_next_usable_block(here, bh, maxblocks);
	return here;
}
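
/*
 * Example of the staged search above (illustrative numbers): with
 * start = 100, end_goal = (100 + 63) & ~63 = 128, so stage one only
 * scans bits 100..127 near the goal.  Failing that, memscan() hunts
 * for a fully free byte (8 clear bits) from byte 12, i.e. bit 96,
 * onward, and only then does the bit-by-bit search through both
 * bitmaps run.
 */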
/*
 * We think we can allocate this block in this bitmap.  Try to set the bit.
 * If that succeeds then check that nobody has allocated and then freed the
 * block since we saw that it was not marked in b_committed_data.  If it _was_
 * allocated and freed then clear the bit in the bitmap again and return
 * zero (failure).
 */
static inline int
claim_block(spinlock_t *lock, int block, struct buffer_head *bh)
{
	struct journal_head *jh = bh2jh(bh);
	int ret;

	if (ext3_set_bit_atomic(lock, block, bh->b_data))
		return 0;
	jbd_lock_bh_state(bh);
	if (jh->b_committed_data && ext3_test_bit(block,jh->b_committed_data)) {
		ext3_clear_bit_atomic(lock, block, bh->b_data);
		ret = 0;
	} else {
		ret = 1;
	}
	jbd_unlock_bh_state(bh);
	return ret;
}
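
/*
 * Why the second check matters (sketch of the race): another CPU may
 * allocate and then free the same bit between our
 * ext3_test_allocatable() and the ext3_set_bit_atomic() above.  The
 * freed block is then marked in b_committed_data, and reusing it before
 * the old transaction commits could expose stale data after a crash, so
 * claim_block() backs the bit out and reports failure instead.
 */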
/*
 * If we failed to allocate the desired block then we may end up crossing to a
 * new bitmap.  In that case we must release write access to the old one via
 * ext3_journal_release_buffer(), else we'll run out of credits.
 */
static int
ext3_try_to_allocate(struct super_block *sb, handle_t *handle, int group,
	struct buffer_head *bitmap_bh, int goal, struct ext3_reserve_window *my_rsv)
{
	int group_first_block, start, end;

	/* we do allocation within the reservation window if we have a window */
	if (my_rsv) {
		group_first_block =
			le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block) +
			group * EXT3_BLOCKS_PER_GROUP(sb);
		if (my_rsv->_rsv_start >= group_first_block)
			start = my_rsv->_rsv_start - group_first_block;
		else
			/* reservation window crosses group boundary */
			start = 0;
		end = my_rsv->_rsv_end - group_first_block + 1;
		if (end > EXT3_BLOCKS_PER_GROUP(sb))
			/* reservation window crosses group boundary */
			end = EXT3_BLOCKS_PER_GROUP(sb);
		if ((start <= goal) && (goal < end))
			start = goal;
		else
			goal = -1;
	} else {
		if (goal > 0)
			start = goal;
		else
			start = 0;
		end = EXT3_BLOCKS_PER_GROUP(sb);
	}

	BUG_ON(start > EXT3_BLOCKS_PER_GROUP(sb));

repeat:
	if (goal < 0 || !ext3_test_allocatable(goal, bitmap_bh)) {
		goal = find_next_usable_block(start, bitmap_bh, end);
		if (goal < 0)
			goto fail_access;
		if (!my_rsv) {
			int i;

			for (i = 0; i < 7 && goal > start &&
					ext3_test_allocatable(goal - 1,
								bitmap_bh);
					i++, goal--)
				;
		}
	}
	start = goal;

	if (!claim_block(sb_bgl_lock(EXT3_SB(sb), group), goal, bitmap_bh)) {
		/*
		 * The block was allocated by another thread, or it was
		 * allocated and then freed by another thread
		 */
		start++;
		goal++;
		if (start >= end)
			goto fail_access;
		goto repeat;
	}
	return goal;
fail_access:
	return -1;
}
/**
 * find_next_reservable_window():
 *	find a reservable space within the given range.
 *	It does not allocate the reservation window yet:
 *	alloc_new_reservation() will do that work later.
 *
 * @search_head: the head of the searching list;
 *	This is not necessarily the list head of the whole filesystem.
 *
 *	We have both head and start_block to assist the search
 *	for the reservable space.  The list starts from head,
 *	but we will shift to the place where start_block is,
 *	then start from there, when looking for a reservable space.
 *
 * @size: the target new reservation window size
 *
 * @group_first_block: the first block we consider to start
 *	the real search from
 *
 * @last_block:
 *	the maximum block number that our goal reservable space
 *	could start from.  This is normally the last block in this
 *	group.  The search ends as soon as the start of the next
 *	possible reservable space is beyond this boundary.
 *	This also handles reservation windows that cross a group
 *	boundary.
 *
 * Basically we search the given range (start_block, last_block),
 * rather than the whole reservation tree, for a free region that
 * is of the requested size and has not been reserved.
 */
static int find_next_reservable_window(
				struct ext3_reserve_window_node *search_head,
				struct ext3_reserve_window_node *my_rsv,
				struct super_block * sb, int start_block,
				int last_block)
{
	struct rb_node *next;
	struct ext3_reserve_window_node *rsv, *prev;
	int cur;
	int size = my_rsv->rsv_goal_size;

	/* TODO: make the start of the reservation window byte-aligned */
	/* cur = *start_block & ~7;*/
	cur = start_block;
	rsv = search_head;
	if (!rsv)
		return -1;

	while (1) {
		if (cur <= rsv->rsv_end)
			cur = rsv->rsv_end + 1;

		/* TODO?
		 * in the case we could not find a reservable space of the
		 * expected size, during the re-search, we could remember
		 * the largest reservable space we did find and return
		 * that one.
		 *
		 * For now it will fail if we could not find the reservable
		 * space with expected-size (or more)...
		 */
		if (cur > last_block)
			return -1;		/* fail */

		prev = rsv;
		next = rb_next(&rsv->rsv_node);
		rsv = rb_entry(next, struct ext3_reserve_window_node, rsv_node);

		/*
		 * Reached the last reservation, we can just append to the
		 * previous one.
		 */
		if (!next)
			break;

		if (cur + size <= rsv->rsv_start) {
			/*
			 * Found a reservable space big enough.  We could
			 * have a reservation across the group boundary here
			 */
			break;
		}
	}
	/*
	 * We come here either when we reach the end of the whole list and
	 * there is empty reservable space after the last entry in the list
	 * (append to the end of the list), or when we found a reservable
	 * space in the middle of the list (the reservation window that we
	 * can append to): either way we succeed.
	 */
	if ((prev != my_rsv) && (!rsv_is_empty(&my_rsv->rsv_window)))
		rsv_window_remove(sb, my_rsv);

	/*
	 * Let's book the whole available window for now.  We will check the
	 * disk bitmap later and then, if there are free blocks then we adjust
	 * the window size if it's larger than requested.
	 * Otherwise, we will remove this node from the tree the next time
	 * find_next_reservable_window() is called.
	 */
	my_rsv->rsv_start = cur;
	my_rsv->rsv_end = cur + size - 1;
	my_rsv->rsv_alloc_hit = 0;

	if (prev != my_rsv)
		ext3_rsv_window_add(sb, my_rsv);

	return 0;
}
/**
 * alloc_new_reservation()--allocate a new reservation window
 *
 * To make a new reservation, we search part of the filesystem
 * reservation list (the part inside the group).  We try to
 * allocate a new reservation window near the allocation goal,
 * or the beginning of the group, if there is no goal.
 *
 * We first find a reservable space after the goal, then from
 * there, we check the bitmap for the first free block after
 * it.  If there is no free block until the end of the group, then the
 * whole group is full and we fail.  Otherwise, we check whether the free
 * block is inside the expected reservable space; if so, we succeed.
 * If the first free block is outside the reservable space, then
 * starting from the first free block, we search for the next available
 * space, and go on.
 *
 * On success, a new reservation will be found and inserted into the list.
 * It contains at least one free block, and it does not overlap with other
 * reservation windows.
 *
 * On failure, we failed to find a reservation window in this group.
 *
 * @my_rsv: the reservation
 *
 * @goal: The goal (group-relative).  It is where the search for a
 *	free reservable space should start from.
 *	If we have a goal (goal >= 0), then start from there;
 *	with no goal (goal = -1), we start from the first block
 *	of the group.
 *
 * @sb: the super block
 * @group: the group we are trying to allocate in
 * @bitmap_bh: the block group block bitmap
 */
static int alloc_new_reservation(struct ext3_reserve_window_node *my_rsv,
		int goal, struct super_block *sb,
		unsigned int group, struct buffer_head *bitmap_bh)
{
	struct ext3_reserve_window_node *search_head;
	int group_first_block, group_end_block, start_block;
	int first_free_block;
	struct rb_root *fs_rsv_root = &EXT3_SB(sb)->s_rsv_window_root;
	unsigned long size;
	int ret;
	spinlock_t *rsv_lock = &EXT3_SB(sb)->s_rsv_window_lock;

	group_first_block = le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block) +
				group * EXT3_BLOCKS_PER_GROUP(sb);
	group_end_block = group_first_block + EXT3_BLOCKS_PER_GROUP(sb) - 1;

	if (goal < 0)
		start_block = group_first_block;
	else
		start_block = goal + group_first_block;

	size = my_rsv->rsv_goal_size;

	if (!rsv_is_empty(&my_rsv->rsv_window)) {
		/*
		 * if the old reservation crosses the group boundary
		 * and if the goal is inside the old reservation window,
		 * we will come here when we just failed to allocate from
		 * the first part of the window.  We still have another part
		 * that belongs to the next group.  In this case, there is no
		 * point in discarding our window and trying to allocate a
		 * new one in this group (which will fail): we should keep
		 * the reservation window and simply move on.
		 *
		 * Maybe we could shift the start block of the reservation
		 * window to the first block of the next group.
		 */
		if ((my_rsv->rsv_start <= group_end_block) &&
			(my_rsv->rsv_end > group_end_block) &&
			(start_block >= my_rsv->rsv_start))
			return -1;

		if ((my_rsv->rsv_alloc_hit >
		     (my_rsv->rsv_end - my_rsv->rsv_start + 1) / 2)) {
			/*
			 * if the previous allocation hit ratio is greater
			 * than 1/2, then we double the size of the
			 * reservation window the next time;
			 * otherwise we keep the same size window
			 */
			size = size * 2;
			if (size > EXT3_MAX_RESERVE_BLOCKS)
				size = EXT3_MAX_RESERVE_BLOCKS;
			my_rsv->rsv_goal_size= size;
		}
	}

	spin_lock(rsv_lock);
	/*
	 * shift the search start to the window near the goal block
	 */
	search_head = search_reserve_window(fs_rsv_root, start_block);

	/*
	 * find_next_reservable_window() simply finds a reservable window
	 * inside the given range(start_block, group_end_block).
	 *
	 * To make sure the reservation window has a free bit inside it, we
	 * need to check the bitmap after we found a reservable window.
	 */
retry:
	ret = find_next_reservable_window(search_head, my_rsv, sb,
						start_block, group_end_block);

	if (ret == -1) {
		if (!rsv_is_empty(&my_rsv->rsv_window))
			rsv_window_remove(sb, my_rsv);
		spin_unlock(rsv_lock);
		return -1;
	}

	/*
	 * On success, find_next_reservable_window() returns the
	 * reservation window where there is a reservable space after it.
	 * Before we reserve this reservable space, we need
	 * to make sure there is at least a free block inside this region.
	 *
	 * Search the first free bit on the block bitmap and the copy of the
	 * last committed bitmap alternately, until we find an allocatable
	 * block.  The search starts from the start block of the reservable
	 * space we just found.
	 */
	spin_unlock(rsv_lock);
	first_free_block = bitmap_search_next_usable_block(
			my_rsv->rsv_start - group_first_block,
			bitmap_bh, group_end_block - group_first_block + 1);

	if (first_free_block < 0) {
		/*
		 * no free block left on the bitmap, no point
		 * in reserving the space.  return failed.
		 */
		spin_lock(rsv_lock);
		if (!rsv_is_empty(&my_rsv->rsv_window))
			rsv_window_remove(sb, my_rsv);
		spin_unlock(rsv_lock);
		return -1;		/* failed */
	}

	start_block = first_free_block + group_first_block;
	/*
	 * check if the first free block is within the
	 * free space we just reserved
	 */
	if (start_block >= my_rsv->rsv_start && start_block < my_rsv->rsv_end)
		return 0;		/* success */
	/*
	 * if the first free bit we found is out of the reservable space,
	 * continue searching for the next reservable space,
	 * starting from where the free block is;
	 * we also shift the list head to where we stopped last time
	 */
	search_head = my_rsv;
	spin_lock(rsv_lock);
	goto retry;
}
/*
 * This is the main function used to allocate a new block and its reservation
 * window.
 *
 * Each time a new block allocation is needed, we first try to allocate from
 * the inode's own reservation window.  If it does not have a reservation
 * window yet, then instead of looking for a free bit in the bitmap first and
 * only then checking the reservation list to see whether that bit falls
 * inside somebody else's reservation window, we try to allocate a
 * reservation window for the inode starting from the goal, and then do the
 * block allocation within that reservation window.
 *
 * This avoids searching the reservation list again and again when somebody
 * is looking for a free block (without a reservation) and there are lots of
 * free blocks, but they are all being reserved.
 *
 * We use a sorted red-black tree for the per-filesystem reservation list.
 * The insert, remove and find-a-free-space (non-reserved) operations on it
 * should all be fast.
 */
static int
ext3_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle,
			unsigned int group, struct buffer_head *bitmap_bh,
			int goal, struct ext3_reserve_window_node * my_rsv,
			int *errp)
{
	unsigned long group_first_block;
	int ret = 0;
	int fatal;

	*errp = 0;

	/*
	 * Make sure we use undo access for the bitmap, because it is critical
	 * that we do the frozen_data COW on bitmap buffers in all cases even
	 * if the buffer is in BJ_Forget state in the committing transaction.
	 */
	BUFFER_TRACE(bitmap_bh, "get undo access for new block");
	fatal = ext3_journal_get_undo_access(handle, bitmap_bh);
	if (fatal) {
		*errp = fatal;
		return -1;
	}

	/*
	 * we don't deal with reservation when
	 * filesystem is mounted without reservation
	 * or the file is not a regular file
	 * or last attempt to allocate a block with reservation turned on failed
	 */
	if (my_rsv == NULL ) {
		ret = ext3_try_to_allocate(sb, handle, group, bitmap_bh, goal, NULL);
		goto out;
	}

	/*
	 * goal is a group relative block number (if there is a goal)
	 * 0 < goal < EXT3_BLOCKS_PER_GROUP(sb)
	 * first block is a filesystem wide block number
	 * first block is the block number of the first block in this group
	 */
	group_first_block = le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block) +
			group * EXT3_BLOCKS_PER_GROUP(sb);

	/*
	 * Basically we will allocate a new block from inode's reservation
	 * window.
	 *
	 * We need to allocate a new reservation window, if:
	 * a) inode does not have a reservation window; or
	 * b) last attempt to allocate a block from existing reservation
	 *    failed; or
	 * c) we come here with a goal and with a reservation window
	 *
	 * We do not need to allocate a new reservation window if we come here
	 * at the beginning with a goal and the goal is inside the window, or
	 * we don't have a goal but already have a reservation window;
	 * then we could go on to allocate from the reservation window directly.
	 */
	while (1) {
		if (rsv_is_empty(&my_rsv->rsv_window) || (ret < 0) ||
			!goal_in_my_reservation(&my_rsv->rsv_window, goal, group, sb)) {
			ret = alloc_new_reservation(my_rsv, goal, sb,
							group, bitmap_bh);
			if (ret < 0)
				break;			/* failed */

			if (!goal_in_my_reservation(&my_rsv->rsv_window, goal, group, sb))
				goal = -1;
		}
		if ((my_rsv->rsv_start >= group_first_block + EXT3_BLOCKS_PER_GROUP(sb))
		    || (my_rsv->rsv_end < group_first_block))
			BUG();
		ret = ext3_try_to_allocate(sb, handle, group, bitmap_bh, goal,
					   &my_rsv->rsv_window);
		if (ret >= 0) {
			my_rsv->rsv_alloc_hit++;
			break;				/* succeed */
		}
	}
out:
	if (ret >= 0) {
		BUFFER_TRACE(bitmap_bh, "journal_dirty_metadata for "
					"bitmap block");
		fatal = ext3_journal_dirty_metadata(handle, bitmap_bh);
		if (fatal) {
			*errp = fatal;
			return -1;
		}
		return ret;
	}

	BUFFER_TRACE(bitmap_bh, "journal_release_buffer");
	ext3_journal_release_buffer(handle, bitmap_bh);
	return ret;
}
static int ext3_has_free_blocks(struct ext3_sb_info *sbi)
{
	int free_blocks, root_blocks;

	free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
	root_blocks = le32_to_cpu(sbi->s_es->s_r_blocks_count);
	if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) &&
		sbi->s_resuid != current->fsuid &&
		(sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) {
		return 0;
	}
	return 1;
}
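
/*
 * Illustrative example: with s_r_blocks_count = 100, a filesystem that is
 * down to 80 free blocks is already inside the reserved pool, so an
 * ordinary user's allocation is refused while root (or the configured
 * resuid/resgid owner) can still allocate from the remaining blocks.
 */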
/*
 * ext3_should_retry_alloc() is called when ENOSPC is returned, and if
 * it is profitable to retry the operation, this function will wait
 * for the current or committing transaction to complete, and then
 * return TRUE.
 */
int ext3_should_retry_alloc(struct super_block *sb, int *retries)
{
	if (!ext3_has_free_blocks(EXT3_SB(sb)) || (*retries)++ > 3)
		return 0;

	jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id);

	return journal_force_commit_nested(EXT3_SB(sb)->s_journal);
}
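
/*
 * Typical caller pattern (sketch; this mirrors how ext3 write paths retry
 * after ENOSPC, with the operation name shortened for illustration):
 *
 *	int retries = 0;
 * retry:
 *	err = do_allocating_operation(...);
 *	if (err == -ENOSPC && ext3_should_retry_alloc(sb, &retries))
 *		goto retry;
 */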
/*
 * ext3_new_block uses a goal block to assist allocation.  If the goal is
 * free, or there is a free block within 64 blocks of the goal, that block
 * is allocated.  Otherwise a forward search is made for a free block; within
 * each block group the search first looks for an entire free byte in the block
 * bitmap, and then for any free bit if that fails.
 * This function also updates quota and the i_blocks field.
 */
int ext3_new_block(handle_t *handle, struct inode *inode,
			unsigned long goal, int *errp)
{
	struct buffer_head *bitmap_bh = NULL;
	struct buffer_head *gdp_bh;
	int group_no;
	int goal_group;
	int ret_block;
	int bgi;			/* blockgroup iteration index */
	int target_block;
	int fatal = 0, err;
	int performed_allocation = 0;
	int free_blocks;
	struct super_block *sb;
	struct ext3_group_desc *gdp;
	struct ext3_super_block *es;
	struct ext3_sb_info *sbi;
	struct ext3_reserve_window_node *my_rsv = NULL;
	struct ext3_block_alloc_info *block_i;
	unsigned short windowsz = 0;
#ifdef EXT3FS_DEBUG
	static int goal_hits, goal_attempts;
#endif
	unsigned long ngroups;

	*errp = -ENOSPC;
	sb = inode->i_sb;
	if (!sb) {
		printk("ext3_new_block: nonexistent device");
		return 0;
	}

	/*
	 * Check quota for allocation of this block.
	 */
	if (DQUOT_ALLOC_BLOCK(inode, 1)) {
		*errp = -EDQUOT;
		return 0;
	}

	sbi = EXT3_SB(sb);
	es = EXT3_SB(sb)->s_es;
	ext3_debug("goal=%lu.\n", goal);
	/*
	 * Allocate a block from reservation only when
	 * filesystem is mounted with reservation(default,-o reservation), and
	 * it's a regular file, and
	 * the desired window size is greater than 0 (One could use ioctl
	 * command EXT3_IOC_SETRSVSZ to set the window size to 0 to turn off
	 * reservation on that particular file)
	 */
	block_i = EXT3_I(inode)->i_block_alloc_info;
	if (block_i && ((windowsz = block_i->rsv_window_node.rsv_goal_size) > 0))
		my_rsv = &block_i->rsv_window_node;

	if (!ext3_has_free_blocks(sbi)) {
		*errp = -ENOSPC;
		goto out;
	}

	/*
	 * First, test whether the goal block is free.
	 */
	if (goal < le32_to_cpu(es->s_first_data_block) ||
	    goal >= le32_to_cpu(es->s_blocks_count))
		goal = le32_to_cpu(es->s_first_data_block);
	group_no = (goal - le32_to_cpu(es->s_first_data_block)) /
			EXT3_BLOCKS_PER_GROUP(sb);
	gdp = ext3_get_group_desc(sb, group_no, &gdp_bh);
	if (!gdp)
		goto io_error;

	goal_group = group_no;
retry:
	free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
	/*
	 * if there are not enough free blocks to make a new reservation,
	 * turn off reservation for this allocation
	 */
	if (my_rsv && (free_blocks < windowsz)
		&& (rsv_is_empty(&my_rsv->rsv_window)))
		my_rsv = NULL;

	if (free_blocks > 0) {
		ret_block = ((goal - le32_to_cpu(es->s_first_data_block)) %
				EXT3_BLOCKS_PER_GROUP(sb));
		bitmap_bh = read_block_bitmap(sb, group_no);
		if (!bitmap_bh)
			goto io_error;
		ret_block = ext3_try_to_allocate_with_rsv(sb, handle, group_no,
					bitmap_bh, ret_block, my_rsv, &fatal);
		if (fatal)
			goto out;
		if (ret_block >= 0)
			goto allocated;
	}
	ngroups = EXT3_SB(sb)->s_groups_count;
	smp_rmb();

	/*
	 * Now search the rest of the groups.  We assume that
	 * group_no and gdp correctly point to the last group visited.
	 */
	for (bgi = 0; bgi < ngroups; bgi++) {
		group_no++;
		if (group_no >= ngroups)
			group_no = 0;
		gdp = ext3_get_group_desc(sb, group_no, &gdp_bh);
		if (!gdp) {
			*errp = -EIO;
			goto out;
		}
		free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
		/*
		 * skip this group if the number of
		 * free blocks is less than half of the reservation
		 * window size.
		 */
		if (free_blocks <= (windowsz/2))
			continue;

		brelse(bitmap_bh);
		bitmap_bh = read_block_bitmap(sb, group_no);
		if (!bitmap_bh)
			goto io_error;
		ret_block = ext3_try_to_allocate_with_rsv(sb, handle, group_no,
					bitmap_bh, -1, my_rsv, &fatal);
		if (fatal)
			goto out;
		if (ret_block >= 0)
			goto allocated;
	}
	/*
	 * We may end up with a bogus earlier ENOSPC error because the
	 * filesystem is "full" of reservations while there are in fact
	 * free blocks available on disk.  In this case, we just forget
	 * about the reservations and redo the block allocation as if
	 * there were none.
	 */
	if (my_rsv) {
		my_rsv = NULL;
		group_no = goal_group;
		goto retry;
	}
	/* No space left on the device */
	*errp = -ENOSPC;
	goto out;
allocated:

	ext3_debug("using block group %d(%d)\n",
			group_no, gdp->bg_free_blocks_count);

	BUFFER_TRACE(gdp_bh, "get_write_access");
	fatal = ext3_journal_get_write_access(handle, gdp_bh);
	if (fatal)
		goto out;

	target_block = ret_block + group_no * EXT3_BLOCKS_PER_GROUP(sb)
				+ le32_to_cpu(es->s_first_data_block);

	if (target_block == le32_to_cpu(gdp->bg_block_bitmap) ||
	    target_block == le32_to_cpu(gdp->bg_inode_bitmap) ||
	    in_range(target_block, le32_to_cpu(gdp->bg_inode_table),
		      EXT3_SB(sb)->s_itb_per_group))
		ext3_error(sb, "ext3_new_block",
			    "Allocating block in system zone - "
			    "block = %u", target_block);

	performed_allocation = 1;

#ifdef CONFIG_JBD_DEBUG
	{
		struct buffer_head *debug_bh;

		/* Record bitmap buffer state in the newly allocated block */
		debug_bh = sb_find_get_block(sb, target_block);
		if (debug_bh) {
			BUFFER_TRACE(debug_bh, "state when allocated");
			BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap state");
			brelse(debug_bh);
		}
	}
	jbd_lock_bh_state(bitmap_bh);
	spin_lock(sb_bgl_lock(sbi, group_no));
	if (buffer_jbd(bitmap_bh) && bh2jh(bitmap_bh)->b_committed_data) {
		if (ext3_test_bit(ret_block,
				bh2jh(bitmap_bh)->b_committed_data)) {
			printk("%s: block was unexpectedly set in "
				"b_committed_data\n", __FUNCTION__);
		}
	}
	ext3_debug("found bit %d\n", ret_block);
	spin_unlock(sb_bgl_lock(sbi, group_no));
	jbd_unlock_bh_state(bitmap_bh);
#endif

	/* ret_block was blockgroup-relative.  Now it becomes fs-relative */
	ret_block = target_block;

	if (ret_block >= le32_to_cpu(es->s_blocks_count)) {
		ext3_error(sb, "ext3_new_block",
			    "block(%d) >= blocks count(%d) - "
			    "block_group = %d, es == %p ", ret_block,
			le32_to_cpu(es->s_blocks_count), group_no, es);
		goto out;
	}

	/*
	 * It is up to the caller to add the new buffer to a journal
	 * list of some description.  We don't know in advance whether
	 * the caller wants to use it as metadata or data.
	 */
	ext3_debug("allocating block %d. Goal hits %d of %d.\n",
			ret_block, goal_hits, goal_attempts);

	spin_lock(sb_bgl_lock(sbi, group_no));
	gdp->bg_free_blocks_count =
			cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) - 1);
	spin_unlock(sb_bgl_lock(sbi, group_no));
	percpu_counter_mod(&sbi->s_freeblocks_counter, -1);

	BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor");
	err = ext3_journal_dirty_metadata(handle, gdp_bh);
	if (!fatal)
		fatal = err;

	sb->s_dirt = 1;
	if (fatal)
		goto out;

	*errp = 0;
	brelse(bitmap_bh);
	return ret_block;

io_error:
	*errp = -EIO;
out:
	if (fatal) {
		*errp = fatal;
		ext3_std_error(sb, fatal);
	}
	/*
	 * Undo the block allocation
	 */
	if (!performed_allocation)
		DQUOT_FREE_BLOCK(inode, 1);
	brelse(bitmap_bh);
	return 0;
}
unsigned long ext3_count_free_blocks(struct super_block *sb)
{
	unsigned long desc_count;
	struct ext3_group_desc *gdp;
	int i;
	unsigned long ngroups;
#ifdef EXT3FS_DEBUG
	struct ext3_super_block *es;
	unsigned long bitmap_count, x;
	struct buffer_head *bitmap_bh = NULL;

	lock_super(sb);
	es = EXT3_SB(sb)->s_es;
	desc_count = 0;
	bitmap_count = 0;
	gdp = NULL;
	for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) {
		gdp = ext3_get_group_desc(sb, i, NULL);
		if (!gdp)
			continue;
		desc_count += le16_to_cpu(gdp->bg_free_blocks_count);
		brelse(bitmap_bh);
		bitmap_bh = read_block_bitmap(sb, i);
		if (bitmap_bh == NULL)
			continue;

		x = ext3_count_free(bitmap_bh, sb->s_blocksize);
		printk("group %d: stored = %d, counted = %lu\n",
			i, le16_to_cpu(gdp->bg_free_blocks_count), x);
		bitmap_count += x;
	}
	brelse(bitmap_bh);
	printk("ext3_count_free_blocks: stored = %u, computed = %lu, %lu\n",
	       le32_to_cpu(es->s_free_blocks_count), desc_count, bitmap_count);
	unlock_super(sb);
	return bitmap_count;
#else
	desc_count = 0;
	ngroups = EXT3_SB(sb)->s_groups_count;
	smp_rmb();
	for (i = 0; i < ngroups; i++) {
		gdp = ext3_get_group_desc(sb, i, NULL);
		if (!gdp)
			continue;
		desc_count += le16_to_cpu(gdp->bg_free_blocks_count);
	}

	return desc_count;
#endif
}
static inline int
block_in_use(unsigned long block, struct super_block *sb, unsigned char *map)
{
	return ext3_test_bit ((block -
		le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block)) %
			 EXT3_BLOCKS_PER_GROUP(sb), map);
}

static inline int test_root(int a, int b)
{
	int num = b;

	while (a > num)
		num *= b;
	return num == a;
}
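
/*
 * Example: test_root(49, 7) steps num through 7 -> 49, and 49 == 49, so
 * 49 is a power of 7.  test_root(8, 3) steps num through 3 -> 9, and
 * 9 != 8, so 8 is not a power of 3.
 */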
static int ext3_group_sparse(int group)
{
	if (group <= 1)
		return 1;
	if (!(group & 1))
		return 0;
	return (test_root(group, 7) || test_root(group, 5) ||
		test_root(group, 3));
}
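
/*
 * With sparse superblocks, the groups that hold superblock/descriptor
 * backups are therefore 0, 1 and the powers of 3, 5 and 7:
 * 0, 1, 3, 5, 7, 9, 25, 27, 49, 81, 125, ...
 */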
/**
 *	ext3_bg_has_super - number of blocks used by the superblock in group
 *	@sb: superblock for filesystem
 *	@group: group number to check
 *
 *	Return the number of blocks used by the superblock (primary or backup)
 *	in this group.  Currently this will be only 0 or 1.
 */
int ext3_bg_has_super(struct super_block *sb, int group)
{
	if (EXT3_HAS_RO_COMPAT_FEATURE(sb, EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER) &&
	    !ext3_group_sparse(group))
		return 0;
	return 1;
}

/**
 *	ext3_bg_num_gdb - number of blocks used by the group table in group
 *	@sb: superblock for filesystem
 *	@group: group number to check
 *
 *	Return the number of blocks used by the group descriptor table
 *	(primary or backup) in this group.  In the future there may be a
 *	different number of descriptor blocks in each group.
 */
unsigned long ext3_bg_num_gdb(struct super_block *sb, int group)
{
	if (EXT3_HAS_RO_COMPAT_FEATURE(sb, EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER) &&
	    !ext3_group_sparse(group))
		return 0;
	return EXT3_SB(sb)->s_gdb_count;
}
#ifdef CONFIG_EXT3_CHECK
/* Called at mount-time, super-block is locked */
void ext3_check_blocks_bitmap (struct super_block * sb)
{
	struct ext3_super_block *es;
	unsigned long desc_count, bitmap_count, x, j;
	unsigned long desc_blocks;
	struct buffer_head *bitmap_bh = NULL;
	struct ext3_group_desc *gdp;
	int i;

	es = EXT3_SB(sb)->s_es;
	desc_count = 0;
	bitmap_count = 0;
	gdp = NULL;
	for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) {
		gdp = ext3_get_group_desc (sb, i, NULL);
		if (!gdp)
			continue;
		desc_count += le16_to_cpu(gdp->bg_free_blocks_count);
		brelse(bitmap_bh);
		bitmap_bh = read_block_bitmap(sb, i);
		if (bitmap_bh == NULL)
			continue;

		if (ext3_bg_has_super(sb, i) &&
				!ext3_test_bit(0, bitmap_bh->b_data))
			ext3_error(sb, __FUNCTION__,
				   "Superblock in group %d is marked free", i);

		desc_blocks = ext3_bg_num_gdb(sb, i);
		for (j = 0; j < desc_blocks; j++)
			if (!ext3_test_bit(j + 1, bitmap_bh->b_data))
				ext3_error(sb, __FUNCTION__,
					   "Descriptor block #%lu in group "
					   "%d is marked free", j, i);

		if (!block_in_use (le32_to_cpu(gdp->bg_block_bitmap),
						sb, bitmap_bh->b_data))
			ext3_error (sb, "ext3_check_blocks_bitmap",
				    "Block bitmap for group %d is marked free",
				    i);

		if (!block_in_use (le32_to_cpu(gdp->bg_inode_bitmap),
						sb, bitmap_bh->b_data))
			ext3_error (sb, "ext3_check_blocks_bitmap",
				    "Inode bitmap for group %d is marked free",
				    i);

		for (j = 0; j < EXT3_SB(sb)->s_itb_per_group; j++)
			if (!block_in_use (le32_to_cpu(gdp->bg_inode_table) + j,
							sb, bitmap_bh->b_data))
				ext3_error (sb, "ext3_check_blocks_bitmap",
					    "Block #%lu of the inode table in "
					    "group %d is marked free", j, i);

		x = ext3_count_free(bitmap_bh, sb->s_blocksize);
		if (le16_to_cpu(gdp->bg_free_blocks_count) != x)
			ext3_error (sb, "ext3_check_blocks_bitmap",
				    "Wrong free blocks count for group %d, "
				    "stored = %d, counted = %lu", i,
				    le16_to_cpu(gdp->bg_free_blocks_count), x);
		bitmap_count += x;
	}
	brelse(bitmap_bh);
	if (le32_to_cpu(es->s_free_blocks_count) != bitmap_count)
		ext3_error (sb, "ext3_check_blocks_bitmap",
			    "Wrong free blocks count in super block, "
			    "stored = %lu, counted = %lu",
			    (unsigned long)le32_to_cpu(es->s_free_blocks_count),
			    bitmap_count);
}
#endif