/* -*- mode: c; c-basic-offset: 8; -*-
 * vim: noexpandtab sw=8 ts=8 sts=0:
 *
 * refcounttree.c
 *
 * Copyright (C) 2009 Oracle. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License version 2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 */

#include <linux/sort.h>
#define MLOG_MASK_PREFIX ML_REFCOUNT
#include <cluster/masklog.h>
#include "ocfs2.h"
#include "inode.h"
#include "alloc.h"
#include "suballoc.h"
#include "journal.h"
#include "uptodate.h"
#include "super.h"
#include "buffer_head_io.h"
#include "blockcheck.h"
#include "refcounttree.h"
#include "sysfile.h"
#include "dlmglue.h"
#include "extent_map.h"

static inline struct ocfs2_refcount_tree *
cache_info_to_refcount(struct ocfs2_caching_info *ci)
{
	return container_of(ci, struct ocfs2_refcount_tree, rf_ci);
}
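
/*
 * Sanity-check a refcount block as it is read off disk: verify the
 * metadata ecc/checksum, the block signature, the physical block
 * number and the filesystem generation.  Any error here is local to
 * this one block.
 */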
static int ocfs2_validate_refcount_block(struct super_block *sb,
					 struct buffer_head *bh)
{
	int rc;
	struct ocfs2_refcount_block *rb =
		(struct ocfs2_refcount_block *)bh->b_data;

	mlog(0, "Validating refcount block %llu\n",
	     (unsigned long long)bh->b_blocknr);

	BUG_ON(!buffer_uptodate(bh));

	/*
	 * If the ecc fails, we return the error but otherwise
	 * leave the filesystem running.  We know any error is
	 * local to this block.
	 */
	rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &rb->rf_check);
	if (rc) {
		mlog(ML_ERROR, "Checksum failed for refcount block %llu\n",
		     (unsigned long long)bh->b_blocknr);
		return rc;
	}

	if (!OCFS2_IS_VALID_REFCOUNT_BLOCK(rb)) {
		ocfs2_error(sb,
			    "Refcount block #%llu has bad signature %.*s",
			    (unsigned long long)bh->b_blocknr, 7,
			    rb->rf_signature);
		return -EINVAL;
	}

	if (le64_to_cpu(rb->rf_blkno) != bh->b_blocknr) {
		ocfs2_error(sb,
			    "Refcount block #%llu has an invalid rf_blkno "
			    "of %llu",
			    (unsigned long long)bh->b_blocknr,
			    (unsigned long long)le64_to_cpu(rb->rf_blkno));
		return -EINVAL;
	}

	if (le32_to_cpu(rb->rf_fs_generation) != OCFS2_SB(sb)->fs_generation) {
		ocfs2_error(sb,
			    "Refcount block #%llu has an invalid "
			    "rf_fs_generation of #%u",
			    (unsigned long long)bh->b_blocknr,
			    le32_to_cpu(rb->rf_fs_generation));
		return -EINVAL;
	}

	return 0;
}

static int ocfs2_read_refcount_block(struct ocfs2_caching_info *ci,
				     u64 rb_blkno,
				     struct buffer_head **bh)
{
	int rc;
	struct buffer_head *tmp = *bh;

	rc = ocfs2_read_block(ci, rb_blkno, &tmp,
			      ocfs2_validate_refcount_block);

	/* If ocfs2_read_block() got us a new bh, pass it up. */
	if (!rc && !*bh)
		*bh = tmp;

	return rc;
}

static u64 ocfs2_refcount_cache_owner(struct ocfs2_caching_info *ci)
{
	struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);

	return rf->rf_blkno;
}

static struct super_block *
ocfs2_refcount_cache_get_super(struct ocfs2_caching_info *ci)
{
	struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);

	return rf->rf_sb;
}

static void ocfs2_refcount_cache_lock(struct ocfs2_caching_info *ci)
{
	struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);

	spin_lock(&rf->rf_lock);
}

static void ocfs2_refcount_cache_unlock(struct ocfs2_caching_info *ci)
{
	struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);

	spin_unlock(&rf->rf_lock);
}

static void ocfs2_refcount_cache_io_lock(struct ocfs2_caching_info *ci)
{
	struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);

	mutex_lock(&rf->rf_io_mutex);
}

static void ocfs2_refcount_cache_io_unlock(struct ocfs2_caching_info *ci)
{
	struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);

	mutex_unlock(&rf->rf_io_mutex);
}

static const struct ocfs2_caching_operations ocfs2_refcount_caching_ops = {
	.co_owner		= ocfs2_refcount_cache_owner,
	.co_get_super		= ocfs2_refcount_cache_get_super,
	.co_cache_lock		= ocfs2_refcount_cache_lock,
	.co_cache_unlock	= ocfs2_refcount_cache_unlock,
	.co_io_lock		= ocfs2_refcount_cache_io_lock,
	.co_io_unlock		= ocfs2_refcount_cache_io_unlock,
};
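
/*
 * Look up the in-memory refcount tree rooted at blkno in the
 * per-superblock rb-tree.  The caller must hold osb->osb_lock.
 */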
static struct ocfs2_refcount_tree *
ocfs2_find_refcount_tree(struct ocfs2_super *osb, u64 blkno)
{
	struct rb_node *n = osb->osb_rf_lock_tree.rb_node;
	struct ocfs2_refcount_tree *tree = NULL;

	while (n) {
		tree = rb_entry(n, struct ocfs2_refcount_tree, rf_node);

		if (blkno < tree->rf_blkno)
			n = n->rb_left;
		else if (blkno > tree->rf_blkno)
			n = n->rb_right;
		else
			return tree;
	}

	return NULL;
}

/* osb_lock is already locked. */
static void ocfs2_insert_refcount_tree(struct ocfs2_super *osb,
				       struct ocfs2_refcount_tree *new)
{
	u64 rf_blkno = new->rf_blkno;
	struct rb_node *parent = NULL;
	struct rb_node **p = &osb->osb_rf_lock_tree.rb_node;
	struct ocfs2_refcount_tree *tmp;

	while (*p) {
		parent = *p;
		tmp = rb_entry(parent, struct ocfs2_refcount_tree,
			       rf_node);

		if (rf_blkno < tmp->rf_blkno)
			p = &(*p)->rb_left;
		else if (rf_blkno > tmp->rf_blkno)
			p = &(*p)->rb_right;
		else {
			/* This should never happen! */
			mlog(ML_ERROR, "Duplicate refcount block %llu found!\n",
			     (unsigned long long)rf_blkno);
			BUG();
		}
	}

	rb_link_node(&new->rf_node, parent, p);
	rb_insert_color(&new->rf_node, &osb->osb_rf_lock_tree);
}

static void ocfs2_free_refcount_tree(struct ocfs2_refcount_tree *tree)
{
	ocfs2_metadata_cache_exit(&tree->rf_ci);
	ocfs2_simple_drop_lockres(OCFS2_SB(tree->rf_sb), &tree->rf_lockres);
	ocfs2_lock_res_free(&tree->rf_lockres);
	kfree(tree);
}

static inline void
ocfs2_erase_refcount_tree_from_list_no_lock(struct ocfs2_super *osb,
					struct ocfs2_refcount_tree *tree)
{
	rb_erase(&tree->rf_node, &osb->osb_rf_lock_tree);
	if (osb->osb_ref_tree_lru && osb->osb_ref_tree_lru == tree)
		osb->osb_ref_tree_lru = NULL;
}

static void ocfs2_erase_refcount_tree_from_list(struct ocfs2_super *osb,
					struct ocfs2_refcount_tree *tree)
{
	spin_lock(&osb->osb_lock);
	ocfs2_erase_refcount_tree_from_list_no_lock(osb, tree);
	spin_unlock(&osb->osb_lock);
}

void ocfs2_kref_remove_refcount_tree(struct kref *kref)
{
	struct ocfs2_refcount_tree *tree =
		container_of(kref, struct ocfs2_refcount_tree, rf_getcnt);

	ocfs2_free_refcount_tree(tree);
}

static inline void
ocfs2_refcount_tree_get(struct ocfs2_refcount_tree *tree)
{
	kref_get(&tree->rf_getcnt);
}

static inline void
ocfs2_refcount_tree_put(struct ocfs2_refcount_tree *tree)
{
	kref_put(&tree->rf_getcnt, ocfs2_kref_remove_refcount_tree);
}

static inline void ocfs2_init_refcount_tree_ci(struct ocfs2_refcount_tree *new,
					       struct super_block *sb)
{
	ocfs2_metadata_cache_init(&new->rf_ci, &ocfs2_refcount_caching_ops);
	mutex_init(&new->rf_io_mutex);
	new->rf_sb = sb;
	spin_lock_init(&new->rf_lock);
}

static inline void ocfs2_init_refcount_tree_lock(struct ocfs2_super *osb,
					struct ocfs2_refcount_tree *new,
					u64 rf_blkno, u32 generation)
{
	init_rwsem(&new->rf_sem);
	ocfs2_refcount_lock_res_init(&new->rf_lockres, osb,
				     rf_blkno, generation);
}

static struct ocfs2_refcount_tree*
ocfs2_allocate_refcount_tree(struct ocfs2_super *osb, u64 rf_blkno)
{
	struct ocfs2_refcount_tree *new;

	new = kzalloc(sizeof(struct ocfs2_refcount_tree), GFP_NOFS);
	if (!new)
		return NULL;

	new->rf_blkno = rf_blkno;
	kref_init(&new->rf_getcnt);
	ocfs2_init_refcount_tree_ci(new, osb->sb);

	return new;
}
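
/*
 * Find the in-memory refcount tree for rf_blkno, allocating and
 * inserting a new one if it isn't cached yet.  A one-entry LRU
 * (osb->osb_ref_tree_lru) short-circuits the common repeated lookup.
 */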
static int ocfs2_get_refcount_tree(struct ocfs2_super *osb, u64 rf_blkno,
				   struct ocfs2_refcount_tree **ret_tree)
{
	int ret = 0;
	struct ocfs2_refcount_tree *tree, *new = NULL;
	struct buffer_head *ref_root_bh = NULL;
	struct ocfs2_refcount_block *ref_rb;

	spin_lock(&osb->osb_lock);
	if (osb->osb_ref_tree_lru &&
	    osb->osb_ref_tree_lru->rf_blkno == rf_blkno)
		tree = osb->osb_ref_tree_lru;
	else
		tree = ocfs2_find_refcount_tree(osb, rf_blkno);
	if (tree)
		goto out;

	spin_unlock(&osb->osb_lock);

	new = ocfs2_allocate_refcount_tree(osb, rf_blkno);
	if (!new) {
		ret = -ENOMEM;
		mlog_errno(ret);
		return ret;
	}

	/*
	 * We need the generation to create the refcount tree lock and since
	 * it isn't changed during tree modification, we are safe here to
	 * read without protection.
	 * We also have to purge the cache after we create the lock since the
	 * refcount block may have stale data.  It can only be trusted when
	 * we hold the refcount lock.
	 */
	ret = ocfs2_read_refcount_block(&new->rf_ci, rf_blkno, &ref_root_bh);
	if (ret) {
		mlog_errno(ret);
		ocfs2_metadata_cache_exit(&new->rf_ci);
		kfree(new);
		return ret;
	}

	ref_rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
	new->rf_generation = le32_to_cpu(ref_rb->rf_generation);
	ocfs2_init_refcount_tree_lock(osb, new, rf_blkno,
				      new->rf_generation);
	ocfs2_metadata_cache_purge(&new->rf_ci);

	spin_lock(&osb->osb_lock);
	tree = ocfs2_find_refcount_tree(osb, rf_blkno);
	if (tree)
		goto out;

	ocfs2_insert_refcount_tree(osb, new);

	tree = new;
	new = NULL;

out:
	*ret_tree = tree;

	osb->osb_ref_tree_lru = tree;

	spin_unlock(&osb->osb_lock);

	if (new)
		ocfs2_free_refcount_tree(new);

	brelse(ref_root_bh);
	return ret;
}

static int ocfs2_get_refcount_block(struct inode *inode, u64 *ref_blkno)
{
	int ret;
	struct buffer_head *di_bh = NULL;
	struct ocfs2_dinode *di;

	ret = ocfs2_read_inode_block(inode, &di_bh);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));

	di = (struct ocfs2_dinode *)di_bh->b_data;
	*ref_blkno = le64_to_cpu(di->i_refcount_loc);
	brelse(di_bh);
out:
	return ret;
}

static int __ocfs2_lock_refcount_tree(struct ocfs2_super *osb,
				      struct ocfs2_refcount_tree *tree, int rw)
{
	int ret;

	ret = ocfs2_refcount_lock(tree, rw);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	if (rw)
		down_write(&tree->rf_sem);
	else
		down_read(&tree->rf_sem);

out:
	return ret;
}
/*
 * Lock the refcount tree pointed to by ref_blkno and return the tree.
 * In most cases, we lock the tree and then read the refcount block.
 * So read it here if the caller really needs it.
 *
 * If the tree has been re-created by another node, free the
 * old one and re-create it.
 */
int ocfs2_lock_refcount_tree(struct ocfs2_super *osb,
			     u64 ref_blkno, int rw,
			     struct ocfs2_refcount_tree **ret_tree,
			     struct buffer_head **ref_bh)
{
	int ret, delete_tree = 0;
	struct ocfs2_refcount_tree *tree = NULL;
	struct buffer_head *ref_root_bh = NULL;
	struct ocfs2_refcount_block *rb;

again:
	ret = ocfs2_get_refcount_tree(osb, ref_blkno, &tree);
	if (ret) {
		mlog_errno(ret);
		return ret;
	}

	ocfs2_refcount_tree_get(tree);

	ret = __ocfs2_lock_refcount_tree(osb, tree, rw);
	if (ret) {
		mlog_errno(ret);
		ocfs2_refcount_tree_put(tree);
		goto out;
	}

	ret = ocfs2_read_refcount_block(&tree->rf_ci, tree->rf_blkno,
					&ref_root_bh);
	if (ret) {
		mlog_errno(ret);
		ocfs2_unlock_refcount_tree(osb, tree, rw);
		ocfs2_refcount_tree_put(tree);
		goto out;
	}

	rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
	/*
	 * If the refcount block has been freed and re-created, we may need
	 * to recreate the refcount tree also.
	 *
	 * Here we just remove the tree from the rb-tree, and the last
	 * kref holder will unlock and delete this refcount_tree.
	 * Then we goto "again" and ocfs2_get_refcount_tree will create
	 * the new refcount tree for us.
	 */
	if (tree->rf_generation != le32_to_cpu(rb->rf_generation)) {
		if (!tree->rf_removed) {
			ocfs2_erase_refcount_tree_from_list(osb, tree);
			tree->rf_removed = 1;
			delete_tree = 1;
		}

		ocfs2_unlock_refcount_tree(osb, tree, rw);
		/*
		 * We get an extra reference when we create the refcount
		 * tree, so another put will destroy it.
		 */
		if (delete_tree)
			ocfs2_refcount_tree_put(tree);
		brelse(ref_root_bh);
		ref_root_bh = NULL;
		goto again;
	}

	*ret_tree = tree;
	if (ref_bh) {
		*ref_bh = ref_root_bh;
		ref_root_bh = NULL;
	}
out:
	brelse(ref_root_bh);
	return ret;
}

int ocfs2_lock_refcount_tree_by_inode(struct inode *inode, int rw,
				      struct ocfs2_refcount_tree **ret_tree,
				      struct buffer_head **ref_bh)
{
	int ret;
	u64 ref_blkno;

	ret = ocfs2_get_refcount_block(inode, &ref_blkno);
	if (ret) {
		mlog_errno(ret);
		return ret;
	}

	return ocfs2_lock_refcount_tree(OCFS2_SB(inode->i_sb), ref_blkno,
					rw, ret_tree, ref_bh);
}

void ocfs2_unlock_refcount_tree(struct ocfs2_super *osb,
				struct ocfs2_refcount_tree *tree, int rw)
{
	if (rw)
		up_write(&tree->rf_sem);
	else
		up_read(&tree->rf_sem);

	ocfs2_refcount_unlock(tree, rw);
	ocfs2_refcount_tree_put(tree);
}
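
/*
 * Walk the rb-tree and free every cached refcount tree; used when
 * tearing down the superblock, when no tree can still be in use.
 */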
void ocfs2_purge_refcount_trees(struct ocfs2_super *osb)
{
	struct rb_node *node;
	struct ocfs2_refcount_tree *tree;
	struct rb_root *root = &osb->osb_rf_lock_tree;

	while ((node = rb_last(root)) != NULL) {
		tree = rb_entry(node, struct ocfs2_refcount_tree, rf_node);

		mlog(0, "Purge tree %llu\n",
		     (unsigned long long) tree->rf_blkno);

		rb_erase(&tree->rf_node, root);
		ocfs2_free_refcount_tree(tree);
	}
}

/*
 * Create a refcount tree for an inode.
 * The caller must already hold the inode lock.
 */
static int ocfs2_create_refcount_tree(struct inode *inode,
				      struct buffer_head *di_bh)
{
	int ret;
	handle_t *handle = NULL;
	struct ocfs2_alloc_context *meta_ac = NULL;
	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
	struct ocfs2_inode_info *oi = OCFS2_I(inode);
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	struct buffer_head *new_bh = NULL;
	struct ocfs2_refcount_block *rb;
	struct ocfs2_refcount_tree *new_tree = NULL, *tree = NULL;
	u16 suballoc_bit_start;
	u32 num_got;
	u64 first_blkno;

	BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL);

	mlog(0, "create tree for inode %lu\n", inode->i_ino);

	ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	handle = ocfs2_start_trans(osb, OCFS2_REFCOUNT_TREE_CREATE_CREDITS);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
				      OCFS2_JOURNAL_ACCESS_WRITE);
	if (ret) {
		mlog_errno(ret);
		goto out_commit;
	}

	ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1,
				   &suballoc_bit_start, &num_got,
				   &first_blkno);
	if (ret) {
		mlog_errno(ret);
		goto out_commit;
	}

	new_tree = ocfs2_allocate_refcount_tree(osb, first_blkno);
	if (!new_tree) {
		ret = -ENOMEM;
		mlog_errno(ret);
		goto out_commit;
	}

	new_bh = sb_getblk(inode->i_sb, first_blkno);
	if (new_bh == NULL) {
		ret = -EIO;
		mlog_errno(ret);
		goto out_commit;
	}
	ocfs2_set_new_buffer_uptodate(&new_tree->rf_ci, new_bh);

	ret = ocfs2_journal_access_rb(handle, &new_tree->rf_ci, new_bh,
				      OCFS2_JOURNAL_ACCESS_CREATE);
	if (ret) {
		mlog_errno(ret);
		goto out_commit;
	}

	/* Initialize ocfs2_refcount_block. */
	rb = (struct ocfs2_refcount_block *)new_bh->b_data;
	memset(rb, 0, inode->i_sb->s_blocksize);
	strcpy((void *)rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE);
	rb->rf_suballoc_slot = cpu_to_le16(osb->slot_num);
	rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
	rb->rf_fs_generation = cpu_to_le32(osb->fs_generation);
	rb->rf_blkno = cpu_to_le64(first_blkno);
	rb->rf_count = cpu_to_le32(1);
	rb->rf_records.rl_count =
			cpu_to_le16(ocfs2_refcount_recs_per_rb(osb->sb));
	spin_lock(&osb->osb_lock);
	rb->rf_generation = osb->s_next_generation++;
	spin_unlock(&osb->osb_lock);

	ocfs2_journal_dirty(handle, new_bh);

	spin_lock(&oi->ip_lock);
	oi->ip_dyn_features |= OCFS2_HAS_REFCOUNT_FL;
	di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
	di->i_refcount_loc = cpu_to_le64(first_blkno);
	spin_unlock(&oi->ip_lock);

	mlog(0, "created tree for inode %lu, refblock %llu\n",
	     inode->i_ino, (unsigned long long)first_blkno);

	ocfs2_journal_dirty(handle, di_bh);

	/*
	 * We have to init the tree lock here since it will use
	 * the generation number to create it.
	 */
	new_tree->rf_generation = le32_to_cpu(rb->rf_generation);
	ocfs2_init_refcount_tree_lock(osb, new_tree, first_blkno,
				      new_tree->rf_generation);

	spin_lock(&osb->osb_lock);
	tree = ocfs2_find_refcount_tree(osb, first_blkno);

	/*
	 * We've just created a new refcount tree in this block.  If
	 * we found a refcount tree on the ocfs2_super, it must be
	 * one we just deleted.  We free the old tree before
	 * inserting the new tree.
	 */
	BUG_ON(tree && tree->rf_generation == new_tree->rf_generation);
	if (tree)
		ocfs2_erase_refcount_tree_from_list_no_lock(osb, tree);
	ocfs2_insert_refcount_tree(osb, new_tree);
	spin_unlock(&osb->osb_lock);
	new_tree = NULL;
	if (tree)
		ocfs2_refcount_tree_put(tree);

out_commit:
	ocfs2_commit_trans(osb, handle);

out:
	if (new_tree) {
		ocfs2_metadata_cache_exit(&new_tree->rf_ci);
		kfree(new_tree);
	}

	brelse(new_bh);
	if (meta_ac)
		ocfs2_free_alloc_context(meta_ac);

	return ret;
}
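
/*
 * Point an inode at an already-existing refcount tree rooted at
 * refcount_loc: bump rf_count in the root block and record the
 * location in the dinode, e.g. when an inode shares the tree of the
 * inode it was reflinked from.
 */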
static int ocfs2_set_refcount_tree(struct inode *inode,
				   struct buffer_head *di_bh,
				   u64 refcount_loc)
{
	int ret;
	handle_t *handle = NULL;
	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
	struct ocfs2_inode_info *oi = OCFS2_I(inode);
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	struct buffer_head *ref_root_bh = NULL;
	struct ocfs2_refcount_block *rb;
	struct ocfs2_refcount_tree *ref_tree;

	BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL);

	ret = ocfs2_lock_refcount_tree(osb, refcount_loc, 1,
				       &ref_tree, &ref_root_bh);
	if (ret) {
		mlog_errno(ret);
		return ret;
	}

	handle = ocfs2_start_trans(osb, OCFS2_REFCOUNT_TREE_SET_CREDITS);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
				      OCFS2_JOURNAL_ACCESS_WRITE);
	if (ret) {
		mlog_errno(ret);
		goto out_commit;
	}

	ret = ocfs2_journal_access_rb(handle, &ref_tree->rf_ci, ref_root_bh,
				      OCFS2_JOURNAL_ACCESS_WRITE);
	if (ret) {
		mlog_errno(ret);
		goto out_commit;
	}

	rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
	le32_add_cpu(&rb->rf_count, 1);
	ocfs2_journal_dirty(handle, ref_root_bh);

	spin_lock(&oi->ip_lock);
	oi->ip_dyn_features |= OCFS2_HAS_REFCOUNT_FL;
	di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
	di->i_refcount_loc = cpu_to_le64(refcount_loc);
	spin_unlock(&oi->ip_lock);
	ocfs2_journal_dirty(handle, di_bh);

out_commit:
	ocfs2_commit_trans(osb, handle);
out:
	ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
	brelse(ref_root_bh);

	return ret;
}
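
/*
 * Detach an inode from its refcount tree.  Drop rf_count in the root
 * block and, if we were the last user, free the root block back to
 * the suballocator and remove the in-memory tree.
 */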
int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh)
{
	int ret, delete_tree = 0;
	handle_t *handle = NULL;
	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
	struct ocfs2_inode_info *oi = OCFS2_I(inode);
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	struct ocfs2_refcount_block *rb;
	struct inode *alloc_inode = NULL;
	struct buffer_head *alloc_bh = NULL;
	struct buffer_head *blk_bh = NULL;
	struct ocfs2_refcount_tree *ref_tree;
	int credits = OCFS2_REFCOUNT_TREE_REMOVE_CREDITS;
	u64 blk = 0, bg_blkno = 0, ref_blkno = le64_to_cpu(di->i_refcount_loc);
	u16 bit = 0;

	if (!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL))
		return 0;

	BUG_ON(!ref_blkno);
	ret = ocfs2_lock_refcount_tree(osb, ref_blkno, 1, &ref_tree, &blk_bh);
	if (ret) {
		mlog_errno(ret);
		return ret;
	}

	rb = (struct ocfs2_refcount_block *)blk_bh->b_data;

	/*
	 * If we are the last user, we need to free the block.
	 * So lock the allocator ahead.
	 */
	if (le32_to_cpu(rb->rf_count) == 1) {
		blk = le64_to_cpu(rb->rf_blkno);
		bit = le16_to_cpu(rb->rf_suballoc_bit);
		bg_blkno = ocfs2_which_suballoc_group(blk, bit);

		alloc_inode = ocfs2_get_system_file_inode(osb,
					EXTENT_ALLOC_SYSTEM_INODE,
					le16_to_cpu(rb->rf_suballoc_slot));
		if (!alloc_inode) {
			ret = -ENOMEM;
			mlog_errno(ret);
			goto out;
		}
		mutex_lock(&alloc_inode->i_mutex);

		ret = ocfs2_inode_lock(alloc_inode, &alloc_bh, 1);
		if (ret) {
			mlog_errno(ret);
			goto out_mutex;
		}

		credits += OCFS2_SUBALLOC_FREE;
	}

	handle = ocfs2_start_trans(osb, credits);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		mlog_errno(ret);
		goto out_unlock;
	}

	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
				      OCFS2_JOURNAL_ACCESS_WRITE);
	if (ret) {
		mlog_errno(ret);
		goto out_commit;
	}

	ret = ocfs2_journal_access_rb(handle, &ref_tree->rf_ci, blk_bh,
				      OCFS2_JOURNAL_ACCESS_WRITE);
	if (ret) {
		mlog_errno(ret);
		goto out_commit;
	}

	spin_lock(&oi->ip_lock);
	oi->ip_dyn_features &= ~OCFS2_HAS_REFCOUNT_FL;
	di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
	di->i_refcount_loc = 0;
	spin_unlock(&oi->ip_lock);
	ocfs2_journal_dirty(handle, di_bh);

	le32_add_cpu(&rb->rf_count, -1);
	ocfs2_journal_dirty(handle, blk_bh);

	if (!rb->rf_count) {
		delete_tree = 1;
		ocfs2_erase_refcount_tree_from_list(osb, ref_tree);
		ret = ocfs2_free_suballoc_bits(handle, alloc_inode,
					       alloc_bh, bit, bg_blkno, 1);
		if (ret)
			mlog_errno(ret);
	}

out_commit:
	ocfs2_commit_trans(osb, handle);
out_unlock:
	if (alloc_inode) {
		ocfs2_inode_unlock(alloc_inode, 1);
		brelse(alloc_bh);
	}
out_mutex:
	if (alloc_inode) {
		mutex_unlock(&alloc_inode->i_mutex);
		iput(alloc_inode);
	}
out:
	ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
	if (delete_tree)
		ocfs2_refcount_tree_put(ref_tree);
	brelse(blk_bh);

	return ret;
}
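
/*
 * Scan the record list of one refcount block for the record covering
 * cpos.  On return *index is the slot we stopped at; if no record
 * covers cpos, *ret_rec is faked up as a hole starting at cpos (see
 * the comment above ocfs2_get_refcount_rec below).
 */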
static void ocfs2_find_refcount_rec_in_rl(struct ocfs2_caching_info *ci,
					  struct buffer_head *ref_leaf_bh,
					  u64 cpos, unsigned int len,
					  struct ocfs2_refcount_rec *ret_rec,
					  int *index)
{
	int i = 0;
	struct ocfs2_refcount_block *rb =
		(struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
	struct ocfs2_refcount_rec *rec = NULL;

	for (; i < le16_to_cpu(rb->rf_records.rl_used); i++) {
		rec = &rb->rf_records.rl_recs[i];

		if (le64_to_cpu(rec->r_cpos) +
		    le32_to_cpu(rec->r_clusters) <= cpos)
			continue;
		else if (le64_to_cpu(rec->r_cpos) > cpos)
			break;

		/* ok, cpos falls in this rec. Just return. */
		if (ret_rec)
			*ret_rec = *rec;
		goto out;
	}

	if (ret_rec) {
		/* We hit a hole here, so fake the rec. */
		ret_rec->r_cpos = cpu_to_le64(cpos);
		ret_rec->r_refcount = 0;
		if (i < le16_to_cpu(rb->rf_records.rl_used) &&
		    le64_to_cpu(rec->r_cpos) < cpos + len)
			ret_rec->r_clusters =
				cpu_to_le32(le64_to_cpu(rec->r_cpos) - cpos);
		else
			ret_rec->r_clusters = cpu_to_le32(len);
	}

out:
	*index = i;
}
/*
 * Given a cpos and len, try to find the refcount record which contains cpos.
 * 1. If cpos can be found in one refcount record, return the record.
 * 2. If cpos can't be found, return a fake record which starts at cpos
 *    and ends at a value between cpos+len and the start of the next record.
 *    This fake record has r_refcount = 0.
 */
static int ocfs2_get_refcount_rec(struct ocfs2_caching_info *ci,
				  struct buffer_head *ref_root_bh,
				  u64 cpos, unsigned int len,
				  struct ocfs2_refcount_rec *ret_rec,
				  int *index,
				  struct buffer_head **ret_bh)
{
	int ret = 0, i, found;
	u32 low_cpos;
	struct ocfs2_extent_list *el;
	struct ocfs2_extent_rec *tmp, *rec = NULL;
	struct ocfs2_extent_block *eb;
	struct buffer_head *eb_bh = NULL, *ref_leaf_bh = NULL;
	struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
	struct ocfs2_refcount_block *rb =
			(struct ocfs2_refcount_block *)ref_root_bh->b_data;

	if (!(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL)) {
		ocfs2_find_refcount_rec_in_rl(ci, ref_root_bh, cpos, len,
					      ret_rec, index);
		*ret_bh = ref_root_bh;
		get_bh(ref_root_bh);
		return 0;
	}

	el = &rb->rf_list;
	low_cpos = cpos & OCFS2_32BIT_POS_MASK;

	if (el->l_tree_depth) {
		ret = ocfs2_find_leaf(ci, el, low_cpos, &eb_bh);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		eb = (struct ocfs2_extent_block *) eb_bh->b_data;
		el = &eb->h_list;

		if (el->l_tree_depth) {
			ocfs2_error(sb,
				    "refcount tree %llu has non-zero tree "
				    "depth in leaf btree block %llu\n",
				    (unsigned long long)ocfs2_metadata_cache_owner(ci),
				    (unsigned long long)eb_bh->b_blocknr);
			ret = -EROFS;
			goto out;
		}
	}

	found = 0;
	for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) {
		rec = &el->l_recs[i];

		if (le32_to_cpu(rec->e_cpos) <= low_cpos) {
			found = 1;
			break;
		}
	}

	/* adjust len when we have an ocfs2_extent_rec after it. */
	if (found && i < le16_to_cpu(el->l_next_free_rec) - 1) {
		tmp = &el->l_recs[i+1];

		if (le32_to_cpu(tmp->e_cpos) < cpos + len)
			len = le32_to_cpu(tmp->e_cpos) - cpos;
	}

	ret = ocfs2_read_refcount_block(ci, le64_to_cpu(rec->e_blkno),
					&ref_leaf_bh);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	ocfs2_find_refcount_rec_in_rl(ci, ref_leaf_bh, cpos, len,
				      ret_rec, index);
	*ret_bh = ref_leaf_bh;
out:
	brelse(eb_bh);
	return ret;
}
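
/*
 * Contiguity of a refcount record with its neighbors: two records can
 * be merged when they carry the same r_refcount and their cluster
 * ranges touch.
 */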
enum ocfs2_ref_rec_contig {
	REF_CONTIG_NONE = 0,
	REF_CONTIG_LEFT,
	REF_CONTIG_RIGHT,
	REF_CONTIG_LEFTRIGHT,
};

static enum ocfs2_ref_rec_contig
	ocfs2_refcount_rec_adjacent(struct ocfs2_refcount_block *rb,
				    int index)
{
	if ((rb->rf_records.rl_recs[index].r_refcount ==
	    rb->rf_records.rl_recs[index + 1].r_refcount) &&
	   (le64_to_cpu(rb->rf_records.rl_recs[index].r_cpos) +
	    le32_to_cpu(rb->rf_records.rl_recs[index].r_clusters) ==
	    le64_to_cpu(rb->rf_records.rl_recs[index + 1].r_cpos)))
		return REF_CONTIG_RIGHT;

	return REF_CONTIG_NONE;
}

static enum ocfs2_ref_rec_contig
	ocfs2_refcount_rec_contig(struct ocfs2_refcount_block *rb,
				  int index)
{
	enum ocfs2_ref_rec_contig ret = REF_CONTIG_NONE;

	if (index < le16_to_cpu(rb->rf_records.rl_used) - 1)
		ret = ocfs2_refcount_rec_adjacent(rb, index);

	if (index > 0) {
		enum ocfs2_ref_rec_contig tmp;

		tmp = ocfs2_refcount_rec_adjacent(rb, index - 1);

		if (tmp == REF_CONTIG_RIGHT) {
			if (ret == REF_CONTIG_RIGHT)
				ret = REF_CONTIG_LEFTRIGHT;
			else
				ret = REF_CONTIG_LEFT;
		}
	}

	return ret;
}
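
/*
 * Fold record index+1 into record index: extend r_clusters of the
 * left record and shift the rest of the list down by one.  Both
 * records must carry the same refcount.
 */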
static void ocfs2_rotate_refcount_rec_left(struct ocfs2_refcount_block *rb,
					   int index)
{
	BUG_ON(rb->rf_records.rl_recs[index].r_refcount !=
	       rb->rf_records.rl_recs[index+1].r_refcount);

	le32_add_cpu(&rb->rf_records.rl_recs[index].r_clusters,
		     le32_to_cpu(rb->rf_records.rl_recs[index+1].r_clusters));

	if (index < le16_to_cpu(rb->rf_records.rl_used) - 2)
		memmove(&rb->rf_records.rl_recs[index + 1],
			&rb->rf_records.rl_recs[index + 2],
			sizeof(struct ocfs2_refcount_rec) *
			(le16_to_cpu(rb->rf_records.rl_used) - index - 2));

	memset(&rb->rf_records.rl_recs[le16_to_cpu(rb->rf_records.rl_used) - 1],
	       0, sizeof(struct ocfs2_refcount_rec));
	le16_add_cpu(&rb->rf_records.rl_used, -1);
}

/*
 * Merge the refcount rec if we are contiguous with the adjacent recs.
 */
static void ocfs2_refcount_rec_merge(struct ocfs2_refcount_block *rb,
				     int index)
{
	enum ocfs2_ref_rec_contig contig =
				ocfs2_refcount_rec_contig(rb, index);

	if (contig == REF_CONTIG_NONE)
		return;

	if (contig == REF_CONTIG_LEFT || contig == REF_CONTIG_LEFTRIGHT) {
		BUG_ON(index == 0);
		index--;
	}

	ocfs2_rotate_refcount_rec_left(rb, index);

	if (contig == REF_CONTIG_LEFTRIGHT)
		ocfs2_rotate_refcount_rec_left(rb, index);
}

/*
 * Change the refcount indexed by "index" in ref_bh.
 * If refcount reaches 0, remove it.
 */
static int ocfs2_change_refcount_rec(handle_t *handle,
				     struct ocfs2_caching_info *ci,
				     struct buffer_head *ref_leaf_bh,
				     int index, int change)
{
	int ret;
	struct ocfs2_refcount_block *rb =
			(struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
	struct ocfs2_refcount_list *rl = &rb->rf_records;
	struct ocfs2_refcount_rec *rec = &rl->rl_recs[index];

	ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
				      OCFS2_JOURNAL_ACCESS_WRITE);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	mlog(0, "change index %d, old count %u, change %d\n", index,
	     le32_to_cpu(rec->r_refcount), change);
	le32_add_cpu(&rec->r_refcount, change);

	if (!rec->r_refcount) {
		if (index != le16_to_cpu(rl->rl_used) - 1) {
			memmove(rec, rec + 1,
				(le16_to_cpu(rl->rl_used) - index - 1) *
				sizeof(struct ocfs2_refcount_rec));
			memset(&rl->rl_recs[le16_to_cpu(rl->rl_used) - 1],
			       0, sizeof(struct ocfs2_refcount_rec));
		}

		le16_add_cpu(&rl->rl_used, -1);
	} else
		ocfs2_refcount_rec_merge(rb, index);

	ret = ocfs2_journal_dirty(handle, ref_leaf_bh);
	if (ret)
		mlog_errno(ret);
out:
	return ret;
}
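
/*
 * Convert an inline refcount root into a one-level b-tree: allocate a
 * new leaf block, copy the root's record list into it, then rewrite
 * the root as an extent list with the new leaf as its only child.
 */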
static int ocfs2_expand_inline_ref_root(handle_t *handle,
					struct ocfs2_caching_info *ci,
					struct buffer_head *ref_root_bh,
					struct buffer_head **ref_leaf_bh,
					struct ocfs2_alloc_context *meta_ac)
{
	int ret;
	u16 suballoc_bit_start;
	u32 num_got;
	u64 blkno;
	struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
	struct buffer_head *new_bh = NULL;
	struct ocfs2_refcount_block *new_rb;
	struct ocfs2_refcount_block *root_rb =
			(struct ocfs2_refcount_block *)ref_root_bh->b_data;

	ret = ocfs2_journal_access_rb(handle, ci, ref_root_bh,
				      OCFS2_JOURNAL_ACCESS_WRITE);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_claim_metadata(OCFS2_SB(sb), handle, meta_ac, 1,
				   &suballoc_bit_start, &num_got,
				   &blkno);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	new_bh = sb_getblk(sb, blkno);
	if (new_bh == NULL) {
		ret = -EIO;
		mlog_errno(ret);
		goto out;
	}
	ocfs2_set_new_buffer_uptodate(ci, new_bh);

	ret = ocfs2_journal_access_rb(handle, ci, new_bh,
				      OCFS2_JOURNAL_ACCESS_CREATE);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	/*
	 * Initialize ocfs2_refcount_block.
	 * It should contain the same information as the old root,
	 * so just memcpy it and change the corresponding fields.
	 */
	memcpy(new_bh->b_data, ref_root_bh->b_data, sb->s_blocksize);

	new_rb = (struct ocfs2_refcount_block *)new_bh->b_data;
	new_rb->rf_suballoc_slot = cpu_to_le16(OCFS2_SB(sb)->slot_num);
	new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
	new_rb->rf_blkno = cpu_to_le64(blkno);
	new_rb->rf_cpos = cpu_to_le32(0);
	new_rb->rf_parent = cpu_to_le64(ref_root_bh->b_blocknr);
	new_rb->rf_flags = cpu_to_le32(OCFS2_REFCOUNT_LEAF_FL);
	ocfs2_journal_dirty(handle, new_bh);

	/* Now change the root. */
	memset(&root_rb->rf_list, 0, sb->s_blocksize -
	       offsetof(struct ocfs2_refcount_block, rf_list));
	root_rb->rf_list.l_count = cpu_to_le16(ocfs2_extent_recs_per_rb(sb));
	root_rb->rf_clusters = cpu_to_le32(1);
	root_rb->rf_list.l_next_free_rec = cpu_to_le16(1);
	root_rb->rf_list.l_recs[0].e_blkno = cpu_to_le64(blkno);
	root_rb->rf_list.l_recs[0].e_leaf_clusters = cpu_to_le16(1);
	root_rb->rf_flags = cpu_to_le32(OCFS2_REFCOUNT_TREE_FL);

	ocfs2_journal_dirty(handle, ref_root_bh);

	mlog(0, "new leaf block %llu, used %u\n", (unsigned long long)blkno,
	     le16_to_cpu(new_rb->rf_records.rl_used));

	*ref_leaf_bh = new_bh;
	new_bh = NULL;
out:
	brelse(new_bh);
	return ret;
}
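
/*
 * Helpers for splitting a full leaf: two records don't intersect when
 * one ends (by low 32-bit cpos) at or before the other begins.  The
 * comparators below feed sort() when ordering records by low 32-bit
 * or by full 64-bit cpos.
 */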
static int ocfs2_refcount_rec_no_intersect(struct ocfs2_refcount_rec *prev,
					   struct ocfs2_refcount_rec *next)
{
	if (ocfs2_get_ref_rec_low_cpos(prev) + le32_to_cpu(prev->r_clusters) <=
		ocfs2_get_ref_rec_low_cpos(next))
		return 1;

	return 0;
}

static int cmp_refcount_rec_by_low_cpos(const void *a, const void *b)
{
	const struct ocfs2_refcount_rec *l = a, *r = b;
	u32 l_cpos = ocfs2_get_ref_rec_low_cpos(l);
	u32 r_cpos = ocfs2_get_ref_rec_low_cpos(r);

	if (l_cpos > r_cpos)
		return 1;
	if (l_cpos < r_cpos)
		return -1;
	return 0;
}

static int cmp_refcount_rec_by_cpos(const void *a, const void *b)
{
	const struct ocfs2_refcount_rec *l = a, *r = b;
	u64 l_cpos = le64_to_cpu(l->r_cpos);
	u64 r_cpos = le64_to_cpu(r->r_cpos);

	if (l_cpos > r_cpos)
		return 1;
	if (l_cpos < r_cpos)
		return -1;
	return 0;
}

static void swap_refcount_rec(void *a, void *b, int size)
{
	struct ocfs2_refcount_rec *l = a, *r = b, tmp;

	tmp = *(struct ocfs2_refcount_rec *)l;
	*(struct ocfs2_refcount_rec *)l =
			*(struct ocfs2_refcount_rec *)r;
	*(struct ocfs2_refcount_rec *)r = tmp;
}
/*
 * The refcount recs are ordered by their 64-bit cpos,
 * but we will use the low 32 bits as the e_cpos in the b-tree.
 * So we need to make sure that this pos isn't intersected with others.
 *
 * Note: The refcount block is already sorted by low 32-bit cpos,
 * so just try the middle pos first, and we will exit when we find
 * a good position.
 */
static int ocfs2_find_refcount_split_pos(struct ocfs2_refcount_list *rl,
					 u32 *split_pos, int *split_index)
{
	int num_used = le16_to_cpu(rl->rl_used);
	int delta, middle = num_used / 2;

	for (delta = 0; delta < middle; delta++) {
		/* Let's check delta earlier than middle */
		if (ocfs2_refcount_rec_no_intersect(
					&rl->rl_recs[middle - delta - 1],
					&rl->rl_recs[middle - delta])) {
			*split_index = middle - delta;
			break;
		}

		/* For even counts, don't walk off the end */
		if ((middle + delta + 1) == num_used)
			continue;

		/* Now try delta past middle */
		if (ocfs2_refcount_rec_no_intersect(
					&rl->rl_recs[middle + delta],
					&rl->rl_recs[middle + delta + 1])) {
			*split_index = middle + delta + 1;
			break;
		}
	}

	if (delta >= middle)
		return -ENOSPC;

	*split_pos = ocfs2_get_ref_rec_low_cpos(&rl->rl_recs[*split_index]);
	return 0;
}
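
/*
 * Move roughly half of the records from ref_leaf_bh into new_bh.
 * *split_cpos receives the low 32-bit cpos of the first moved record,
 * which becomes the e_cpos of the new leaf in the b-tree.
 */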
static int ocfs2_divide_leaf_refcount_block(struct buffer_head *ref_leaf_bh,
					    struct buffer_head *new_bh,
					    u32 *split_cpos)
{
	int split_index = 0, num_moved, ret;
	u32 cpos = 0;
	struct ocfs2_refcount_block *rb =
			(struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
	struct ocfs2_refcount_list *rl = &rb->rf_records;
	struct ocfs2_refcount_block *new_rb =
			(struct ocfs2_refcount_block *)new_bh->b_data;
	struct ocfs2_refcount_list *new_rl = &new_rb->rf_records;

	mlog(0, "split old leaf refcount block %llu, count = %u, used = %u\n",
	     (unsigned long long)ref_leaf_bh->b_blocknr,
	     le16_to_cpu(rl->rl_count), le16_to_cpu(rl->rl_used));

	/*
	 * XXX: Improvement later.
	 * If we know all the high 32 bit cpos is the same, no need to sort.
	 *
	 * In order to make the whole process safe, we do:
	 * 1. sort the entries by their low 32 bit cpos first so that we can
	 *    find the split cpos easily.
	 * 2. call ocfs2_insert_extent to insert the new refcount block.
	 * 3. move the refcount rec to the new block.
	 * 4. sort the entries by their 64 bit cpos.
	 * 5. dirty the new_rb and rb.
	 */
	sort(&rl->rl_recs, le16_to_cpu(rl->rl_used),
	     sizeof(struct ocfs2_refcount_rec),
	     cmp_refcount_rec_by_low_cpos, swap_refcount_rec);

	ret = ocfs2_find_refcount_split_pos(rl, &cpos, &split_index);
	if (ret) {
		mlog_errno(ret);
		return ret;
	}

	new_rb->rf_cpos = cpu_to_le32(cpos);

	/* move refcount records starting from split_index to the new block. */
	num_moved = le16_to_cpu(rl->rl_used) - split_index;
	memcpy(new_rl->rl_recs, &rl->rl_recs[split_index],
	       num_moved * sizeof(struct ocfs2_refcount_rec));

	/* ok, remove the entries we just moved over to the other block. */
	memset(&rl->rl_recs[split_index], 0,
	       num_moved * sizeof(struct ocfs2_refcount_rec));

	/* change old and new rl_used accordingly. */
	le16_add_cpu(&rl->rl_used, -num_moved);
	new_rl->rl_used = cpu_to_le16(num_moved);

	sort(&rl->rl_recs, le16_to_cpu(rl->rl_used),
	     sizeof(struct ocfs2_refcount_rec),
	     cmp_refcount_rec_by_cpos, swap_refcount_rec);

	sort(&new_rl->rl_recs, le16_to_cpu(new_rl->rl_used),
	     sizeof(struct ocfs2_refcount_rec),
	     cmp_refcount_rec_by_cpos, swap_refcount_rec);

	*split_cpos = cpos;
	return 0;
}
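
/*
 * Grow the refcount b-tree by one leaf: allocate and initialize a new
 * leaf block, move half of the records from ref_leaf_bh into it, then
 * insert it into the extent tree at the split cpos.
 */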
static int ocfs2_new_leaf_refcount_block(handle_t *handle,
					 struct ocfs2_caching_info *ci,
					 struct buffer_head *ref_root_bh,
					 struct buffer_head *ref_leaf_bh,
					 struct ocfs2_alloc_context *meta_ac)
{
	int ret;
	u16 suballoc_bit_start;
	u32 num_got, new_cpos;
	u64 blkno;
	struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
	struct ocfs2_refcount_block *root_rb =
			(struct ocfs2_refcount_block *)ref_root_bh->b_data;
	struct buffer_head *new_bh = NULL;
	struct ocfs2_refcount_block *new_rb;
	struct ocfs2_extent_tree ref_et;

	BUG_ON(!(le32_to_cpu(root_rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL));

	ret = ocfs2_journal_access_rb(handle, ci, ref_root_bh,
				      OCFS2_JOURNAL_ACCESS_WRITE);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
				      OCFS2_JOURNAL_ACCESS_WRITE);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_claim_metadata(OCFS2_SB(sb), handle, meta_ac, 1,
				   &suballoc_bit_start, &num_got,
				   &blkno);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	new_bh = sb_getblk(sb, blkno);
	if (new_bh == NULL) {
		ret = -EIO;
		mlog_errno(ret);
		goto out;
	}
	ocfs2_set_new_buffer_uptodate(ci, new_bh);

	ret = ocfs2_journal_access_rb(handle, ci, new_bh,
				      OCFS2_JOURNAL_ACCESS_CREATE);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	/* Initialize ocfs2_refcount_block. */
	new_rb = (struct ocfs2_refcount_block *)new_bh->b_data;
	memset(new_rb, 0, sb->s_blocksize);
	strcpy((void *)new_rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE);
	new_rb->rf_suballoc_slot = cpu_to_le16(OCFS2_SB(sb)->slot_num);
	new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
	new_rb->rf_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
	new_rb->rf_blkno = cpu_to_le64(blkno);
	new_rb->rf_parent = cpu_to_le64(ref_root_bh->b_blocknr);
	new_rb->rf_flags = cpu_to_le32(OCFS2_REFCOUNT_LEAF_FL);
	new_rb->rf_records.rl_count =
				cpu_to_le16(ocfs2_refcount_recs_per_rb(sb));
	new_rb->rf_generation = root_rb->rf_generation;

	ret = ocfs2_divide_leaf_refcount_block(ref_leaf_bh, new_bh, &new_cpos);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	ocfs2_journal_dirty(handle, ref_leaf_bh);
	ocfs2_journal_dirty(handle, new_bh);

	ocfs2_init_refcount_extent_tree(&ref_et, ci, ref_root_bh);

	mlog(0, "insert new leaf block %llu at %u\n",
	     (unsigned long long)new_bh->b_blocknr, new_cpos);

	/* Insert the new leaf block with the specific offset cpos. */
	ret = ocfs2_insert_extent(handle, &ref_et, new_cpos, new_bh->b_blocknr,
				  1, 0, meta_ac);
	if (ret)
		mlog_errno(ret);

out:
	brelse(new_bh);
	return ret;
}
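
/*
 * Make room for more refcount records.  If the root is still inline,
 * expand it to a b-tree first, then split the full leaf into two.
 */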
static int ocfs2_expand_refcount_tree(handle_t *handle,
				      struct ocfs2_caching_info *ci,
				      struct buffer_head *ref_root_bh,
				      struct buffer_head *ref_leaf_bh,
				      struct ocfs2_alloc_context *meta_ac)
{
	int ret;
	struct buffer_head *expand_bh = NULL;

	if (ref_root_bh == ref_leaf_bh) {
		/*
		 * the old root bh hasn't been expanded to a b-tree,
		 * so expand it first.
		 */
		ret = ocfs2_expand_inline_ref_root(handle, ci, ref_root_bh,
						   &expand_bh, meta_ac);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
	} else {
		expand_bh = ref_leaf_bh;
		get_bh(expand_bh);
	}

	/* Now add a new refcount block into the tree. */
	ret = ocfs2_new_leaf_refcount_block(handle, ci, ref_root_bh,
					    expand_bh, meta_ac);
	if (ret)
		mlog_errno(ret);
out:
	brelse(expand_bh);
	return ret;
}
/*
 * Adjust the extent rec in the b-tree representing ref_leaf_bh.
 *
 * Only called when we have inserted a new refcount rec at index 0
 * which means ocfs2_extent_rec.e_cpos may need some change.
 */
static int ocfs2_adjust_refcount_rec(handle_t *handle,
				     struct ocfs2_caching_info *ci,
				     struct buffer_head *ref_root_bh,
				     struct buffer_head *ref_leaf_bh,
				     struct ocfs2_refcount_rec *rec)
{
	int ret = 0, i;
	u32 new_cpos, old_cpos;
	struct ocfs2_path *path = NULL;
	struct ocfs2_extent_tree et;
	struct ocfs2_refcount_block *rb =
		(struct ocfs2_refcount_block *)ref_root_bh->b_data;
	struct ocfs2_extent_list *el;

	if (!(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL))
		goto out;

	rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
	old_cpos = le32_to_cpu(rb->rf_cpos);
	new_cpos = le64_to_cpu(rec->r_cpos) & OCFS2_32BIT_POS_MASK;
	if (old_cpos <= new_cpos)
		goto out;

	ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh);

	path = ocfs2_new_path_from_et(&et);
	if (!path) {
		ret = -ENOMEM;
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_find_path(ci, path, old_cpos);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	/*
	 * 2 more credits, one for the leaf refcount block, one for
	 * the extent block that contains the extent rec.
	 */
	ret = ocfs2_extend_trans(handle, handle->h_buffer_credits + 2);
	if (ret < 0) {
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
				      OCFS2_JOURNAL_ACCESS_WRITE);
	if (ret < 0) {
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_journal_access_eb(handle, ci, path_leaf_bh(path),
				      OCFS2_JOURNAL_ACCESS_WRITE);
	if (ret < 0) {
		mlog_errno(ret);
		goto out;
	}

	/* change the leaf extent block first. */
	el = path_leaf_el(path);

	for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++)
		if (le32_to_cpu(el->l_recs[i].e_cpos) == old_cpos)
			break;

	BUG_ON(i == le16_to_cpu(el->l_next_free_rec));

	el->l_recs[i].e_cpos = cpu_to_le32(new_cpos);

	/* change the r_cpos in the leaf block. */
	rb->rf_cpos = cpu_to_le32(new_cpos);

	ocfs2_journal_dirty(handle, path_leaf_bh(path));
	ocfs2_journal_dirty(handle, ref_leaf_bh);

out:
	ocfs2_free_path(path);
	return ret;
}
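
/*
 * Insert a new refcount record at "index" in the leaf, expanding the
 * tree first if the leaf is already full.  After the insert, try to
 * merge with the neighbors and, for index 0, fix up e_cpos in the
 * b-tree via ocfs2_adjust_refcount_rec().
 */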
static int ocfs2_insert_refcount_rec(handle_t *handle,
				     struct ocfs2_caching_info *ci,
				     struct buffer_head *ref_root_bh,
				     struct buffer_head *ref_leaf_bh,
				     struct ocfs2_refcount_rec *rec,
				     int index,
				     struct ocfs2_alloc_context *meta_ac)
{
	int ret;
	struct ocfs2_refcount_block *rb =
		(struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
	struct ocfs2_refcount_list *rf_list = &rb->rf_records;
	struct buffer_head *new_bh = NULL;

	BUG_ON(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL);

	if (rf_list->rl_used == rf_list->rl_count) {
		u64 cpos = le64_to_cpu(rec->r_cpos);
		u32 len = le32_to_cpu(rec->r_clusters);

		ret = ocfs2_expand_refcount_tree(handle, ci, ref_root_bh,
						 ref_leaf_bh, meta_ac);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
					     cpos, len, NULL, &index,
					     &new_bh);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		ref_leaf_bh = new_bh;
		rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
		rf_list = &rb->rf_records;
	}

	ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
				      OCFS2_JOURNAL_ACCESS_WRITE);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	if (index < le16_to_cpu(rf_list->rl_used))
		memmove(&rf_list->rl_recs[index + 1],
			&rf_list->rl_recs[index],
			(le16_to_cpu(rf_list->rl_used) - index) *
			 sizeof(struct ocfs2_refcount_rec));

	mlog(0, "insert refcount record start %llu, len %u, count %u "
	     "to leaf block %llu at index %d\n",
	     (unsigned long long)le64_to_cpu(rec->r_cpos),
	     le32_to_cpu(rec->r_clusters), le32_to_cpu(rec->r_refcount),
	     (unsigned long long)ref_leaf_bh->b_blocknr, index);

	rf_list->rl_recs[index] = *rec;

	le16_add_cpu(&rf_list->rl_used, 1);

	ocfs2_refcount_rec_merge(rb, index);

	ret = ocfs2_journal_dirty(handle, ref_leaf_bh);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	if (index == 0) {
		ret = ocfs2_adjust_refcount_rec(handle, ci,
						ref_root_bh,
						ref_leaf_bh, rec);
		if (ret)
			mlog_errno(ret);
	}
out:
	brelse(new_bh);
	return ret;
}

/*
 * Split the refcount_rec indexed by "index" in ref_leaf_bh.
 * This is much simpler than our b-tree code.
 * split_rec is the new refcount rec we want to insert.
 * If split_rec->r_refcount > 0, we are changing the refcount (in case we
 * increase a refcount or decrease it to non-zero).
 * If split_rec->r_refcount == 0, we are punching a hole in the current
 * refcount rec (in case we decrease a refcount to zero).
 */
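/*
 * An illustrative example with made-up numbers: for an existing rec
 * covering clusters [100, 110) with r_refcount = 2, a split_rec of
 * [103, 105) with r_refcount = 3 leaves three records behind:
 * [100, 103) count 2, [103, 105) count 3 and [105, 110) count 2 --
 * the middle-split case that needs two extra records.  By contrast, a
 * split_rec of [100, 105) with r_refcount = 0 (the record's count was
 * 1 and is being dropped to zero) punches a hole at the head by simply
 * shrinking the original record to [105, 110), needing no extra
 * record at all.
 */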
static int ocfs2_split_refcount_rec(handle_t *handle,
				    struct ocfs2_caching_info *ci,
				    struct buffer_head *ref_root_bh,
				    struct buffer_head *ref_leaf_bh,
				    struct ocfs2_refcount_rec *split_rec,
				    int index,
				    struct ocfs2_alloc_context *meta_ac,
				    struct ocfs2_cached_dealloc_ctxt *dealloc)
{
	int ret, recs_need;
	u32 len;
	struct ocfs2_refcount_block *rb =
		(struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
	struct ocfs2_refcount_list *rf_list = &rb->rf_records;
	struct ocfs2_refcount_rec *orig_rec = &rf_list->rl_recs[index];
	struct ocfs2_refcount_rec *tail_rec = NULL;
	struct buffer_head *new_bh = NULL;

	BUG_ON(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL);

	mlog(0, "original r_pos %llu, cluster %u, split %llu, cluster %u\n",
	     le64_to_cpu(orig_rec->r_cpos), le32_to_cpu(orig_rec->r_clusters),
	     le64_to_cpu(split_rec->r_cpos),
	     le32_to_cpu(split_rec->r_clusters));
	/*
	 * If we just need to split the head or tail clusters,
	 * no more recs are needed; the split alone is OK.
	 * Otherwise we need at least one new rec.
	 */
	if (!split_rec->r_refcount &&
	    (split_rec->r_cpos == orig_rec->r_cpos ||
	     le64_to_cpu(split_rec->r_cpos) +
	     le32_to_cpu(split_rec->r_clusters) ==
	     le64_to_cpu(orig_rec->r_cpos) + le32_to_cpu(orig_rec->r_clusters)))
		recs_need = 0;
	else
		recs_need = 1;
	/*
	 * We need one more rec if we split in the middle and the new rec
	 * has some refcount in it.
	 */
	if (split_rec->r_refcount &&
	    (split_rec->r_cpos != orig_rec->r_cpos &&
	     le64_to_cpu(split_rec->r_cpos) +
	     le32_to_cpu(split_rec->r_clusters) !=
	     le64_to_cpu(orig_rec->r_cpos) + le32_to_cpu(orig_rec->r_clusters)))
		recs_need++;
	/* If the leaf block doesn't have enough records, expand it. */
	if (le16_to_cpu(rf_list->rl_used) + recs_need >
	    le16_to_cpu(rf_list->rl_count)) {
		struct ocfs2_refcount_rec tmp_rec;
		u64 cpos = le64_to_cpu(orig_rec->r_cpos);
		len = le32_to_cpu(orig_rec->r_clusters);
		ret = ocfs2_expand_refcount_tree(handle, ci, ref_root_bh,
						 ref_leaf_bh, meta_ac);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		/*
		 * We have to re-get it since now cpos may be moved to
		 * another leaf block.
		 */
		ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
					     cpos, len, &tmp_rec, &index,
					     &new_bh);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		ref_leaf_bh = new_bh;
		rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
		rf_list = &rb->rf_records;
		orig_rec = &rf_list->rl_recs[index];
	}

	ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
				      OCFS2_JOURNAL_ACCESS_WRITE);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}
	/*
	 * We have calculated how many new records we need and stored
	 * the count in recs_need, so make enough room first by moving
	 * the records after "index" to the end.
	 */
	if (index != le16_to_cpu(rf_list->rl_used) - 1)
		memmove(&rf_list->rl_recs[index + 1 + recs_need],
			&rf_list->rl_recs[index + 1],
			(le16_to_cpu(rf_list->rl_used) - index - 1) *
			 sizeof(struct ocfs2_refcount_rec));

	len = (le64_to_cpu(orig_rec->r_cpos) +
	      le32_to_cpu(orig_rec->r_clusters)) -
	      (le64_to_cpu(split_rec->r_cpos) +
	      le32_to_cpu(split_rec->r_clusters));
	/*
	 * If we have "len", then we will split in the tail and move it
	 * to the end of the space we have just spared.
	 */
	if (len) {
		tail_rec = &rf_list->rl_recs[index + recs_need];

		memcpy(tail_rec, orig_rec, sizeof(struct ocfs2_refcount_rec));
		le64_add_cpu(&tail_rec->r_cpos,
			     le32_to_cpu(tail_rec->r_clusters) - len);
		tail_rec->r_clusters = cpu_to_le32(len);
	}
	/*
	 * If the split pos isn't the same as the original one, we need to
	 * split in the head.
	 *
	 * Note: we may have split_rec.r_refcount = 0, recs_need = 0 and
	 * len > 0, which means we just cut the head from the orig_rec.
	 * In that case orig_rec was already modified above, so the check
	 * on r_cpos alone is not reliable; hence the extra
	 * tail_rec != orig_rec test.
	 */
	if (split_rec->r_cpos != orig_rec->r_cpos && tail_rec != orig_rec) {
		len = le64_to_cpu(split_rec->r_cpos) -
		      le64_to_cpu(orig_rec->r_cpos);
		orig_rec->r_clusters = cpu_to_le32(len);
		index++;
	}

	le16_add_cpu(&rf_list->rl_used, recs_need);

	if (split_rec->r_refcount) {
		rf_list->rl_recs[index] = *split_rec;
		mlog(0, "insert refcount record start %llu, len %u, count %u "
		     "to leaf block %llu at index %d\n",
		     (unsigned long long)le64_to_cpu(split_rec->r_cpos),
		     le32_to_cpu(split_rec->r_clusters),
		     le32_to_cpu(split_rec->r_refcount),
		     (unsigned long long)ref_leaf_bh->b_blocknr, index);

		ocfs2_refcount_rec_merge(rb, index);
	}

	ret = ocfs2_journal_dirty(handle, ref_leaf_bh);
	if (ret)
		mlog_errno(ret);

out:
	brelse(new_bh);
	return ret;
}
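
/*
 * Increase the refcount of the clusters in [cpos, cpos + len) by one.
 * For each chunk returned by ocfs2_get_refcount_rec() we either bump
 * an exactly matching record, insert a fresh record with
 * r_refcount = 1 into a hole, or split a partially overlapping record;
 * see the three cases handled in the loop below.
 */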
static int __ocfs2_increase_refcount(handle_t *handle,
				     struct ocfs2_caching_info *ci,
				     struct buffer_head *ref_root_bh,
				     u64 cpos, u32 len,
				     struct ocfs2_alloc_context *meta_ac,
				     struct ocfs2_cached_dealloc_ctxt *dealloc)
{
	int ret = 0, index;
	struct buffer_head *ref_leaf_bh = NULL;
	struct ocfs2_refcount_rec rec;
	unsigned int set_len = 0;

	mlog(0, "Tree owner %llu, add refcount start %llu, len %u\n",
	     (unsigned long long)ocfs2_metadata_cache_owner(ci),
	     (unsigned long long)cpos, len);

	while (len) {
		ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
					     cpos, len, &rec, &index,
					     &ref_leaf_bh);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		set_len = le32_to_cpu(rec.r_clusters);
		/*
		 * Here we may meet 3 situations:
		 *
		 * 1. If we find an existing record that starts at cpos
		 *    and fits within the remaining length, we just need
		 *    to increase its r_refcount.
		 * 2. If we find a hole, just insert a rec with
		 *    r_refcount = 1.
		 * 3. If we are in the middle of one extent record,
		 *    split it.
		 */
		if (rec.r_refcount && le64_to_cpu(rec.r_cpos) == cpos &&
		    set_len <= len) {
			mlog(0, "increase refcount rec, start %llu, len %u, "
			     "count %u\n", (unsigned long long)cpos, set_len,
			     le32_to_cpu(rec.r_refcount));
			ret = ocfs2_change_refcount_rec(handle, ci,
							ref_leaf_bh, index, 1);
			if (ret) {
				mlog_errno(ret);
				goto out;
			}
		} else if (!rec.r_refcount) {
			rec.r_refcount = cpu_to_le32(1);
			mlog(0, "insert refcount rec, start %llu, len %u\n",
			     (unsigned long long)le64_to_cpu(rec.r_cpos),
			     set_len);
			ret = ocfs2_insert_refcount_rec(handle, ci, ref_root_bh,
							ref_leaf_bh,
							&rec, index, meta_ac);
			if (ret) {
				mlog_errno(ret);
				goto out;
			}
		} else {
			set_len = min((u64)(cpos + len),
				      le64_to_cpu(rec.r_cpos) + set_len) - cpos;
			rec.r_cpos = cpu_to_le64(cpos);
			rec.r_clusters = cpu_to_le32(set_len);
			le32_add_cpu(&rec.r_refcount, 1);
			mlog(0, "split refcount rec, start %llu, "
			     "len %u, count %u\n",
			     (unsigned long long)le64_to_cpu(rec.r_cpos),
			     set_len, le32_to_cpu(rec.r_refcount));
			ret = ocfs2_split_refcount_rec(handle, ci,
						       ref_root_bh, ref_leaf_bh,
						       &rec, index,
						       meta_ac, dealloc);
			if (ret) {
				mlog_errno(ret);
				goto out;
			}
		}

		cpos += set_len;
		len -= set_len;
		brelse(ref_leaf_bh);
		ref_leaf_bh = NULL;
	}

out:
	brelse(ref_leaf_bh);
	return ret;
}
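
/*
 * Remove a now-empty leaf refcount block: take its single cluster out
 * of the refcount extent tree, queue the block itself for suballocator
 * dealloc, and, if the root is left with no leaf extents at all, turn
 * the root back into an inline record block.
 */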
static int ocfs2_remove_refcount_extent(handle_t *handle,
					struct ocfs2_caching_info *ci,
					struct buffer_head *ref_root_bh,
					struct buffer_head *ref_leaf_bh,
					struct ocfs2_alloc_context *meta_ac,
					struct ocfs2_cached_dealloc_ctxt *dealloc)
{
	int ret;
	struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
	struct ocfs2_refcount_block *rb =
		(struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
	struct ocfs2_extent_tree et;

	BUG_ON(rb->rf_records.rl_used);

	ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh);
	ret = ocfs2_remove_extent(handle, &et, le32_to_cpu(rb->rf_cpos),
				  1, meta_ac, dealloc);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	ocfs2_remove_from_cache(ci, ref_leaf_bh);

	/*
	 * add the freed block to the dealloc so that it will be freed
	 * when we run dealloc.
	 */
	ret = ocfs2_cache_block_dealloc(dealloc, EXTENT_ALLOC_SYSTEM_INODE,
					le16_to_cpu(rb->rf_suballoc_slot),
					le64_to_cpu(rb->rf_blkno),
					le16_to_cpu(rb->rf_suballoc_bit));
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_journal_access_rb(handle, ci, ref_root_bh,
				      OCFS2_JOURNAL_ACCESS_WRITE);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}
	rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;

	le32_add_cpu(&rb->rf_clusters, -1);

	/*
	 * check whether we need to restore the root refcount block if
	 * there is no leaf extent block at all.
	 */
	if (!rb->rf_list.l_next_free_rec) {
		BUG_ON(rb->rf_clusters);

		mlog(0, "reset refcount tree root %llu to be a record block.\n",
		     (unsigned long long)ref_root_bh->b_blocknr);

		rb->rf_flags = 0;
		rb->rf_parent = 0;
		rb->rf_cpos = 0;
		memset(&rb->rf_records, 0, sb->s_blocksize -
		       offsetof(struct ocfs2_refcount_block, rf_records));
		rb->rf_records.rl_count =
				cpu_to_le16(ocfs2_refcount_recs_per_rb(sb));
	}
	ocfs2_journal_dirty(handle, ref_root_bh);

out:
	return ret;
}
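
/*
 * Decrease the refcount of [cpos, cpos + len), which must lie entirely
 * within the record at "index" in ref_leaf_bh.  A range matching the
 * whole record is handled by ocfs2_change_refcount_rec(); a partial
 * range is handled by splitting.  If the leaf ends up with no records,
 * it is removed from the tree.
 */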
static int ocfs2_decrease_refcount_rec(handle_t *handle,
				       struct ocfs2_caching_info *ci,
				       struct buffer_head *ref_root_bh,
				       struct buffer_head *ref_leaf_bh,
				       int index, u64 cpos, unsigned int len,
				       struct ocfs2_alloc_context *meta_ac,
				       struct ocfs2_cached_dealloc_ctxt *dealloc)
{
	int ret;
	struct ocfs2_refcount_block *rb =
		(struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
	struct ocfs2_refcount_rec *rec = &rb->rf_records.rl_recs[index];

	BUG_ON(cpos < le64_to_cpu(rec->r_cpos));
	BUG_ON(cpos + len >
	       le64_to_cpu(rec->r_cpos) + le32_to_cpu(rec->r_clusters));

	if (cpos == le64_to_cpu(rec->r_cpos) &&
	    len == le32_to_cpu(rec->r_clusters))
		ret = ocfs2_change_refcount_rec(handle, ci,
						ref_leaf_bh, index, -1);
	else {
		struct ocfs2_refcount_rec split = *rec;
		split.r_cpos = cpu_to_le64(cpos);
		split.r_clusters = cpu_to_le32(len);

		le32_add_cpu(&split.r_refcount, -1);

		mlog(0, "split refcount rec, start %llu, "
		     "len %u, count %u, original start %llu, len %u\n",
		     (unsigned long long)le64_to_cpu(split.r_cpos),
		     len, le32_to_cpu(split.r_refcount),
		     (unsigned long long)le64_to_cpu(rec->r_cpos),
		     le32_to_cpu(rec->r_clusters));
		ret = ocfs2_split_refcount_rec(handle, ci,
					       ref_root_bh, ref_leaf_bh,
					       &split, index,
					       meta_ac, dealloc);
	}

	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	/* Remove the leaf refcount block if it contains no refcount record. */
	if (!rb->rf_records.rl_used && ref_leaf_bh != ref_root_bh) {
		ret = ocfs2_remove_refcount_extent(handle, ci, ref_root_bh,
						   ref_leaf_bh, meta_ac,
						   dealloc);
		if (ret)
			mlog_errno(ret);
	}

out:
	return ret;
}
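
/*
 * Decrease the refcount of the clusters in [cpos, cpos + len) by one.
 * Any cluster whose refcount drops to zero is queued in "dealloc" for
 * the caller to free later by running the dealloc context.
 */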
static int __ocfs2_decrease_refcount(handle_t *handle,
				     struct ocfs2_caching_info *ci,
				     struct buffer_head *ref_root_bh,
				     u64 cpos, u32 len,
				     struct ocfs2_alloc_context *meta_ac,
				     struct ocfs2_cached_dealloc_ctxt *dealloc)
{
	int ret = 0, index = 0;
	struct ocfs2_refcount_rec rec;
	unsigned int r_count = 0, r_len;
	struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
	struct buffer_head *ref_leaf_bh = NULL;

	mlog(0, "Tree owner %llu, decrease refcount start %llu, len %u\n",
	     (unsigned long long)ocfs2_metadata_cache_owner(ci),
	     (unsigned long long)cpos, len);

	while (len) {
		ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
					     cpos, len, &rec, &index,
					     &ref_leaf_bh);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		r_count = le32_to_cpu(rec.r_refcount);
		BUG_ON(r_count == 0);

		r_len = min((u64)(cpos + len), le64_to_cpu(rec.r_cpos) +
			    le32_to_cpu(rec.r_clusters)) - cpos;

		ret = ocfs2_decrease_refcount_rec(handle, ci, ref_root_bh,
						  ref_leaf_bh, index,
						  cpos, r_len,
						  meta_ac, dealloc);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		if (le32_to_cpu(rec.r_refcount) == 1) {
			ret = ocfs2_cache_cluster_dealloc(dealloc,
					  ocfs2_clusters_to_blocks(sb, cpos),
					  r_len);
			if (ret) {
				mlog_errno(ret);
				goto out;
			}
		}

		cpos += r_len;
		len -= r_len;
		brelse(ref_leaf_bh);
		ref_leaf_bh = NULL;
	}

out:
	brelse(ref_leaf_bh);
	return ret;
}

/* Caller must hold refcount tree lock. */
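/*
 * This is the non-static entry point: it resolves the inode's refcount
 * tree and reads its root block before handing off to
 * __ocfs2_decrease_refcount().
 */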
int ocfs2_decrease_refcount(struct inode *inode,
			    handle_t *handle, u32 cpos, u32 len,
			    struct ocfs2_alloc_context *meta_ac,
			    struct ocfs2_cached_dealloc_ctxt *dealloc)
{
	int ret;
	u64 ref_blkno;
	struct ocfs2_inode_info *oi = OCFS2_I(inode);
	struct buffer_head *ref_root_bh = NULL;
	struct ocfs2_refcount_tree *tree;

	BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));

	ret = ocfs2_get_refcount_block(inode, &ref_blkno);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_get_refcount_tree(OCFS2_SB(inode->i_sb), ref_blkno, &tree);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_read_refcount_block(&tree->rf_ci, tree->rf_blkno,
					&ref_root_bh);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	ret = __ocfs2_decrease_refcount(handle, &tree->rf_ci, ref_root_bh,
					cpos, len, meta_ac, dealloc);
	if (ret)
		mlog_errno(ret);

out:
	brelse(ref_root_bh);
	return ret;
}