alloc.c 91 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626
  1. /* -*- mode: c; c-basic-offset: 8; -*-
  2. * vim: noexpandtab sw=8 ts=8 sts=0:
  3. *
  4. * alloc.c
  5. *
  6. * Extent allocs and frees
  7. *
  8. * Copyright (C) 2002, 2004 Oracle. All rights reserved.
  9. *
  10. * This program is free software; you can redistribute it and/or
  11. * modify it under the terms of the GNU General Public
  12. * License as published by the Free Software Foundation; either
  13. * version 2 of the License, or (at your option) any later version.
  14. *
  15. * This program is distributed in the hope that it will be useful,
  16. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  18. * General Public License for more details.
  19. *
  20. * You should have received a copy of the GNU General Public
  21. * License along with this program; if not, write to the
  22. * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  23. * Boston, MA 021110-1307, USA.
  24. */
  25. #include <linux/fs.h>
  26. #include <linux/types.h>
  27. #include <linux/slab.h>
  28. #include <linux/highmem.h>
  29. #define MLOG_MASK_PREFIX ML_DISK_ALLOC
  30. #include <cluster/masklog.h>
  31. #include "ocfs2.h"
  32. #include "alloc.h"
  33. #include "dlmglue.h"
  34. #include "extent_map.h"
  35. #include "inode.h"
  36. #include "journal.h"
  37. #include "localalloc.h"
  38. #include "suballoc.h"
  39. #include "sysfile.h"
  40. #include "file.h"
  41. #include "super.h"
  42. #include "uptodate.h"
  43. #include "buffer_head_io.h"
  44. static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
/*
 * Structures which describe a path through a btree, and functions to
 * manipulate them.
 *
 * The idea here is to be as generic as possible with the tree
 * manipulation code.
 */
struct ocfs2_path_item {
	struct buffer_head *bh;		/* block backing this tree node (NULL when unused) */
	struct ocfs2_extent_list *el;	/* extent list within bh's data */
};
/* Maximum number of levels (root node plus extent blocks) a path can hold. */
#define OCFS2_MAX_PATH_DEPTH	5

struct ocfs2_path {
	int p_tree_depth;	/* depth of the tree this path descends */
	/* p_node[0] is the root; p_node[p_tree_depth] is the leaf. */
	struct ocfs2_path_item p_node[OCFS2_MAX_PATH_DEPTH];
};
/* Convenience accessors for the root (index 0) and leaf (index p_tree_depth)
 * nodes of a path. */
#define path_root_bh(_path) ((_path)->p_node[0].bh)
#define path_root_el(_path) ((_path)->p_node[0].el)
#define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh)
#define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el)
/* Number of occupied items in a path, root included. */
#define path_num_items(_path) ((_path)->p_tree_depth + 1)
  66. /*
  67. * Reset the actual path elements so that we can re-use the structure
  68. * to build another path. Generally, this involves freeing the buffer
  69. * heads.
  70. */
  71. static void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root)
  72. {
  73. int i, start = 0, depth = 0;
  74. struct ocfs2_path_item *node;
  75. if (keep_root)
  76. start = 1;
  77. for(i = start; i < path_num_items(path); i++) {
  78. node = &path->p_node[i];
  79. brelse(node->bh);
  80. node->bh = NULL;
  81. node->el = NULL;
  82. }
  83. /*
  84. * Tree depth may change during truncate, or insert. If we're
  85. * keeping the root extent list, then make sure that our path
  86. * structure reflects the proper depth.
  87. */
  88. if (keep_root)
  89. depth = le16_to_cpu(path_root_el(path)->l_tree_depth);
  90. path->p_tree_depth = depth;
  91. }
  92. static void ocfs2_free_path(struct ocfs2_path *path)
  93. {
  94. if (path) {
  95. ocfs2_reinit_path(path, 0);
  96. kfree(path);
  97. }
  98. }
  99. /*
  100. * Make the *dest path the same as src and re-initialize src path to
  101. * have a root only.
  102. */
  103. static void ocfs2_mv_path(struct ocfs2_path *dest, struct ocfs2_path *src)
  104. {
  105. int i;
  106. BUG_ON(path_root_bh(dest) != path_root_bh(src));
  107. for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
  108. brelse(dest->p_node[i].bh);
  109. dest->p_node[i].bh = src->p_node[i].bh;
  110. dest->p_node[i].el = src->p_node[i].el;
  111. src->p_node[i].bh = NULL;
  112. src->p_node[i].el = NULL;
  113. }
  114. }
  115. /*
  116. * Insert an extent block at given index.
  117. *
  118. * This will not take an additional reference on eb_bh.
  119. */
  120. static inline void ocfs2_path_insert_eb(struct ocfs2_path *path, int index,
  121. struct buffer_head *eb_bh)
  122. {
  123. struct ocfs2_extent_block *eb = (struct ocfs2_extent_block *)eb_bh->b_data;
  124. /*
  125. * Right now, no root bh is an extent block, so this helps
  126. * catch code errors with dinode trees. The assertion can be
  127. * safely removed if we ever need to insert extent block
  128. * structures at the root.
  129. */
  130. BUG_ON(index == 0);
  131. path->p_node[index].bh = eb_bh;
  132. path->p_node[index].el = &eb->h_list;
  133. }
  134. static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh,
  135. struct ocfs2_extent_list *root_el)
  136. {
  137. struct ocfs2_path *path;
  138. BUG_ON(le16_to_cpu(root_el->l_tree_depth) >= OCFS2_MAX_PATH_DEPTH);
  139. path = kzalloc(sizeof(*path), GFP_NOFS);
  140. if (path) {
  141. path->p_tree_depth = le16_to_cpu(root_el->l_tree_depth);
  142. get_bh(root_bh);
  143. path_root_bh(path) = root_bh;
  144. path_root_el(path) = root_el;
  145. }
  146. return path;
  147. }
  148. /*
  149. * Allocate and initialize a new path based on a disk inode tree.
  150. */
  151. static struct ocfs2_path *ocfs2_new_inode_path(struct buffer_head *di_bh)
  152. {
  153. struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
  154. struct ocfs2_extent_list *el = &di->id2.i_list;
  155. return ocfs2_new_path(di_bh, el);
  156. }
  157. /*
  158. * Convenience function to journal all components in a path.
  159. */
  160. static int ocfs2_journal_access_path(struct inode *inode, handle_t *handle,
  161. struct ocfs2_path *path)
  162. {
  163. int i, ret = 0;
  164. if (!path)
  165. goto out;
  166. for(i = 0; i < path_num_items(path); i++) {
  167. ret = ocfs2_journal_access(handle, inode, path->p_node[i].bh,
  168. OCFS2_JOURNAL_ACCESS_WRITE);
  169. if (ret < 0) {
  170. mlog_errno(ret);
  171. goto out;
  172. }
  173. }
  174. out:
  175. return ret;
  176. }
/*
 * On which side a candidate extent is contiguous with an existing
 * record; computed by ocfs2_extent_contig().
 */
enum ocfs2_contig_type {
	CONTIG_NONE = 0,
	CONTIG_LEFT,	/* candidate attaches directly before the record */
	CONTIG_RIGHT	/* candidate attaches directly after the record */
};
  182. static int ocfs2_block_extent_contig(struct super_block *sb,
  183. struct ocfs2_extent_rec *ext,
  184. u64 blkno)
  185. {
  186. return blkno == (le64_to_cpu(ext->e_blkno) +
  187. ocfs2_clusters_to_blocks(sb,
  188. le32_to_cpu(ext->e_clusters)));
  189. }
  190. static int ocfs2_extents_adjacent(struct ocfs2_extent_rec *left,
  191. struct ocfs2_extent_rec *right)
  192. {
  193. return (le32_to_cpu(left->e_cpos) + le32_to_cpu(left->e_clusters) ==
  194. le32_to_cpu(right->e_cpos));
  195. }
  196. static enum ocfs2_contig_type
  197. ocfs2_extent_contig(struct inode *inode,
  198. struct ocfs2_extent_rec *ext,
  199. struct ocfs2_extent_rec *insert_rec)
  200. {
  201. u64 blkno = le64_to_cpu(insert_rec->e_blkno);
  202. if (ocfs2_extents_adjacent(ext, insert_rec) &&
  203. ocfs2_block_extent_contig(inode->i_sb, ext, blkno))
  204. return CONTIG_RIGHT;
  205. blkno = le64_to_cpu(ext->e_blkno);
  206. if (ocfs2_extents_adjacent(insert_rec, ext) &&
  207. ocfs2_block_extent_contig(inode->i_sb, insert_rec, blkno))
  208. return CONTIG_LEFT;
  209. return CONTIG_NONE;
  210. }
/*
 * NOTE: We can have pretty much any combination of contiguousness and
 * appending.
 *
 * The usefulness of APPEND_TAIL is more in that it lets us know that
 * we'll have to update the path to that leaf.
 */
enum ocfs2_append_type {
	APPEND_NONE = 0,
	APPEND_TAIL,
};
/*
 * Collected facts about a pending extent insert.
 * NOTE(review): the code filling these fields is outside this chunk;
 * field semantics are inferred from their names and types -- confirm
 * against the insert path before relying on this comment.
 */
struct ocfs2_insert_type {
	enum ocfs2_append_type ins_appending;
	enum ocfs2_contig_type ins_contig;
	int ins_contig_index;
	int ins_free_records;
	int ins_tree_depth;
};
  229. /*
  230. * How many free extents have we got before we need more meta data?
  231. */
  232. int ocfs2_num_free_extents(struct ocfs2_super *osb,
  233. struct inode *inode,
  234. struct ocfs2_dinode *fe)
  235. {
  236. int retval;
  237. struct ocfs2_extent_list *el;
  238. struct ocfs2_extent_block *eb;
  239. struct buffer_head *eb_bh = NULL;
  240. mlog_entry_void();
  241. if (!OCFS2_IS_VALID_DINODE(fe)) {
  242. OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
  243. retval = -EIO;
  244. goto bail;
  245. }
  246. if (fe->i_last_eb_blk) {
  247. retval = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
  248. &eb_bh, OCFS2_BH_CACHED, inode);
  249. if (retval < 0) {
  250. mlog_errno(retval);
  251. goto bail;
  252. }
  253. eb = (struct ocfs2_extent_block *) eb_bh->b_data;
  254. el = &eb->h_list;
  255. } else
  256. el = &fe->id2.i_list;
  257. BUG_ON(el->l_tree_depth != 0);
  258. retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec);
  259. bail:
  260. if (eb_bh)
  261. brelse(eb_bh);
  262. mlog_exit(retval);
  263. return retval;
  264. }
  265. /* expects array to already be allocated
  266. *
  267. * sets h_signature, h_blkno, h_suballoc_bit, h_suballoc_slot, and
  268. * l_count for you
  269. */
  270. static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
  271. handle_t *handle,
  272. struct inode *inode,
  273. int wanted,
  274. struct ocfs2_alloc_context *meta_ac,
  275. struct buffer_head *bhs[])
  276. {
  277. int count, status, i;
  278. u16 suballoc_bit_start;
  279. u32 num_got;
  280. u64 first_blkno;
  281. struct ocfs2_extent_block *eb;
  282. mlog_entry_void();
  283. count = 0;
  284. while (count < wanted) {
  285. status = ocfs2_claim_metadata(osb,
  286. handle,
  287. meta_ac,
  288. wanted - count,
  289. &suballoc_bit_start,
  290. &num_got,
  291. &first_blkno);
  292. if (status < 0) {
  293. mlog_errno(status);
  294. goto bail;
  295. }
  296. for(i = count; i < (num_got + count); i++) {
  297. bhs[i] = sb_getblk(osb->sb, first_blkno);
  298. if (bhs[i] == NULL) {
  299. status = -EIO;
  300. mlog_errno(status);
  301. goto bail;
  302. }
  303. ocfs2_set_new_buffer_uptodate(inode, bhs[i]);
  304. status = ocfs2_journal_access(handle, inode, bhs[i],
  305. OCFS2_JOURNAL_ACCESS_CREATE);
  306. if (status < 0) {
  307. mlog_errno(status);
  308. goto bail;
  309. }
  310. memset(bhs[i]->b_data, 0, osb->sb->s_blocksize);
  311. eb = (struct ocfs2_extent_block *) bhs[i]->b_data;
  312. /* Ok, setup the minimal stuff here. */
  313. strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
  314. eb->h_blkno = cpu_to_le64(first_blkno);
  315. eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
  316. #ifndef OCFS2_USE_ALL_METADATA_SUBALLOCATORS
  317. /* we always use slot zero's suballocator */
  318. eb->h_suballoc_slot = 0;
  319. #else
  320. eb->h_suballoc_slot = cpu_to_le16(osb->slot_num);
  321. #endif
  322. eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start);
  323. eb->h_list.l_count =
  324. cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
  325. suballoc_bit_start++;
  326. first_blkno++;
  327. /* We'll also be dirtied by the caller, so
  328. * this isn't absolutely necessary. */
  329. status = ocfs2_journal_dirty(handle, bhs[i]);
  330. if (status < 0) {
  331. mlog_errno(status);
  332. goto bail;
  333. }
  334. }
  335. count += num_got;
  336. }
  337. status = 0;
  338. bail:
  339. if (status < 0) {
  340. for(i = 0; i < wanted; i++) {
  341. if (bhs[i])
  342. brelse(bhs[i]);
  343. bhs[i] = NULL;
  344. }
  345. }
  346. mlog_exit(status);
  347. return status;
  348. }
  349. /*
  350. * Helper function for ocfs2_add_branch() and ocfs2_shift_tree_depth().
  351. *
  352. * Returns the sum of the rightmost extent rec logical offset and
  353. * cluster count.
  354. *
  355. * ocfs2_add_branch() uses this to determine what logical cluster
  356. * value should be populated into the leftmost new branch records.
  357. *
  358. * ocfs2_shift_tree_depth() uses this to determine the # clusters
  359. * value for the new topmost tree record.
  360. */
  361. static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list *el)
  362. {
  363. int i;
  364. i = le16_to_cpu(el->l_next_free_rec) - 1;
  365. return le32_to_cpu(el->l_recs[i].e_cpos) +
  366. le32_to_cpu(el->l_recs[i].e_clusters);
  367. }
  368. /*
  369. * Add an entire tree branch to our inode. eb_bh is the extent block
  370. * to start at, if we don't want to start the branch at the dinode
  371. * structure.
  372. *
  373. * last_eb_bh is required as we have to update it's next_leaf pointer
  374. * for the new last extent block.
  375. *
  376. * the new branch will be 'empty' in the sense that every block will
  377. * contain a single record with e_clusters == 0.
  378. */
  379. static int ocfs2_add_branch(struct ocfs2_super *osb,
  380. handle_t *handle,
  381. struct inode *inode,
  382. struct buffer_head *fe_bh,
  383. struct buffer_head *eb_bh,
  384. struct buffer_head *last_eb_bh,
  385. struct ocfs2_alloc_context *meta_ac)
  386. {
  387. int status, new_blocks, i;
  388. u64 next_blkno, new_last_eb_blk;
  389. struct buffer_head *bh;
  390. struct buffer_head **new_eb_bhs = NULL;
  391. struct ocfs2_dinode *fe;
  392. struct ocfs2_extent_block *eb;
  393. struct ocfs2_extent_list *eb_el;
  394. struct ocfs2_extent_list *el;
  395. u32 new_cpos;
  396. mlog_entry_void();
  397. BUG_ON(!last_eb_bh);
  398. fe = (struct ocfs2_dinode *) fe_bh->b_data;
  399. if (eb_bh) {
  400. eb = (struct ocfs2_extent_block *) eb_bh->b_data;
  401. el = &eb->h_list;
  402. } else
  403. el = &fe->id2.i_list;
  404. /* we never add a branch to a leaf. */
  405. BUG_ON(!el->l_tree_depth);
  406. new_blocks = le16_to_cpu(el->l_tree_depth);
  407. /* allocate the number of new eb blocks we need */
  408. new_eb_bhs = kcalloc(new_blocks, sizeof(struct buffer_head *),
  409. GFP_KERNEL);
  410. if (!new_eb_bhs) {
  411. status = -ENOMEM;
  412. mlog_errno(status);
  413. goto bail;
  414. }
  415. status = ocfs2_create_new_meta_bhs(osb, handle, inode, new_blocks,
  416. meta_ac, new_eb_bhs);
  417. if (status < 0) {
  418. mlog_errno(status);
  419. goto bail;
  420. }
  421. eb = (struct ocfs2_extent_block *)last_eb_bh->b_data;
  422. new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list);
  423. /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
  424. * linked with the rest of the tree.
  425. * conversly, new_eb_bhs[0] is the new bottommost leaf.
  426. *
  427. * when we leave the loop, new_last_eb_blk will point to the
  428. * newest leaf, and next_blkno will point to the topmost extent
  429. * block. */
  430. next_blkno = new_last_eb_blk = 0;
  431. for(i = 0; i < new_blocks; i++) {
  432. bh = new_eb_bhs[i];
  433. eb = (struct ocfs2_extent_block *) bh->b_data;
  434. if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
  435. OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
  436. status = -EIO;
  437. goto bail;
  438. }
  439. eb_el = &eb->h_list;
  440. status = ocfs2_journal_access(handle, inode, bh,
  441. OCFS2_JOURNAL_ACCESS_CREATE);
  442. if (status < 0) {
  443. mlog_errno(status);
  444. goto bail;
  445. }
  446. eb->h_next_leaf_blk = 0;
  447. eb_el->l_tree_depth = cpu_to_le16(i);
  448. eb_el->l_next_free_rec = cpu_to_le16(1);
  449. /*
  450. * This actually counts as an empty extent as
  451. * c_clusters == 0
  452. */
  453. eb_el->l_recs[0].e_cpos = cpu_to_le32(new_cpos);
  454. eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno);
  455. eb_el->l_recs[0].e_clusters = cpu_to_le32(0);
  456. if (!eb_el->l_tree_depth)
  457. new_last_eb_blk = le64_to_cpu(eb->h_blkno);
  458. status = ocfs2_journal_dirty(handle, bh);
  459. if (status < 0) {
  460. mlog_errno(status);
  461. goto bail;
  462. }
  463. next_blkno = le64_to_cpu(eb->h_blkno);
  464. }
  465. /* This is a bit hairy. We want to update up to three blocks
  466. * here without leaving any of them in an inconsistent state
  467. * in case of error. We don't have to worry about
  468. * journal_dirty erroring as it won't unless we've aborted the
  469. * handle (in which case we would never be here) so reserving
  470. * the write with journal_access is all we need to do. */
  471. status = ocfs2_journal_access(handle, inode, last_eb_bh,
  472. OCFS2_JOURNAL_ACCESS_WRITE);
  473. if (status < 0) {
  474. mlog_errno(status);
  475. goto bail;
  476. }
  477. status = ocfs2_journal_access(handle, inode, fe_bh,
  478. OCFS2_JOURNAL_ACCESS_WRITE);
  479. if (status < 0) {
  480. mlog_errno(status);
  481. goto bail;
  482. }
  483. if (eb_bh) {
  484. status = ocfs2_journal_access(handle, inode, eb_bh,
  485. OCFS2_JOURNAL_ACCESS_WRITE);
  486. if (status < 0) {
  487. mlog_errno(status);
  488. goto bail;
  489. }
  490. }
  491. /* Link the new branch into the rest of the tree (el will
  492. * either be on the fe, or the extent block passed in. */
  493. i = le16_to_cpu(el->l_next_free_rec);
  494. el->l_recs[i].e_blkno = cpu_to_le64(next_blkno);
  495. el->l_recs[i].e_cpos = cpu_to_le32(new_cpos);
  496. el->l_recs[i].e_clusters = 0;
  497. le16_add_cpu(&el->l_next_free_rec, 1);
  498. /* fe needs a new last extent block pointer, as does the
  499. * next_leaf on the previously last-extent-block. */
  500. fe->i_last_eb_blk = cpu_to_le64(new_last_eb_blk);
  501. eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
  502. eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk);
  503. status = ocfs2_journal_dirty(handle, last_eb_bh);
  504. if (status < 0)
  505. mlog_errno(status);
  506. status = ocfs2_journal_dirty(handle, fe_bh);
  507. if (status < 0)
  508. mlog_errno(status);
  509. if (eb_bh) {
  510. status = ocfs2_journal_dirty(handle, eb_bh);
  511. if (status < 0)
  512. mlog_errno(status);
  513. }
  514. status = 0;
  515. bail:
  516. if (new_eb_bhs) {
  517. for (i = 0; i < new_blocks; i++)
  518. if (new_eb_bhs[i])
  519. brelse(new_eb_bhs[i]);
  520. kfree(new_eb_bhs);
  521. }
  522. mlog_exit(status);
  523. return status;
  524. }
/*
 * Add another level to the allocation tree: allocate one new extent
 * block, copy the dinode's current extent list into it, then rewrite
 * the dinode list to hold a single record pointing at the new block.
 *
 * On success, passes the new extent block back in *ret_new_eb_bh
 * (reference held, ownership transferred to the caller) so a branch
 * can be added to it after this call.
 *
 * Returns 0 on success, negative error code on failure.
 */
static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
				  handle_t *handle,
				  struct inode *inode,
				  struct buffer_head *fe_bh,
				  struct ocfs2_alloc_context *meta_ac,
				  struct buffer_head **ret_new_eb_bh)
{
	int status, i;
	u32 new_clusters;
	struct buffer_head *new_eb_bh = NULL;
	struct ocfs2_dinode *fe;
	struct ocfs2_extent_block *eb;
	struct ocfs2_extent_list *fe_el;
	struct ocfs2_extent_list *eb_el;

	mlog_entry_void();

	/* Allocate exactly one new metadata block for the new level. */
	status = ocfs2_create_new_meta_bhs(osb, handle, inode, 1, meta_ac,
					   &new_eb_bh);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	eb = (struct ocfs2_extent_block *) new_eb_bh->b_data;
	if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
		OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
		status = -EIO;
		goto bail;
	}

	eb_el = &eb->h_list;
	fe = (struct ocfs2_dinode *) fe_bh->b_data;
	fe_el = &fe->id2.i_list;

	/* Freshly allocated block - journal with ACCESS_CREATE. */
	status = ocfs2_journal_access(handle, inode, new_eb_bh,
				      OCFS2_JOURNAL_ACCESS_CREATE);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	/* copy the fe data into the new extent block */
	eb_el->l_tree_depth = fe_el->l_tree_depth;
	eb_el->l_next_free_rec = fe_el->l_next_free_rec;
	for(i = 0; i < le16_to_cpu(fe_el->l_next_free_rec); i++) {
		eb_el->l_recs[i].e_cpos = fe_el->l_recs[i].e_cpos;
		eb_el->l_recs[i].e_clusters = fe_el->l_recs[i].e_clusters;
		eb_el->l_recs[i].e_blkno = fe_el->l_recs[i].e_blkno;
	}

	status = ocfs2_journal_dirty(handle, new_eb_bh);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	status = ocfs2_journal_access(handle, inode, fe_bh,
				      OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	/* Total cluster count covered by the copied list. */
	new_clusters = ocfs2_sum_rightmost_rec(eb_el);

	/* update fe now */
	le16_add_cpu(&fe_el->l_tree_depth, 1);
	fe_el->l_recs[0].e_cpos = 0;
	fe_el->l_recs[0].e_blkno = eb->h_blkno;
	fe_el->l_recs[0].e_clusters = cpu_to_le32(new_clusters);
	/* Zero out the now-unused records past the first one. */
	for(i = 1; i < le16_to_cpu(fe_el->l_next_free_rec); i++) {
		fe_el->l_recs[i].e_cpos = 0;
		fe_el->l_recs[i].e_clusters = 0;
		fe_el->l_recs[i].e_blkno = 0;
	}
	fe_el->l_next_free_rec = cpu_to_le16(1);

	/* If this is our 1st tree depth shift, then last_eb_blk
	 * becomes the allocated extent block */
	if (fe_el->l_tree_depth == cpu_to_le16(1))
		fe->i_last_eb_blk = eb->h_blkno;

	status = ocfs2_journal_dirty(handle, fe_bh);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	/* Hand our reference on new_eb_bh to the caller. */
	*ret_new_eb_bh = new_eb_bh;
	new_eb_bh = NULL;
	status = 0;
bail:
	if (new_eb_bh)
		brelse(new_eb_bh);

	mlog_exit(status);
	return status;
}
/*
 * Should only be called when there is no space left in any of the
 * leaf nodes. What we want to do is find the lowest tree depth
 * non-leaf extent block with room for new records. There are three
 * valid results of this search:
 *
 * 1) a lowest extent block is found, then we pass it back in
 *    *lowest_eb_bh and return '0'
 *
 * 2) the search fails to find anything, but the dinode has room. We
 *    pass NULL back in *lowest_eb_bh, but still return '0'
 *
 * 3) the search fails to find anything AND the dinode is full, in
 *    which case we return > 0
 *
 * return status < 0 indicates an error.
 */
static int ocfs2_find_branch_target(struct ocfs2_super *osb,
				    struct inode *inode,
				    struct buffer_head *fe_bh,
				    struct buffer_head **target_bh)
{
	int status = 0, i;
	u64 blkno;
	struct ocfs2_dinode *fe;
	struct ocfs2_extent_block *eb;
	struct ocfs2_extent_list *el;
	struct buffer_head *bh = NULL;
	struct buffer_head *lowest_bh = NULL;

	mlog_entry_void();

	*target_bh = NULL;

	fe = (struct ocfs2_dinode *) fe_bh->b_data;
	el = &fe->id2.i_list;

	/* Walk down the rightmost branch, stopping just above the
	 * leaves (tree_depth == 1 lists point directly at leaves). */
	while(le16_to_cpu(el->l_tree_depth) > 1) {
		if (le16_to_cpu(el->l_next_free_rec) == 0) {
			ocfs2_error(inode->i_sb, "Dinode %llu has empty "
				    "extent list (next_free_rec == 0)",
				    (unsigned long long)OCFS2_I(inode)->ip_blkno);
			status = -EIO;
			goto bail;
		}
		/* Always descend via the rightmost record. */
		i = le16_to_cpu(el->l_next_free_rec) - 1;
		blkno = le64_to_cpu(el->l_recs[i].e_blkno);
		if (!blkno) {
			ocfs2_error(inode->i_sb, "Dinode %llu has extent "
				    "list where extent # %d has no physical "
				    "block start",
				    (unsigned long long)OCFS2_I(inode)->ip_blkno, i);
			status = -EIO;
			goto bail;
		}

		/* Drop the previous level's buffer before reading the next. */
		if (bh) {
			brelse(bh);
			bh = NULL;
		}

		status = ocfs2_read_block(osb, blkno, &bh, OCFS2_BH_CACHED,
					  inode);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}

		eb = (struct ocfs2_extent_block *) bh->b_data;
		if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
			status = -EIO;
			goto bail;
		}
		el = &eb->h_list;

		/* Remember the lowest level block with a free record;
		 * deeper hits overwrite shallower ones. */
		if (le16_to_cpu(el->l_next_free_rec) <
		    le16_to_cpu(el->l_count)) {
			if (lowest_bh)
				brelse(lowest_bh);
			lowest_bh = bh;
			get_bh(lowest_bh);
		}
	}

	/* If we didn't find one and the fe doesn't have any room,
	 * then return '1' */
	if (!lowest_bh
	    && (fe->id2.i_list.l_next_free_rec == fe->id2.i_list.l_count))
		status = 1;

	*target_bh = lowest_bh;
bail:
	if (bh)
		brelse(bh);

	mlog_exit(status);
	return status;
}
  703. static inline int ocfs2_is_empty_extent(struct ocfs2_extent_rec *rec)
  704. {
  705. return !rec->e_clusters;
  706. }
  707. /*
  708. * This function will discard the rightmost extent record.
  709. */
  710. static void ocfs2_shift_records_right(struct ocfs2_extent_list *el)
  711. {
  712. int next_free = le16_to_cpu(el->l_next_free_rec);
  713. int count = le16_to_cpu(el->l_count);
  714. unsigned int num_bytes;
  715. BUG_ON(!next_free);
  716. /* This will cause us to go off the end of our extent list. */
  717. BUG_ON(next_free >= count);
  718. num_bytes = sizeof(struct ocfs2_extent_rec) * next_free;
  719. memmove(&el->l_recs[1], &el->l_recs[0], num_bytes);
  720. }
/*
 * Insert insert_rec into the leaf list el, keeping records sorted by
 * e_cpos. If the list starts with an empty extent, that slot is
 * consumed to make room; otherwise there must be at least one free
 * slot at the tail.
 */
static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el,
			      struct ocfs2_extent_rec *insert_rec)
{
	int i, insert_index, next_free, has_empty, num_bytes;
	u32 insert_cpos = le32_to_cpu(insert_rec->e_cpos);
	struct ocfs2_extent_rec *rec;

	next_free = le16_to_cpu(el->l_next_free_rec);
	has_empty = ocfs2_is_empty_extent(&el->l_recs[0]);

	BUG_ON(!next_free);

	/* The tree code before us didn't allow enough room in the leaf. */
	if (el->l_next_free_rec == el->l_count && !has_empty)
		BUG();

	/*
	 * The easiest way to approach this is to just remove the
	 * empty extent and temporarily decrement next_free.
	 */
	if (has_empty) {
		/*
		 * If next_free was 1 (only an empty extent), this
		 * loop won't execute, which is fine. We still want
		 * the decrement below to happen.
		 */
		for(i = 0; i < (next_free - 1); i++)
			el->l_recs[i] = el->l_recs[i+1];

		next_free--;
	}

	/*
	 * Figure out what the new record index should be: first slot
	 * whose cpos is greater than the one being inserted.
	 */
	for(i = 0; i < next_free; i++) {
		rec = &el->l_recs[i];

		if (insert_cpos < le32_to_cpu(rec->e_cpos))
			break;
	}
	insert_index = i;

	mlog(0, "ins %u: index %d, has_empty %d, next_free %d, count %d\n",
	     insert_cpos, insert_index, has_empty, next_free, le16_to_cpu(el->l_count));

	BUG_ON(insert_index < 0);
	BUG_ON(insert_index >= le16_to_cpu(el->l_count));
	BUG_ON(insert_index > next_free);

	/*
	 * No need to memmove if we're just adding to the tail.
	 */
	if (insert_index != next_free) {
		BUG_ON(next_free >= le16_to_cpu(el->l_count));

		num_bytes = next_free - insert_index;
		num_bytes *= sizeof(struct ocfs2_extent_rec);
		memmove(&el->l_recs[insert_index + 1],
			&el->l_recs[insert_index],
			num_bytes);
	}

	/*
	 * Either we had an empty extent, and need to re-increment or
	 * there was no empty extent on a non full rightmost leaf node,
	 * in which case we still need to increment.
	 */
	next_free++;
	el->l_next_free_rec = cpu_to_le16(next_free);
	/*
	 * Make sure none of the math above just messed up our tree.
	 */
	BUG_ON(le16_to_cpu(el->l_next_free_rec) > le16_to_cpu(el->l_count));

	el->l_recs[insert_index] = *insert_rec;
}
  785. /*
  786. * Create an empty extent record .
  787. *
  788. * l_next_free_rec may be updated.
  789. *
  790. * If an empty extent already exists do nothing.
  791. */
  792. static void ocfs2_create_empty_extent(struct ocfs2_extent_list *el)
  793. {
  794. int next_free = le16_to_cpu(el->l_next_free_rec);
  795. if (next_free == 0)
  796. goto set_and_inc;
  797. if (ocfs2_is_empty_extent(&el->l_recs[0]))
  798. return;
  799. mlog_bug_on_msg(el->l_count == el->l_next_free_rec,
  800. "Asked to create an empty extent in a full list:\n"
  801. "count = %u, tree depth = %u",
  802. le16_to_cpu(el->l_count),
  803. le16_to_cpu(el->l_tree_depth));
  804. ocfs2_shift_records_right(el);
  805. set_and_inc:
  806. le16_add_cpu(&el->l_next_free_rec, 1);
  807. memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
  808. }
  809. /*
  810. * For a rotation which involves two leaf nodes, the "root node" is
  811. * the lowest level tree node which contains a path to both leafs. This
  812. * resulting set of information can be used to form a complete "subtree"
  813. *
  814. * This function is passed two full paths from the dinode down to a
  815. * pair of adjacent leaves. It's task is to figure out which path
  816. * index contains the subtree root - this can be the root index itself
  817. * in a worst-case rotation.
  818. *
  819. * The array index of the subtree root is passed back.
  820. */
static int ocfs2_find_subtree_root(struct inode *inode,
				   struct ocfs2_path *left,
				   struct ocfs2_path *right)
{
	int i = 0;

	/*
	 * Check that the caller passed in two paths from the same tree.
	 */
	BUG_ON(path_root_bh(left) != path_root_bh(right));

	/* Descend from the root until the two paths diverge; the last
	 * shared node is the subtree root. */
	do {
		i++;

		/*
		 * The caller didn't pass two adjacent paths.
		 */
		mlog_bug_on_msg(i > left->p_tree_depth,
				"Inode %lu, left depth %u, right depth %u\n"
				"left leaf blk %llu, right leaf blk %llu\n",
				inode->i_ino, left->p_tree_depth,
				right->p_tree_depth,
				(unsigned long long)path_leaf_bh(left)->b_blocknr,
				(unsigned long long)path_leaf_bh(right)->b_blocknr);
	} while (left->p_node[i].bh->b_blocknr ==
		 right->p_node[i].bh->b_blocknr);

	/* i is the first divergent index, so i - 1 is the shared root. */
	return i - 1;
}
/* Callback invoked on each extent block bh visited by __ocfs2_find_path(). */
typedef void (path_insert_t)(void *, struct buffer_head *);
/*
 * Traverse a btree path in search of cpos, starting at root_el.
 *
 * At each level the child block is read and validated; if func is
 * non-NULL it is invoked with (data, bh) for every block visited,
 * letting callers record the path or capture the leaf.
 *
 * This code can be called with a cpos larger than the tree, in which
 * case it will return the rightmost path.
 */
static int __ocfs2_find_path(struct inode *inode,
			     struct ocfs2_extent_list *root_el, u32 cpos,
			     path_insert_t *func, void *data)
{
	int i, ret = 0;
	u32 range;
	u64 blkno;
	struct buffer_head *bh = NULL;
	struct ocfs2_extent_block *eb;
	struct ocfs2_extent_list *el;
	struct ocfs2_extent_rec *rec;
	struct ocfs2_inode_info *oi = OCFS2_I(inode);

	el = root_el;
	while (el->l_tree_depth) {
		if (le16_to_cpu(el->l_next_free_rec) == 0) {
			ocfs2_error(inode->i_sb,
				    "Inode %llu has empty extent list at "
				    "depth %u\n",
				    (unsigned long long)oi->ip_blkno,
				    le16_to_cpu(el->l_tree_depth));
			ret = -EROFS;
			goto out;
		}

		/* Find the record whose range contains cpos. If the
		 * loop runs off the end, i lands on the last record. */
		for(i = 0; i < le16_to_cpu(el->l_next_free_rec) - 1; i++) {
			rec = &el->l_recs[i];

			/*
			 * In the case that cpos is off the allocation
			 * tree, this should just wind up returning the
			 * rightmost record.
			 */
			range = le32_to_cpu(rec->e_cpos) +
				le32_to_cpu(rec->e_clusters);
			if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
			    break;
		}

		blkno = le64_to_cpu(el->l_recs[i].e_blkno);
		if (blkno == 0) {
			ocfs2_error(inode->i_sb,
				    "Inode %llu has bad blkno in extent list "
				    "at depth %u (index %d)\n",
				    (unsigned long long)oi->ip_blkno,
				    le16_to_cpu(el->l_tree_depth), i);
			ret = -EROFS;
			goto out;
		}

		/* Release the parent's buffer before reading the child. */
		brelse(bh);
		bh = NULL;
		ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno,
				      &bh, OCFS2_BH_CACHED, inode);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		eb = (struct ocfs2_extent_block *) bh->b_data;
		el = &eb->h_list;
		if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
			ret = -EIO;
			goto out;
		}

		if (le16_to_cpu(el->l_next_free_rec) >
		    le16_to_cpu(el->l_count)) {
			ocfs2_error(inode->i_sb,
				    "Inode %llu has bad count in extent list "
				    "at block %llu (next free=%u, count=%u)\n",
				    (unsigned long long)oi->ip_blkno,
				    (unsigned long long)bh->b_blocknr,
				    le16_to_cpu(el->l_next_free_rec),
				    le16_to_cpu(el->l_count));
			ret = -EROFS;
			goto out;
		}

		/* Callers take their own bh reference inside func. */
		if (func)
			func(data, bh);
	}

out:
	/*
	 * Catch any trailing bh that the loop didn't handle.
	 */
	brelse(bh);

	return ret;
}
  935. /*
  936. * Given an initialized path (that is, it has a valid root extent
  937. * list), this function will traverse the btree in search of the path
  938. * which would contain cpos.
  939. *
  940. * The path traveled is recorded in the path structure.
  941. *
  942. * Note that this will not do any comparisons on leaf node extent
  943. * records, so it will work fine in the case that we just added a tree
  944. * branch.
  945. */
/* Callback context for find_path_ins(). */
struct find_path_data {
	int index;		/* next slot in path->p_node to fill */
	struct ocfs2_path *path;	/* path being recorded */
};
  950. static void find_path_ins(void *data, struct buffer_head *bh)
  951. {
  952. struct find_path_data *fp = data;
  953. get_bh(bh);
  954. ocfs2_path_insert_eb(fp->path, fp->index, bh);
  955. fp->index++;
  956. }
  957. static int ocfs2_find_path(struct inode *inode, struct ocfs2_path *path,
  958. u32 cpos)
  959. {
  960. struct find_path_data data;
  961. data.index = 1;
  962. data.path = path;
  963. return __ocfs2_find_path(inode, path_root_el(path), cpos,
  964. find_path_ins, &data);
  965. }
  966. static void find_leaf_ins(void *data, struct buffer_head *bh)
  967. {
  968. struct ocfs2_extent_block *eb =(struct ocfs2_extent_block *)bh->b_data;
  969. struct ocfs2_extent_list *el = &eb->h_list;
  970. struct buffer_head **ret = data;
  971. /* We want to retain only the leaf block. */
  972. if (le16_to_cpu(el->l_tree_depth) == 0) {
  973. get_bh(bh);
  974. *ret = bh;
  975. }
  976. }
  977. /*
  978. * Find the leaf block in the tree which would contain cpos. No
  979. * checking of the actual leaf is done.
  980. *
  981. * Some paths want to call this instead of allocating a path structure
  982. * and calling ocfs2_find_path().
  983. *
  984. * This function doesn't handle non btree extent lists.
  985. */
  986. int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el,
  987. u32 cpos, struct buffer_head **leaf_bh)
  988. {
  989. int ret;
  990. struct buffer_head *bh = NULL;
  991. ret = __ocfs2_find_path(inode, root_el, cpos, find_leaf_ins, &bh);
  992. if (ret) {
  993. mlog_errno(ret);
  994. goto out;
  995. }
  996. *leaf_bh = bh;
  997. out:
  998. return ret;
  999. }
/*
 * Adjust the adjacent records (left_rec, right_rec) involved in a rotation.
 *
 * Basically, we've moved stuff around at the bottom of the tree and
 * we need to fix up the extent records above the changes to reflect
 * the new changes.
 *
 * left_rec: the record on the left.
 * left_child_el: is the child list pointed to by left_rec
 *                (currently unused here; kept for interface symmetry)
 * right_rec: the record to the right of left_rec
 * right_child_el: is the child list pointed to by right_rec
 *
 * By definition, this only works on interior nodes.
 */
static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec,
					  struct ocfs2_extent_list *left_child_el,
					  struct ocfs2_extent_rec *right_rec,
					  struct ocfs2_extent_list *right_child_el)
{
	u32 left_clusters, right_end;

	/*
	 * Interior nodes never have holes. Their cpos is the cpos of
	 * the leftmost record in their child list. Their cluster
	 * count covers the full theoretical range of their child list
	 * - the range between their cpos and the cpos of the record
	 * immediately to their right.
	 */
	left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos);
	left_clusters -= le32_to_cpu(left_rec->e_cpos);
	left_rec->e_clusters = cpu_to_le32(left_clusters);

	/*
	 * Calculate the rightmost cluster count boundary before
	 * moving cpos - we will need to adjust e_clusters after
	 * updating e_cpos to keep the same highest cluster count.
	 */
	right_end = le32_to_cpu(right_rec->e_cpos);
	right_end += le32_to_cpu(right_rec->e_clusters);

	/* Right record now starts where the left record's new range ends. */
	right_rec->e_cpos = left_rec->e_cpos;
	le32_add_cpu(&right_rec->e_cpos, left_clusters);

	right_end -= le32_to_cpu(right_rec->e_cpos);
	right_rec->e_clusters = cpu_to_le32(right_end);
}
/*
 * Adjust the adjacent root node records involved in a
 * rotation. left_el_blkno is passed in as a key so that we can easily
 * find it's index in the root list.
 */
static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el,
				      struct ocfs2_extent_list *left_el,
				      struct ocfs2_extent_list *right_el,
				      u64 left_el_blkno)
{
	int i;

	/* root_el must be strictly above the child lists being adjusted. */
	BUG_ON(le16_to_cpu(root_el->l_tree_depth) <=
	       le16_to_cpu(left_el->l_tree_depth));

	/* Locate the root record pointing at the left child. */
	for(i = 0; i < le16_to_cpu(root_el->l_next_free_rec) - 1; i++) {
		if (le64_to_cpu(root_el->l_recs[i].e_blkno) == left_el_blkno)
			break;
	}

	/*
	 * The path walking code should have never returned a root and
	 * two paths which are not adjacent.
	 */
	BUG_ON(i >= (le16_to_cpu(root_el->l_next_free_rec) - 1));

	ocfs2_adjust_adjacent_records(&root_el->l_recs[i], left_el,
				      &root_el->l_recs[i + 1], right_el);
}
  1067. /*
  1068. * We've changed a leaf block (in right_path) and need to reflect that
  1069. * change back up the subtree.
  1070. *
  1071. * This happens in multiple places:
  1072. * - When we've moved an extent record from the left path leaf to the right
  1073. * path leaf to make room for an empty extent in the left path leaf.
  1074. * - When our insert into the right path leaf is at the leftmost edge
  1075. * and requires an update of the path immediately to it's left. This
  1076. * can occur at the end of some types of rotation and appending inserts.
  1077. */
  1078. static void ocfs2_complete_edge_insert(struct inode *inode, handle_t *handle,
  1079. struct ocfs2_path *left_path,
  1080. struct ocfs2_path *right_path,
  1081. int subtree_index)
  1082. {
  1083. int ret, i, idx;
  1084. struct ocfs2_extent_list *el, *left_el, *right_el;
  1085. struct ocfs2_extent_rec *left_rec, *right_rec;
  1086. struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
  1087. /*
  1088. * Update the counts and position values within all the
  1089. * interior nodes to reflect the leaf rotation we just did.
  1090. *
  1091. * The root node is handled below the loop.
  1092. *
  1093. * We begin the loop with right_el and left_el pointing to the
  1094. * leaf lists and work our way up.
  1095. *
  1096. * NOTE: within this loop, left_el and right_el always refer
  1097. * to the *child* lists.
  1098. */
  1099. left_el = path_leaf_el(left_path);
  1100. right_el = path_leaf_el(right_path);
  1101. for(i = left_path->p_tree_depth - 1; i > subtree_index; i--) {
  1102. mlog(0, "Adjust records at index %u\n", i);
  1103. /*
  1104. * One nice property of knowing that all of these
  1105. * nodes are below the root is that we only deal with
  1106. * the leftmost right node record and the rightmost
  1107. * left node record.
  1108. */
  1109. el = left_path->p_node[i].el;
  1110. idx = le16_to_cpu(left_el->l_next_free_rec) - 1;
  1111. left_rec = &el->l_recs[idx];
  1112. el = right_path->p_node[i].el;
  1113. right_rec = &el->l_recs[0];
  1114. ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec,
  1115. right_el);
  1116. ret = ocfs2_journal_dirty(handle, left_path->p_node[i].bh);
  1117. if (ret)
  1118. mlog_errno(ret);
  1119. ret = ocfs2_journal_dirty(handle, right_path->p_node[i].bh);
  1120. if (ret)
  1121. mlog_errno(ret);
  1122. /*
  1123. * Setup our list pointers now so that the current
  1124. * parents become children in the next iteration.
  1125. */
  1126. left_el = left_path->p_node[i].el;
  1127. right_el = right_path->p_node[i].el;
  1128. }
  1129. /*
  1130. * At the root node, adjust the two adjacent records which
  1131. * begin our path to the leaves.
  1132. */
  1133. el = left_path->p_node[subtree_index].el;
  1134. left_el = left_path->p_node[subtree_index + 1].el;
  1135. right_el = right_path->p_node[subtree_index + 1].el;
  1136. ocfs2_adjust_root_records(el, left_el, right_el,
  1137. left_path->p_node[subtree_index + 1].bh->b_blocknr);
  1138. root_bh = left_path->p_node[subtree_index].bh;
  1139. ret = ocfs2_journal_dirty(handle, root_bh);
  1140. if (ret)
  1141. mlog_errno(ret);
  1142. }
/*
 * Rotate one record from the rightmost slot of the left leaf into
 * slot 0 of the right leaf, leaving an empty extent at slot 0 of the
 * left leaf, then propagate the boundary change up to the subtree
 * root via ocfs2_complete_edge_insert().
 *
 * All modified blocks (subtree root plus both branches below it) are
 * journaled for write before any change is made.
 */
static int ocfs2_rotate_subtree_right(struct inode *inode,
				      handle_t *handle,
				      struct ocfs2_path *left_path,
				      struct ocfs2_path *right_path,
				      int subtree_index)
{
	int ret, i;
	struct buffer_head *right_leaf_bh;
	struct buffer_head *left_leaf_bh = NULL;
	struct buffer_head *root_bh;
	struct ocfs2_extent_list *right_el, *left_el;
	struct ocfs2_extent_rec move_rec;

	left_leaf_bh = path_leaf_bh(left_path);
	left_el = path_leaf_el(left_path);

	/* A left leaf involved in rotation must be full, otherwise the
	 * tree code should not have sent us here. */
	if (left_el->l_next_free_rec != left_el->l_count) {
		ocfs2_error(inode->i_sb,
			    "Inode %llu has non-full interior leaf node %llu"
			    "(next free = %u)",
			    (unsigned long long)OCFS2_I(inode)->ip_blkno,
			    (unsigned long long)left_leaf_bh->b_blocknr,
			    le16_to_cpu(left_el->l_next_free_rec));
		return -EROFS;
	}

	/*
	 * This extent block may already have an empty record, so we
	 * return early if so.
	 */
	if (ocfs2_is_empty_extent(&left_el->l_recs[0]))
		return 0;

	root_bh = left_path->p_node[subtree_index].bh;
	BUG_ON(root_bh != right_path->p_node[subtree_index].bh);

	ret = ocfs2_journal_access(handle, inode, root_bh,
				   OCFS2_JOURNAL_ACCESS_WRITE);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	/* Journal every node on both branches below the subtree root
	 * before touching anything. */
	for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
		ret = ocfs2_journal_access(handle, inode,
					   right_path->p_node[i].bh,
					   OCFS2_JOURNAL_ACCESS_WRITE);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		ret = ocfs2_journal_access(handle, inode,
					   left_path->p_node[i].bh,
					   OCFS2_JOURNAL_ACCESS_WRITE);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
	}

	right_leaf_bh = path_leaf_bh(right_path);
	right_el = path_leaf_el(right_path);

	/* This is a code error, not a disk corruption. */
	mlog_bug_on_msg(!right_el->l_next_free_rec, "Inode %llu: Rotate fails "
			"because rightmost leaf block %llu is empty\n",
			(unsigned long long)OCFS2_I(inode)->ip_blkno,
			(unsigned long long)right_leaf_bh->b_blocknr);

	/* Open up slot 0 in the right leaf to receive the moved record. */
	ocfs2_create_empty_extent(right_el);

	ret = ocfs2_journal_dirty(handle, right_leaf_bh);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	/* Do the copy now. */
	i = le16_to_cpu(left_el->l_next_free_rec) - 1;
	move_rec = left_el->l_recs[i];
	right_el->l_recs[0] = move_rec;

	/*
	 * Clear out the record we just copied and shift everything
	 * over, leaving an empty extent in the left leaf.
	 *
	 * We temporarily subtract from next_free_rec so that the
	 * shift will lose the tail record (which is now defunct).
	 */
	le16_add_cpu(&left_el->l_next_free_rec, -1);
	ocfs2_shift_records_right(left_el);
	memset(&left_el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
	le16_add_cpu(&left_el->l_next_free_rec, 1);

	ret = ocfs2_journal_dirty(handle, left_leaf_bh);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	/* Fix up the interior records above both leaves. */
	ocfs2_complete_edge_insert(inode, handle, left_path, right_path,
				   subtree_index);

out:
	return ret;
}
/*
 * Given a full path, determine what cpos value would return us a path
 * containing the leaf immediately to the left of the current one.
 *
 * Will return zero in *cpos if the path passed in is already the
 * leftmost path.
 */
static int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
					 struct ocfs2_path *path, u32 *cpos)
{
	int i, j, ret = 0;
	u64 blkno;
	struct ocfs2_extent_list *el;

	*cpos = 0;

	blkno = path_leaf_bh(path)->b_blocknr;

	/* Start at the tree node just above the leaf and work our way up. */
	i = path->p_tree_depth - 1;
	while (i >= 0) {
		el = path->p_node[i].el;

		/*
		 * Find the extent record just before the one in our
		 * path.
		 */
		for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) {
			if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) {
				if (j == 0) {
					if (i == 0) {
						/*
						 * We've determined that the
						 * path specified is already
						 * the leftmost one - return a
						 * cpos of zero.
						 */
						goto out;
					}

					/*
					 * The leftmost record points to our
					 * leaf - we need to travel up the
					 * tree one level.
					 */
					goto next_node;
				}

				/* The last cluster covered by the record to
				 * our left lands in the left neighbor leaf. */
				*cpos = le32_to_cpu(el->l_recs[j - 1].e_cpos);
				*cpos = *cpos + le32_to_cpu(el->l_recs[j - 1].e_clusters) - 1;
				goto out;
			}
		}

		/*
		 * If we got here, we never found a valid node where
		 * the tree indicated one should be.
		 */
		ocfs2_error(sb,
			    "Invalid extent tree at extent block %llu\n",
			    (unsigned long long)blkno);
		ret = -EROFS;
		goto out;

next_node:
		blkno = path->p_node[i].bh->b_blocknr;
		i--;
	}

out:
	return ret;
}
  1296. static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth,
  1297. struct ocfs2_path *path)
  1298. {
  1299. int credits = (path->p_tree_depth - subtree_depth) * 2 + 1;
  1300. if (handle->h_buffer_credits < credits)
  1301. return ocfs2_extend_trans(handle, credits);
  1302. return 0;
  1303. }
  1304. /*
  1305. * Trap the case where we're inserting into the theoretical range past
  1306. * the _actual_ left leaf range. Otherwise, we'll rotate a record
  1307. * whose cpos is less than ours into the right leaf.
  1308. *
  1309. * It's only necessary to look at the rightmost record of the left
  1310. * leaf because the logic that calls us should ensure that the
  1311. * theoretical ranges in the path components above the leaves are
  1312. * correct.
  1313. */
  1314. static int ocfs2_rotate_requires_path_adjustment(struct ocfs2_path *left_path,
  1315. u32 insert_cpos)
  1316. {
  1317. struct ocfs2_extent_list *left_el;
  1318. struct ocfs2_extent_rec *rec;
  1319. int next_free;
  1320. left_el = path_leaf_el(left_path);
  1321. next_free = le16_to_cpu(left_el->l_next_free_rec);
  1322. rec = &left_el->l_recs[next_free - 1];
  1323. if (insert_cpos > le32_to_cpu(rec->e_cpos))
  1324. return 1;
  1325. return 0;
  1326. }
/*
 * Rotate all the records in a btree right one record, starting at insert_cpos.
 *
 * The path to the rightmost leaf should be passed in.
 *
 * The array is assumed to be large enough to hold an entire path (tree depth).
 *
 * Upon successful return from this function:
 *
 * - The 'right_path' array will contain a path to the leaf block
 *   whose range contains insert_cpos.
 * - That leaf block will have a single empty extent in list index 0.
 * - In the case that the rotation requires a post-insert update,
 *   *ret_left_path will contain a valid path which can be passed to
 *   ocfs2_insert_path(). Ownership of that path transfers to the caller.
 */
static int ocfs2_rotate_tree_right(struct inode *inode,
				   handle_t *handle,
				   u32 insert_cpos,
				   struct ocfs2_path *right_path,
				   struct ocfs2_path **ret_left_path)
{
	int ret, start;
	u32 cpos;
	struct ocfs2_path *left_path = NULL;

	*ret_left_path = NULL;

	/* The left path shares root bh/el with the right path. */
	left_path = ocfs2_new_path(path_root_bh(right_path),
				   path_root_el(right_path));
	if (!left_path) {
		ret = -ENOMEM;
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path, &cpos);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	mlog(0, "Insert: %u, first left path cpos: %u\n", insert_cpos, cpos);

	/*
	 * What we want to do here is:
	 *
	 * 1) Start with the rightmost path.
	 *
	 * 2) Determine a path to the leaf block directly to the left
	 *    of that leaf.
	 *
	 * 3) Determine the 'subtree root' - the lowest level tree node
	 *    which contains a path to both leaves.
	 *
	 * 4) Rotate the subtree.
	 *
	 * 5) Find the next subtree by considering the left path to be
	 *    the new right path.
	 *
	 * The check at the top of this while loop also accepts
	 * insert_cpos == cpos because cpos is only a _theoretical_
	 * value to get us the left path - insert_cpos might very well
	 * be filling that hole.
	 *
	 * Stop at a cpos of '0' because we either started at the
	 * leftmost branch (i.e., a tree with one branch and a
	 * rotation inside of it), or we've gone as far as we can in
	 * rotating subtrees.
	 */
	while (cpos && insert_cpos <= cpos) {
		mlog(0, "Rotating a tree: ins. cpos: %u, left path cpos: %u\n",
		     insert_cpos, cpos);

		ret = ocfs2_find_path(inode, left_path, cpos);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		mlog_bug_on_msg(path_leaf_bh(left_path) ==
				path_leaf_bh(right_path),
				"Inode %lu: error during insert of %u "
				"(left path cpos %u) results in two identical "
				"paths ending at %llu\n",
				inode->i_ino, insert_cpos, cpos,
				(unsigned long long)
				path_leaf_bh(left_path)->b_blocknr);

		if (ocfs2_rotate_requires_path_adjustment(left_path,
							  insert_cpos)) {
			mlog(0, "Path adjustment required\n");

			/*
			 * We've rotated the tree as much as we
			 * should. The rest is up to
			 * ocfs2_insert_path() to complete, after the
			 * record insertion. We indicate this
			 * situation by returning the left path.
			 *
			 * The reason we don't adjust the records here
			 * before the record insert is that an error
			 * later might break the rule where a parent
			 * record e_cpos will reflect the actual
			 * e_cpos of the 1st nonempty record of the
			 * child list.
			 */
			*ret_left_path = left_path;
			goto out_ret_path;
		}

		start = ocfs2_find_subtree_root(inode, left_path, right_path);

		mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n",
		     start,
		     (unsigned long long) right_path->p_node[start].bh->b_blocknr,
		     right_path->p_tree_depth);

		/* Ensure enough journal credits before modifying blocks. */
		ret = ocfs2_extend_rotate_transaction(handle, start,
						      right_path);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		ret = ocfs2_rotate_subtree_right(inode, handle, left_path,
						 right_path, start);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		/*
		 * There is no need to re-read the next right path
		 * as we know that it'll be our current left
		 * path. Optimize by copying values instead.
		 */
		ocfs2_mv_path(right_path, left_path);

		ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path,
						    &cpos);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
	}

out:
	ocfs2_free_path(left_path);

out_ret_path:
	return ret;
}
/*
 * Do the final bits of extent record insertion at the target leaf
 * list. If this leaf is part of an allocation tree, it is assumed
 * that the tree above has been prepared.
 *
 * Exactly one of four cases applies, tried in order:
 *   1) contiguous insert - merge into the record at ins_contig_index
 *   2) empty leaf - take over slot 0
 *   3) tail append - add a new record after the last used one
 *   4) rotate - make room by shifting records within the leaf
 */
static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec,
				 struct ocfs2_extent_list *el,
				 struct ocfs2_insert_type *insert,
				 struct inode *inode)
{
	int i = insert->ins_contig_index;
	unsigned int range;
	struct ocfs2_extent_rec *rec;

	/* Callers must hand us a leaf list (depth 0). */
	BUG_ON(el->l_tree_depth);

	/*
	 * Contiguous insert - either left or right.
	 */
	if (insert->ins_contig != CONTIG_NONE) {
		rec = &el->l_recs[i];
		if (insert->ins_contig == CONTIG_LEFT) {
			/*
			 * New extent sits directly before the existing
			 * record - pull the record's start (cpos and
			 * disk block) back to the insert's start.
			 */
			rec->e_blkno = insert_rec->e_blkno;
			rec->e_cpos = insert_rec->e_cpos;
		}
		/* Either direction, the merged record grows by the insert. */
		le32_add_cpu(&rec->e_clusters,
			     le32_to_cpu(insert_rec->e_clusters));
		return;
	}

	/*
	 * Handle insert into an empty leaf.
	 */
	if (le16_to_cpu(el->l_next_free_rec) == 0 ||
	    ((le16_to_cpu(el->l_next_free_rec) == 1) &&
	     ocfs2_is_empty_extent(&el->l_recs[0]))) {
		el->l_recs[0] = *insert_rec;
		el->l_next_free_rec = cpu_to_le16(1);
		return;
	}

	/*
	 * Appending insert.
	 */
	if (insert->ins_appending == APPEND_TAIL) {
		i = le16_to_cpu(el->l_next_free_rec) - 1;
		rec = &el->l_recs[i];
		range = le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters);
		/* An append must land at or past the end of the last record. */
		BUG_ON(le32_to_cpu(insert_rec->e_cpos) < range);

		/* Appending requires a free slot in this leaf. */
		mlog_bug_on_msg(le16_to_cpu(el->l_next_free_rec) >=
				le16_to_cpu(el->l_count),
				"inode %lu, depth %u, count %u, next free %u, "
				"rec.cpos %u, rec.clusters %u, "
				"insert.cpos %u, insert.clusters %u\n",
				inode->i_ino,
				le16_to_cpu(el->l_tree_depth),
				le16_to_cpu(el->l_count),
				le16_to_cpu(el->l_next_free_rec),
				le32_to_cpu(el->l_recs[i].e_cpos),
				le32_to_cpu(el->l_recs[i].e_clusters),
				le32_to_cpu(insert_rec->e_cpos),
				le32_to_cpu(insert_rec->e_clusters));
		i++;
		el->l_recs[i] = *insert_rec;
		le16_add_cpu(&el->l_next_free_rec, 1);
		return;
	}

	/*
	 * Ok, we have to rotate.
	 *
	 * At this point, it is safe to assume that inserting into an
	 * empty leaf and appending to a leaf have both been handled
	 * above.
	 *
	 * This leaf needs to have space, either by the empty 1st
	 * extent record, or by virtue of an l_next_rec < l_count.
	 */
	ocfs2_rotate_leaf(el, insert_rec);
}
  1538. static inline void ocfs2_update_dinode_clusters(struct inode *inode,
  1539. struct ocfs2_dinode *di,
  1540. u32 clusters)
  1541. {
  1542. le32_add_cpu(&di->i_clusters, clusters);
  1543. spin_lock(&OCFS2_I(inode)->ip_lock);
  1544. OCFS2_I(inode)->ip_clusters = le32_to_cpu(di->i_clusters);
  1545. spin_unlock(&OCFS2_I(inode)->ip_lock);
  1546. }
/*
 * Prepare the tree above a leaf for an appending insert of
 * 'insert_rec' via 'right_path': extend the rightmost record at every
 * non-leaf level so the interior tree covers the appended clusters,
 * dirtying each block in the journal.
 *
 * If the append lands on the leftmost edge of an empty leaf,
 * *ret_left_path is set to the path immediately to the left so
 * ocfs2_insert_path() can fix up its edge records; otherwise it is
 * NULL. The caller owns (and must free) the returned path.
 */
static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
				    struct ocfs2_extent_rec *insert_rec,
				    struct ocfs2_path *right_path,
				    struct ocfs2_path **ret_left_path)
{
	int ret, i, next_free;
	struct buffer_head *bh;
	struct ocfs2_extent_list *el;
	struct ocfs2_path *left_path = NULL;

	*ret_left_path = NULL;

	/*
	 * If our appending insert is at the leftmost edge of a leaf,
	 * then we might need to update the rightmost records of the
	 * neighboring path.
	 */
	el = path_leaf_el(right_path);
	next_free = le16_to_cpu(el->l_next_free_rec);
	if (next_free == 0 ||
	    (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0]))) {
		u32 left_cpos;

		ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path,
						    &left_cpos);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		mlog(0, "Append may need a left path update. cpos: %u, "
		     "left_cpos: %u\n", le32_to_cpu(insert_rec->e_cpos),
		     left_cpos);

		/*
		 * No need to worry if the append is already in the
		 * leftmost leaf.
		 */
		if (left_cpos) {
			left_path = ocfs2_new_path(path_root_bh(right_path),
						   path_root_el(right_path));
			if (!left_path) {
				ret = -ENOMEM;
				mlog_errno(ret);
				goto out;
			}

			ret = ocfs2_find_path(inode, left_path, left_cpos);
			if (ret) {
				mlog_errno(ret);
				goto out;
			}

			/*
			 * ocfs2_insert_path() will pass the left_path to the
			 * journal for us.
			 */
		}
	}

	ret = ocfs2_journal_access_path(inode, handle, right_path);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	/* Walk from the root down, updating the rightmost record at
	 * each interior level. */
	el = path_root_el(right_path);
	bh = path_root_bh(right_path);
	i = 0;
	while (1) {
		next_free = le16_to_cpu(el->l_next_free_rec);
		if (next_free == 0) {
			ocfs2_error(inode->i_sb,
				    "Dinode %llu has a bad extent list",
				    (unsigned long long)OCFS2_I(inode)->ip_blkno);
			ret = -EIO;
			goto out;
		}

		/*
		 * Little-endian arithmetic; net effect is
		 * e_clusters = (insert cpos + insert clusters) - e_cpos,
		 * i.e. the record now runs to the end of the appended
		 * extent.
		 */
		el->l_recs[next_free - 1].e_clusters = insert_rec->e_cpos;
		le32_add_cpu(&el->l_recs[next_free - 1].e_clusters,
			     le32_to_cpu(insert_rec->e_clusters));
		le32_add_cpu(&el->l_recs[next_free - 1].e_clusters,
			     -le32_to_cpu(el->l_recs[next_free - 1].e_cpos));

		ret = ocfs2_journal_dirty(handle, bh);
		if (ret)
			mlog_errno(ret);

		/* Stop before the leaf - only interior levels are updated. */
		if (++i >= right_path->p_tree_depth)
			break;

		bh = right_path->p_node[i].bh;
		el = right_path->p_node[i].el;
	}

	*ret_left_path = left_path;
	ret = 0;
out:
	if (ret != 0)
		ocfs2_free_path(left_path);

	return ret;
}
/*
 * This function only does inserts on an allocation b-tree. For dinode
 * lists, ocfs2_insert_at_leaf() is called directly.
 *
 * right_path is the path we want to do the actual insert
 * in. left_path should only be passed in if we need to update that
 * portion of the tree after an edge insert.
 *
 * Returns 0 on success, negative errno otherwise. Journals both
 * paths, does the leaf insert, then fixes up the left edge when a
 * left_path was supplied.
 */
static int ocfs2_insert_path(struct inode *inode,
			     handle_t *handle,
			     struct ocfs2_path *left_path,
			     struct ocfs2_path *right_path,
			     struct ocfs2_extent_rec *insert_rec,
			     struct ocfs2_insert_type *insert)
{
	int ret, subtree_index;
	struct buffer_head *leaf_bh = path_leaf_bh(right_path);
	struct ocfs2_extent_list *el;

	/*
	 * Pass both paths to the journal. The majority of inserts
	 * will be touching all components anyway.
	 */
	ret = ocfs2_journal_access_path(inode, handle, right_path);
	if (ret < 0) {
		mlog_errno(ret);
		goto out;
	}

	if (left_path) {
		int credits = handle->h_buffer_credits;

		/*
		 * There's a chance that left_path got passed back to
		 * us without being accounted for in the
		 * journal. Extend our transaction here to be sure we
		 * can change those blocks.
		 */
		credits += left_path->p_tree_depth;

		ret = ocfs2_extend_trans(handle, credits);
		if (ret < 0) {
			mlog_errno(ret);
			goto out;
		}

		ret = ocfs2_journal_access_path(inode, handle, left_path);
		if (ret < 0) {
			mlog_errno(ret);
			goto out;
		}
	}

	/* The record itself always lands in the right path's leaf. */
	el = path_leaf_el(right_path);

	ocfs2_insert_at_leaf(insert_rec, el, insert, inode);
	ret = ocfs2_journal_dirty(handle, leaf_bh);
	if (ret)
		mlog_errno(ret);

	if (left_path) {
		/*
		 * The rotate code has indicated that we need to fix
		 * up portions of the tree after the insert.
		 *
		 * XXX: Should we extend the transaction here?
		 */
		subtree_index = ocfs2_find_subtree_root(inode, left_path,
							right_path);
		ocfs2_complete_edge_insert(inode, handle, left_path,
					   right_path, subtree_index);
	}

	ret = 0;
out:
	return ret;
}
/*
 * Top-level insert dispatcher: journal the dinode, pick the target
 * leaf (rotating or preparing an append as required by 'type'), do
 * the leaf insert, then update the inode's cluster count and dirty
 * di_bh. 'type' comes from ocfs2_figure_insert_type().
 */
static int ocfs2_do_insert_extent(struct inode *inode,
				  handle_t *handle,
				  struct buffer_head *di_bh,
				  struct ocfs2_extent_rec *insert_rec,
				  struct ocfs2_insert_type *type)
{
	int ret, rotate = 0;
	u32 cpos;
	struct ocfs2_path *right_path = NULL;
	struct ocfs2_path *left_path = NULL;
	struct ocfs2_dinode *di;
	struct ocfs2_extent_list *el;

	di = (struct ocfs2_dinode *) di_bh->b_data;
	el = &di->id2.i_list;

	ret = ocfs2_journal_access(handle, inode, di_bh,
				   OCFS2_JOURNAL_ACCESS_WRITE);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	/* Depth zero: the dinode list is itself the leaf. */
	if (le16_to_cpu(el->l_tree_depth) == 0) {
		ocfs2_insert_at_leaf(insert_rec, el, type, inode);
		goto out_update_clusters;
	}

	right_path = ocfs2_new_inode_path(di_bh);
	if (!right_path) {
		ret = -ENOMEM;
		mlog_errno(ret);
		goto out;
	}

	/*
	 * Determine the path to start with. Rotations need the
	 * rightmost path, everything else can go directly to the
	 * target leaf.
	 */
	cpos = le32_to_cpu(insert_rec->e_cpos);
	if (type->ins_appending == APPEND_NONE &&
	    type->ins_contig == CONTIG_NONE) {
		rotate = 1;
		cpos = UINT_MAX;
	}

	ret = ocfs2_find_path(inode, right_path, cpos);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	/*
	 * Rotations and appends need special treatment - they modify
	 * parts of the tree above them.
	 *
	 * Both might pass back a path immediately to the left of the
	 * one being inserted to. This will cause
	 * ocfs2_insert_path() to modify the rightmost records of
	 * left_path to account for an edge insert.
	 *
	 * XXX: When modifying this code, keep in mind that an insert
	 * can wind up skipping both of these two special cases...
	 */
	if (rotate) {
		ret = ocfs2_rotate_tree_right(inode, handle,
					      le32_to_cpu(insert_rec->e_cpos),
					      right_path, &left_path);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
	} else if (type->ins_appending == APPEND_TAIL
		   && type->ins_contig != CONTIG_LEFT) {
		ret = ocfs2_append_rec_to_path(inode, handle, insert_rec,
					       right_path, &left_path);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
	}

	ret = ocfs2_insert_path(inode, handle, left_path, right_path,
				insert_rec, type);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

out_update_clusters:
	ocfs2_update_dinode_clusters(inode, di,
				     le32_to_cpu(insert_rec->e_clusters));

	ret = ocfs2_journal_dirty(handle, di_bh);
	if (ret)
		mlog_errno(ret);

out:
	/* ocfs2_free_path() tolerates NULL paths. */
	ocfs2_free_path(left_path);
	ocfs2_free_path(right_path);

	return ret;
}
  1796. static void ocfs2_figure_contig_type(struct inode *inode,
  1797. struct ocfs2_insert_type *insert,
  1798. struct ocfs2_extent_list *el,
  1799. struct ocfs2_extent_rec *insert_rec)
  1800. {
  1801. int i;
  1802. enum ocfs2_contig_type contig_type = CONTIG_NONE;
  1803. for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
  1804. contig_type = ocfs2_extent_contig(inode, &el->l_recs[i],
  1805. insert_rec);
  1806. if (contig_type != CONTIG_NONE) {
  1807. insert->ins_contig_index = i;
  1808. break;
  1809. }
  1810. }
  1811. insert->ins_contig = contig_type;
  1812. }
  1813. /*
  1814. * This should only be called against the righmost leaf extent list.
  1815. *
  1816. * ocfs2_figure_appending_type() will figure out whether we'll have to
  1817. * insert at the tail of the rightmost leaf.
  1818. *
  1819. * This should also work against the dinode list for tree's with 0
  1820. * depth. If we consider the dinode list to be the rightmost leaf node
  1821. * then the logic here makes sense.
  1822. */
  1823. static void ocfs2_figure_appending_type(struct ocfs2_insert_type *insert,
  1824. struct ocfs2_extent_list *el,
  1825. struct ocfs2_extent_rec *insert_rec)
  1826. {
  1827. int i;
  1828. u32 cpos = le32_to_cpu(insert_rec->e_cpos);
  1829. struct ocfs2_extent_rec *rec;
  1830. insert->ins_appending = APPEND_NONE;
  1831. BUG_ON(el->l_tree_depth);
  1832. if (!el->l_next_free_rec)
  1833. goto set_tail_append;
  1834. if (ocfs2_is_empty_extent(&el->l_recs[0])) {
  1835. /* Were all records empty? */
  1836. if (le16_to_cpu(el->l_next_free_rec) == 1)
  1837. goto set_tail_append;
  1838. }
  1839. i = le16_to_cpu(el->l_next_free_rec) - 1;
  1840. rec = &el->l_recs[i];
  1841. if (cpos >= (le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)))
  1842. goto set_tail_append;
  1843. return;
  1844. set_tail_append:
  1845. insert->ins_appending = APPEND_TAIL;
  1846. }
  1847. /*
  1848. * Helper function called at the begining of an insert.
  1849. *
  1850. * This computes a few things that are commonly used in the process of
  1851. * inserting into the btree:
  1852. * - Whether the new extent is contiguous with an existing one.
  1853. * - The current tree depth.
  1854. * - Whether the insert is an appending one.
  1855. * - The total # of free records in the tree.
  1856. *
  1857. * All of the information is stored on the ocfs2_insert_type
  1858. * structure.
  1859. */
  1860. static int ocfs2_figure_insert_type(struct inode *inode,
  1861. struct buffer_head *di_bh,
  1862. struct buffer_head **last_eb_bh,
  1863. struct ocfs2_extent_rec *insert_rec,
  1864. struct ocfs2_insert_type *insert)
  1865. {
  1866. int ret;
  1867. struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
  1868. struct ocfs2_extent_block *eb;
  1869. struct ocfs2_extent_list *el;
  1870. struct ocfs2_path *path = NULL;
  1871. struct buffer_head *bh = NULL;
  1872. el = &di->id2.i_list;
  1873. insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth);
  1874. if (el->l_tree_depth) {
  1875. /*
  1876. * If we have tree depth, we read in the
  1877. * rightmost extent block ahead of time as
  1878. * ocfs2_figure_insert_type() and ocfs2_add_branch()
  1879. * may want it later.
  1880. */
  1881. ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
  1882. le64_to_cpu(di->i_last_eb_blk), &bh,
  1883. OCFS2_BH_CACHED, inode);
  1884. if (ret) {
  1885. mlog_exit(ret);
  1886. goto out;
  1887. }
  1888. eb = (struct ocfs2_extent_block *) bh->b_data;
  1889. el = &eb->h_list;
  1890. }
  1891. /*
  1892. * Unless we have a contiguous insert, we'll need to know if
  1893. * there is room left in our allocation tree for another
  1894. * extent record.
  1895. *
  1896. * XXX: This test is simplistic, we can search for empty
  1897. * extent records too.
  1898. */
  1899. insert->ins_free_records = le16_to_cpu(el->l_count) -
  1900. le16_to_cpu(el->l_next_free_rec);
  1901. if (!insert->ins_tree_depth) {
  1902. ocfs2_figure_contig_type(inode, insert, el, insert_rec);
  1903. ocfs2_figure_appending_type(insert, el, insert_rec);
  1904. return 0;
  1905. }
  1906. path = ocfs2_new_inode_path(di_bh);
  1907. if (!path) {
  1908. ret = -ENOMEM;
  1909. mlog_errno(ret);
  1910. goto out;
  1911. }
  1912. /*
  1913. * In the case that we're inserting past what the tree
  1914. * currently accounts for, ocfs2_find_path() will return for
  1915. * us the rightmost tree path. This is accounted for below in
  1916. * the appending code.
  1917. */
  1918. ret = ocfs2_find_path(inode, path, le32_to_cpu(insert_rec->e_cpos));
  1919. if (ret) {
  1920. mlog_errno(ret);
  1921. goto out;
  1922. }
  1923. el = path_leaf_el(path);
  1924. /*
  1925. * Now that we have the path, there's two things we want to determine:
  1926. * 1) Contiguousness (also set contig_index if this is so)
  1927. *
  1928. * 2) Are we doing an append? We can trivially break this up
  1929. * into two types of appends: simple record append, or a
  1930. * rotate inside the tail leaf.
  1931. */
  1932. ocfs2_figure_contig_type(inode, insert, el, insert_rec);
  1933. /*
  1934. * The insert code isn't quite ready to deal with all cases of
  1935. * left contiguousness. Specifically, if it's an insert into
  1936. * the 1st record in a leaf, it will require the adjustment of
  1937. * e_clusters on the last record of the path directly to it's
  1938. * left. For now, just catch that case and fool the layers
  1939. * above us. This works just fine for tree_depth == 0, which
  1940. * is why we allow that above.
  1941. */
  1942. if (insert->ins_contig == CONTIG_LEFT &&
  1943. insert->ins_contig_index == 0)
  1944. insert->ins_contig = CONTIG_NONE;
  1945. /*
  1946. * Ok, so we can simply compare against last_eb to figure out
  1947. * whether the path doesn't exist. This will only happen in
  1948. * the case that we're doing a tail append, so maybe we can
  1949. * take advantage of that information somehow.
  1950. */
  1951. if (le64_to_cpu(di->i_last_eb_blk) == path_leaf_bh(path)->b_blocknr) {
  1952. /*
  1953. * Ok, ocfs2_find_path() returned us the rightmost
  1954. * tree path. This might be an appending insert. There are
  1955. * two cases:
  1956. * 1) We're doing a true append at the tail:
  1957. * -This might even be off the end of the leaf
  1958. * 2) We're "appending" by rotating in the tail
  1959. */
  1960. ocfs2_figure_appending_type(insert, el, insert_rec);
  1961. }
  1962. out:
  1963. ocfs2_free_path(path);
  1964. if (ret == 0)
  1965. *last_eb_bh = bh;
  1966. else
  1967. brelse(bh);
  1968. return ret;
  1969. }
  1970. /*
  1971. * Insert an extent into an inode btree.
  1972. *
  1973. * The caller needs to update fe->i_clusters
  1974. */
  1975. int ocfs2_insert_extent(struct ocfs2_super *osb,
  1976. handle_t *handle,
  1977. struct inode *inode,
  1978. struct buffer_head *fe_bh,
  1979. u32 cpos,
  1980. u64 start_blk,
  1981. u32 new_clusters,
  1982. struct ocfs2_alloc_context *meta_ac)
  1983. {
  1984. int status, shift;
  1985. struct buffer_head *last_eb_bh = NULL;
  1986. struct buffer_head *bh = NULL;
  1987. struct ocfs2_insert_type insert = {0, };
  1988. struct ocfs2_extent_rec rec;
  1989. mlog(0, "add %u clusters at position %u to inode %llu\n",
  1990. new_clusters, cpos, (unsigned long long)OCFS2_I(inode)->ip_blkno);
  1991. mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) &&
  1992. (OCFS2_I(inode)->ip_clusters != cpos),
  1993. "Device %s, asking for sparse allocation: inode %llu, "
  1994. "cpos %u, clusters %u\n",
  1995. osb->dev_str,
  1996. (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos,
  1997. OCFS2_I(inode)->ip_clusters);
  1998. rec.e_cpos = cpu_to_le32(cpos);
  1999. rec.e_blkno = cpu_to_le64(start_blk);
  2000. rec.e_clusters = cpu_to_le32(new_clusters);
  2001. status = ocfs2_figure_insert_type(inode, fe_bh, &last_eb_bh, &rec,
  2002. &insert);
  2003. if (status < 0) {
  2004. mlog_errno(status);
  2005. goto bail;
  2006. }
  2007. mlog(0, "Insert.appending: %u, Insert.Contig: %u, "
  2008. "Insert.contig_index: %d, Insert.free_records: %d, "
  2009. "Insert.tree_depth: %d\n",
  2010. insert.ins_appending, insert.ins_contig, insert.ins_contig_index,
  2011. insert.ins_free_records, insert.ins_tree_depth);
  2012. /*
  2013. * Avoid growing the tree unless we're out of records and the
  2014. * insert type requres one.
  2015. */
  2016. if (insert.ins_contig != CONTIG_NONE || insert.ins_free_records)
  2017. goto out_add;
  2018. shift = ocfs2_find_branch_target(osb, inode, fe_bh, &bh);
  2019. if (shift < 0) {
  2020. status = shift;
  2021. mlog_errno(status);
  2022. goto bail;
  2023. }
  2024. /* We traveled all the way to the bottom of the allocation tree
  2025. * and didn't find room for any more extents - we need to add
  2026. * another tree level */
  2027. if (shift) {
  2028. BUG_ON(bh);
  2029. mlog(0, "need to shift tree depth "
  2030. "(current = %d)\n", insert.ins_tree_depth);
  2031. /* ocfs2_shift_tree_depth will return us a buffer with
  2032. * the new extent block (so we can pass that to
  2033. * ocfs2_add_branch). */
  2034. status = ocfs2_shift_tree_depth(osb, handle, inode, fe_bh,
  2035. meta_ac, &bh);
  2036. if (status < 0) {
  2037. mlog_errno(status);
  2038. goto bail;
  2039. }
  2040. insert.ins_tree_depth++;
  2041. /* Special case: we have room now if we shifted from
  2042. * tree_depth 0 */
  2043. if (insert.ins_tree_depth == 1)
  2044. goto out_add;
  2045. }
  2046. /* call ocfs2_add_branch to add the final part of the tree with
  2047. * the new data. */
  2048. mlog(0, "add branch. bh = %p\n", bh);
  2049. status = ocfs2_add_branch(osb, handle, inode, fe_bh, bh, last_eb_bh,
  2050. meta_ac);
  2051. if (status < 0) {
  2052. mlog_errno(status);
  2053. goto bail;
  2054. }
  2055. out_add:
  2056. /* Finally, we can add clusters. This might rotate the tree for us. */
  2057. status = ocfs2_do_insert_extent(inode, handle, fe_bh, &rec, &insert);
  2058. if (status < 0)
  2059. mlog_errno(status);
  2060. bail:
  2061. if (bh)
  2062. brelse(bh);
  2063. if (last_eb_bh)
  2064. brelse(last_eb_bh);
  2065. mlog_exit(status);
  2066. return status;
  2067. }
  2068. static inline int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb)
  2069. {
  2070. struct buffer_head *tl_bh = osb->osb_tl_bh;
  2071. struct ocfs2_dinode *di;
  2072. struct ocfs2_truncate_log *tl;
  2073. di = (struct ocfs2_dinode *) tl_bh->b_data;
  2074. tl = &di->id2.i_dealloc;
  2075. mlog_bug_on_msg(le16_to_cpu(tl->tl_used) > le16_to_cpu(tl->tl_count),
  2076. "slot %d, invalid truncate log parameters: used = "
  2077. "%u, count = %u\n", osb->slot_num,
  2078. le16_to_cpu(tl->tl_used), le16_to_cpu(tl->tl_count));
  2079. return le16_to_cpu(tl->tl_used) == le16_to_cpu(tl->tl_count);
  2080. }
  2081. static int ocfs2_truncate_log_can_coalesce(struct ocfs2_truncate_log *tl,
  2082. unsigned int new_start)
  2083. {
  2084. unsigned int tail_index;
  2085. unsigned int current_tail;
  2086. /* No records, nothing to coalesce */
  2087. if (!le16_to_cpu(tl->tl_used))
  2088. return 0;
  2089. tail_index = le16_to_cpu(tl->tl_used) - 1;
  2090. current_tail = le32_to_cpu(tl->tl_recs[tail_index].t_start);
  2091. current_tail += le32_to_cpu(tl->tl_recs[tail_index].t_clusters);
  2092. return current_tail == new_start;
  2093. }
/*
 * Append a record covering 'num_clusters' clusters starting at
 * 'start_blk' to the node-local truncate log, coalescing with the
 * tail record when the new range starts exactly where it ends.
 *
 * The caller must hold tl_inode->i_mutex and should have flushed the
 * log first if it was full - a full log gets -ENOSPC here.
 */
static int ocfs2_truncate_log_append(struct ocfs2_super *osb,
				     handle_t *handle,
				     u64 start_blk,
				     unsigned int num_clusters)
{
	int status, index;
	unsigned int start_cluster, tl_count;
	struct inode *tl_inode = osb->osb_tl_inode;
	struct buffer_head *tl_bh = osb->osb_tl_bh;
	struct ocfs2_dinode *di;
	struct ocfs2_truncate_log *tl;

	mlog_entry("start_blk = %llu, num_clusters = %u\n",
		   (unsigned long long)start_blk, num_clusters);

	/* A successful trylock would mean the mutex was NOT held. */
	BUG_ON(mutex_trylock(&tl_inode->i_mutex));

	start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk);

	di = (struct ocfs2_dinode *) tl_bh->b_data;
	tl = &di->id2.i_dealloc;
	if (!OCFS2_IS_VALID_DINODE(di)) {
		OCFS2_RO_ON_INVALID_DINODE(osb->sb, di);
		status = -EIO;
		goto bail;
	}

	tl_count = le16_to_cpu(tl->tl_count);
	mlog_bug_on_msg(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) ||
			tl_count == 0,
			"Truncate record count on #%llu invalid "
			"wanted %u, actual %u\n",
			(unsigned long long)OCFS2_I(tl_inode)->ip_blkno,
			ocfs2_truncate_recs_per_inode(osb->sb),
			le16_to_cpu(tl->tl_count));

	/* Caller should have known to flush before calling us. */
	index = le16_to_cpu(tl->tl_used);
	if (index >= tl_count) {
		status = -ENOSPC;
		mlog_errno(status);
		goto bail;
	}

	status = ocfs2_journal_access(handle, tl_inode, tl_bh,
				      OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	mlog(0, "Log truncate of %u clusters starting at cluster %u to "
	     "%llu (index = %d)\n", num_clusters, start_cluster,
	     (unsigned long long)OCFS2_I(tl_inode)->ip_blkno, index);

	if (ocfs2_truncate_log_can_coalesce(tl, start_cluster)) {
		/*
		 * Move index back to the record we are coalescing with.
		 * ocfs2_truncate_log_can_coalesce() guarantees nonzero
		 */
		index--;

		/* Grow the existing tail record; t_start stays put. */
		num_clusters += le32_to_cpu(tl->tl_recs[index].t_clusters);
		mlog(0, "Coalesce with index %u (start = %u, clusters = %u)\n",
		     index, le32_to_cpu(tl->tl_recs[index].t_start),
		     num_clusters);
	} else {
		tl->tl_recs[index].t_start = cpu_to_le32(start_cluster);
		tl->tl_used = cpu_to_le16(index + 1);
	}
	tl->tl_recs[index].t_clusters = cpu_to_le32(num_clusters);

	status = ocfs2_journal_dirty(handle, tl_bh);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

bail:
	mlog_exit(status);
	return status;
}
/*
 * Give every cluster range recorded in the truncate log back to the
 * global bitmap, processing records newest-first. tl_used is dropped
 * (and the dinode journaled) before each record's clusters are freed,
 * so a record never remains visible in the log after its space has
 * been released.
 */
static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
					 handle_t *handle,
					 struct inode *data_alloc_inode,
					 struct buffer_head *data_alloc_bh)
{
	int status = 0;
	int i;
	unsigned int num_clusters;
	u64 start_blk;
	struct ocfs2_truncate_rec rec;
	struct ocfs2_dinode *di;
	struct ocfs2_truncate_log *tl;
	struct inode *tl_inode = osb->osb_tl_inode;
	struct buffer_head *tl_bh = osb->osb_tl_bh;

	mlog_entry_void();

	di = (struct ocfs2_dinode *) tl_bh->b_data;
	tl = &di->id2.i_dealloc;
	i = le16_to_cpu(tl->tl_used) - 1;
	while (i >= 0) {
		/* Caller has given us at least enough credits to
		 * update the truncate log dinode */
		status = ocfs2_journal_access(handle, tl_inode, tl_bh,
					      OCFS2_JOURNAL_ACCESS_WRITE);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}

		/* Remove record i from the log before freeing its clusters. */
		tl->tl_used = cpu_to_le16(i);

		status = ocfs2_journal_dirty(handle, tl_bh);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}

		/* TODO: Perhaps we can calculate the bulk of the
		 * credits up front rather than extending like
		 * this. */
		status = ocfs2_extend_trans(handle,
					    OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}

		rec = tl->tl_recs[i];
		start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb,
						     le32_to_cpu(rec.t_start));
		num_clusters = le32_to_cpu(rec.t_clusters);

		/* if start_blk is not set, we ignore the record as
		 * invalid. */
		if (start_blk) {
			mlog(0, "free record %d, start = %u, clusters = %u\n",
			     i, le32_to_cpu(rec.t_start), num_clusters);

			status = ocfs2_free_clusters(handle, data_alloc_inode,
						     data_alloc_bh, start_blk,
						     num_clusters);
			if (status < 0) {
				mlog_errno(status);
				goto bail;
			}
		}
		i--;
	}

bail:
	mlog_exit(status);
	return status;
}
/*
 * Flush this node's truncate log: take the global bitmap inode's
 * mutex and cluster lock, start a transaction, and replay (free) all
 * logged records via ocfs2_replay_truncate_records().
 *
 * Expects you to already be holding tl_inode->i_mutex.
 */
static int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
{
	int status;
	unsigned int num_to_flush;
	handle_t *handle;
	struct inode *tl_inode = osb->osb_tl_inode;
	struct inode *data_alloc_inode = NULL;
	struct buffer_head *tl_bh = osb->osb_tl_bh;
	struct buffer_head *data_alloc_bh = NULL;
	struct ocfs2_dinode *di;
	struct ocfs2_truncate_log *tl;

	mlog_entry_void();

	/* A successful trylock would mean the mutex was NOT held. */
	BUG_ON(mutex_trylock(&tl_inode->i_mutex));

	di = (struct ocfs2_dinode *) tl_bh->b_data;
	tl = &di->id2.i_dealloc;
	if (!OCFS2_IS_VALID_DINODE(di)) {
		OCFS2_RO_ON_INVALID_DINODE(osb->sb, di);
		status = -EIO;
		goto out;
	}

	num_to_flush = le16_to_cpu(tl->tl_used);
	mlog(0, "Flush %u records from truncate log #%llu\n",
	     num_to_flush, (unsigned long long)OCFS2_I(tl_inode)->ip_blkno);
	/* Empty log - nothing to do. */
	if (!num_to_flush) {
		status = 0;
		goto out;
	}

	/* Freed clusters go back to the global bitmap. */
	data_alloc_inode = ocfs2_get_system_file_inode(osb,
						       GLOBAL_BITMAP_SYSTEM_INODE,
						       OCFS2_INVALID_SLOT);
	if (!data_alloc_inode) {
		status = -EINVAL;
		mlog(ML_ERROR, "Could not get bitmap inode!\n");
		goto out;
	}

	mutex_lock(&data_alloc_inode->i_mutex);

	status = ocfs2_meta_lock(data_alloc_inode, &data_alloc_bh, 1);
	if (status < 0) {
		mlog_errno(status);
		goto out_mutex;
	}

	handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
	if (IS_ERR(handle)) {
		status = PTR_ERR(handle);
		mlog_errno(status);
		goto out_unlock;
	}

	status = ocfs2_replay_truncate_records(osb, handle, data_alloc_inode,
					       data_alloc_bh);
	if (status < 0)
		mlog_errno(status);

	ocfs2_commit_trans(osb, handle);

out_unlock:
	brelse(data_alloc_bh);
	ocfs2_meta_unlock(data_alloc_inode, 1);

out_mutex:
	mutex_unlock(&data_alloc_inode->i_mutex);
	iput(data_alloc_inode);

out:
	mlog_exit(status);
	return status;
}
  2292. int ocfs2_flush_truncate_log(struct ocfs2_super *osb)
  2293. {
  2294. int status;
  2295. struct inode *tl_inode = osb->osb_tl_inode;
  2296. mutex_lock(&tl_inode->i_mutex);
  2297. status = __ocfs2_flush_truncate_log(osb);
  2298. mutex_unlock(&tl_inode->i_mutex);
  2299. return status;
  2300. }
  2301. static void ocfs2_truncate_log_worker(struct work_struct *work)
  2302. {
  2303. int status;
  2304. struct ocfs2_super *osb =
  2305. container_of(work, struct ocfs2_super,
  2306. osb_truncate_log_wq.work);
  2307. mlog_entry_void();
  2308. status = ocfs2_flush_truncate_log(osb);
  2309. if (status < 0)
  2310. mlog_errno(status);
  2311. mlog_exit(status);
  2312. }
  2313. #define OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL (2 * HZ)
  2314. void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,
  2315. int cancel)
  2316. {
  2317. if (osb->osb_tl_inode) {
  2318. /* We want to push off log flushes while truncates are
  2319. * still running. */
  2320. if (cancel)
  2321. cancel_delayed_work(&osb->osb_truncate_log_wq);
  2322. queue_delayed_work(ocfs2_wq, &osb->osb_truncate_log_wq,
  2323. OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL);
  2324. }
  2325. }
  2326. static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb,
  2327. int slot_num,
  2328. struct inode **tl_inode,
  2329. struct buffer_head **tl_bh)
  2330. {
  2331. int status;
  2332. struct inode *inode = NULL;
  2333. struct buffer_head *bh = NULL;
  2334. inode = ocfs2_get_system_file_inode(osb,
  2335. TRUNCATE_LOG_SYSTEM_INODE,
  2336. slot_num);
  2337. if (!inode) {
  2338. status = -EINVAL;
  2339. mlog(ML_ERROR, "Could not get load truncate log inode!\n");
  2340. goto bail;
  2341. }
  2342. status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh,
  2343. OCFS2_BH_CACHED, inode);
  2344. if (status < 0) {
  2345. iput(inode);
  2346. mlog_errno(status);
  2347. goto bail;
  2348. }
  2349. *tl_inode = inode;
  2350. *tl_bh = bh;
  2351. bail:
  2352. mlog_exit(status);
  2353. return status;
  2354. }
/* called during the 1st stage of node recovery. we stamp a clean
 * truncate log and pass back a copy for processing later. if the
 * truncate log does not require processing, a *tl_copy is set to
 * NULL. */
int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
				      int slot_num,
				      struct ocfs2_dinode **tl_copy)
{
	int status;
	struct inode *tl_inode = NULL;
	struct buffer_head *tl_bh = NULL;
	struct ocfs2_dinode *di;
	struct ocfs2_truncate_log *tl;

	*tl_copy = NULL;

	mlog(0, "recover truncate log from slot %d\n", slot_num);

	/* Load the dead slot's truncate log inode and its dinode block. */
	status = ocfs2_get_truncate_log_info(osb, slot_num, &tl_inode, &tl_bh);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	di = (struct ocfs2_dinode *) tl_bh->b_data;
	tl = &di->id2.i_dealloc;
	if (!OCFS2_IS_VALID_DINODE(di)) {
		OCFS2_RO_ON_INVALID_DINODE(tl_inode->i_sb, di);
		status = -EIO;
		goto bail;
	}

	if (le16_to_cpu(tl->tl_used)) {
		mlog(0, "We'll have %u logs to recover\n",
		     le16_to_cpu(tl->tl_used));

		/* Caller owns the copy and must kfree() it once recovery
		 * processing is done. */
		*tl_copy = kmalloc(tl_bh->b_size, GFP_KERNEL);
		if (!(*tl_copy)) {
			status = -ENOMEM;
			mlog_errno(status);
			goto bail;
		}

		/* Assuming the write-out below goes well, this copy
		 * will be passed back to recovery for processing. */
		memcpy(*tl_copy, tl_bh->b_data, tl_bh->b_size);

		/* All we need to do to clear the truncate log is set
		 * tl_used. */
		tl->tl_used = 0;

		/* Stamp the cleared log back to disk so a second crash
		 * during recovery can't replay these records twice. */
		status = ocfs2_write_block(osb, tl_bh, tl_inode);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}
	}

bail:
	if (tl_inode)
		iput(tl_inode);
	if (tl_bh)
		brelse(tl_bh);

	/* Don't hand a copy back on error. */
	if (status < 0 && (*tl_copy)) {
		kfree(*tl_copy);
		*tl_copy = NULL;
	}

	mlog_exit(status);
	return status;
}
/*
 * Second stage of truncate log recovery: replay every record from the
 * copy stamped out by ocfs2_begin_truncate_log_recovery() into our own
 * truncate log, flushing ours whenever it runs out of free records.
 */
int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
					 struct ocfs2_dinode *tl_copy)
{
	int status = 0;
	int i;
	unsigned int clusters, num_recs, start_cluster;
	u64 start_blk;
	handle_t *handle;
	struct inode *tl_inode = osb->osb_tl_inode;
	struct ocfs2_truncate_log *tl;

	mlog_entry_void();

	/* We should only ever be replaying another slot's log, never
	 * our own. */
	if (OCFS2_I(tl_inode)->ip_blkno == le64_to_cpu(tl_copy->i_blkno)) {
		mlog(ML_ERROR, "Asked to recover my own truncate log!\n");
		return -EINVAL;
	}

	tl = &tl_copy->id2.i_dealloc;
	num_recs = le16_to_cpu(tl->tl_used);
	mlog(0, "cleanup %u records from %llu\n", num_recs,
	     (unsigned long long)tl_copy->i_blkno);

	mutex_lock(&tl_inode->i_mutex);
	for(i = 0; i < num_recs; i++) {
		/* Make sure our local log has a free record before
		 * appending to it. */
		if (ocfs2_truncate_log_needs_flush(osb)) {
			status = __ocfs2_flush_truncate_log(osb);
			if (status < 0) {
				mlog_errno(status);
				goto bail_up;
			}
		}

		/* One small transaction per replayed record. */
		handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
		if (IS_ERR(handle)) {
			status = PTR_ERR(handle);
			mlog_errno(status);
			goto bail_up;
		}

		clusters = le32_to_cpu(tl->tl_recs[i].t_clusters);
		start_cluster = le32_to_cpu(tl->tl_recs[i].t_start);
		start_blk = ocfs2_clusters_to_blocks(osb->sb, start_cluster);

		status = ocfs2_truncate_log_append(osb, handle,
						   start_blk, clusters);
		ocfs2_commit_trans(osb, handle);
		if (status < 0) {
			mlog_errno(status);
			goto bail_up;
		}
	}

bail_up:
	mutex_unlock(&tl_inode->i_mutex);

	mlog_exit(status);
	return status;
}
  2465. void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb)
  2466. {
  2467. int status;
  2468. struct inode *tl_inode = osb->osb_tl_inode;
  2469. mlog_entry_void();
  2470. if (tl_inode) {
  2471. cancel_delayed_work(&osb->osb_truncate_log_wq);
  2472. flush_workqueue(ocfs2_wq);
  2473. status = ocfs2_flush_truncate_log(osb);
  2474. if (status < 0)
  2475. mlog_errno(status);
  2476. brelse(osb->osb_tl_bh);
  2477. iput(osb->osb_tl_inode);
  2478. }
  2479. mlog_exit_void();
  2480. }
  2481. int ocfs2_truncate_log_init(struct ocfs2_super *osb)
  2482. {
  2483. int status;
  2484. struct inode *tl_inode = NULL;
  2485. struct buffer_head *tl_bh = NULL;
  2486. mlog_entry_void();
  2487. status = ocfs2_get_truncate_log_info(osb,
  2488. osb->slot_num,
  2489. &tl_inode,
  2490. &tl_bh);
  2491. if (status < 0)
  2492. mlog_errno(status);
  2493. /* ocfs2_truncate_log_shutdown keys on the existence of
  2494. * osb->osb_tl_inode so we don't set any of the osb variables
  2495. * until we're sure all is well. */
  2496. INIT_DELAYED_WORK(&osb->osb_truncate_log_wq,
  2497. ocfs2_truncate_log_worker);
  2498. osb->osb_tl_bh = tl_bh;
  2499. osb->osb_tl_inode = tl_inode;
  2500. mlog_exit(status);
  2501. return status;
  2502. }
  2503. /* This function will figure out whether the currently last extent
  2504. * block will be deleted, and if it will, what the new last extent
  2505. * block will be so we can update his h_next_leaf_blk field, as well
  2506. * as the dinodes i_last_eb_blk */
  2507. static int ocfs2_find_new_last_ext_blk(struct inode *inode,
  2508. unsigned int clusters_to_del,
  2509. struct ocfs2_path *path,
  2510. struct buffer_head **new_last_eb)
  2511. {
  2512. int next_free, ret = 0;
  2513. u32 cpos;
  2514. struct ocfs2_extent_rec *rec;
  2515. struct ocfs2_extent_block *eb;
  2516. struct ocfs2_extent_list *el;
  2517. struct buffer_head *bh = NULL;
  2518. *new_last_eb = NULL;
  2519. /* we have no tree, so of course, no last_eb. */
  2520. if (!path->p_tree_depth)
  2521. goto out;
  2522. /* trunc to zero special case - this makes tree_depth = 0
  2523. * regardless of what it is. */
  2524. if (OCFS2_I(inode)->ip_clusters == clusters_to_del)
  2525. goto out;
  2526. el = path_leaf_el(path);
  2527. BUG_ON(!el->l_next_free_rec);
  2528. /*
  2529. * Make sure that this extent list will actually be empty
  2530. * after we clear away the data. We can shortcut out if
  2531. * there's more than one non-empty extent in the
  2532. * list. Otherwise, a check of the remaining extent is
  2533. * necessary.
  2534. */
  2535. next_free = le16_to_cpu(el->l_next_free_rec);
  2536. rec = NULL;
  2537. if (ocfs2_is_empty_extent(&el->l_recs[0])) {
  2538. if (next_free > 2)
  2539. goto out;
  2540. /* We may have a valid extent in index 1, check it. */
  2541. if (next_free == 2)
  2542. rec = &el->l_recs[1];
  2543. /*
  2544. * Fall through - no more nonempty extents, so we want
  2545. * to delete this leaf.
  2546. */
  2547. } else {
  2548. if (next_free > 1)
  2549. goto out;
  2550. rec = &el->l_recs[0];
  2551. }
  2552. if (rec) {
  2553. /*
  2554. * Check it we'll only be trimming off the end of this
  2555. * cluster.
  2556. */
  2557. if (le16_to_cpu(rec->e_clusters) > clusters_to_del)
  2558. goto out;
  2559. }
  2560. ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos);
  2561. if (ret) {
  2562. mlog_errno(ret);
  2563. goto out;
  2564. }
  2565. ret = ocfs2_find_leaf(inode, path_root_el(path), cpos, &bh);
  2566. if (ret) {
  2567. mlog_errno(ret);
  2568. goto out;
  2569. }
  2570. eb = (struct ocfs2_extent_block *) bh->b_data;
  2571. el = &eb->h_list;
  2572. if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
  2573. OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
  2574. ret = -EROFS;
  2575. goto out;
  2576. }
  2577. *new_last_eb = bh;
  2578. get_bh(*new_last_eb);
  2579. mlog(0, "returning block %llu, (cpos: %u)\n",
  2580. (unsigned long long)le64_to_cpu(eb->h_blkno), cpos);
  2581. out:
  2582. brelse(bh);
  2583. return ret;
  2584. }
/*
 * Trim some clusters off the rightmost edge of a tree. Only called
 * during truncate.
 *
 * The caller needs to:
 *   - start journaling of each path component.
 *   - compute and fully set up any new last ext block
 */
static int ocfs2_trim_tree(struct inode *inode, struct ocfs2_path *path,
			   handle_t *handle, struct ocfs2_truncate_context *tc,
			   u32 clusters_to_del, u64 *delete_start)
{
	int ret, i, index = path->p_tree_depth;
	u32 new_edge = 0;
	u64 deleted_eb = 0;
	struct buffer_head *bh;
	struct ocfs2_extent_list *el;
	struct ocfs2_extent_rec *rec;

	*delete_start = 0;

	/* Walk from the leaf (index == p_tree_depth) back up to the
	 * root, fixing up the tail record at each level. */
	while (index >= 0) {
		bh = path->p_node[index].bh;
		el = path->p_node[index].el;

		mlog(0, "traveling tree (index = %d, block = %llu)\n",
		     index, (unsigned long long)bh->b_blocknr);

		BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);

		/* The stored depth at this level must match its position
		 * on the path; otherwise the on-disk tree is corrupt. */
		if (index !=
		    (path->p_tree_depth - le16_to_cpu(el->l_tree_depth))) {
			ocfs2_error(inode->i_sb,
				    "Inode %lu has invalid ext. block %llu",
				    inode->i_ino,
				    (unsigned long long)bh->b_blocknr);
			ret = -EROFS;
			goto out;
		}

find_tail_record:
		/* Truncate only ever touches the rightmost (tail) record. */
		i = le16_to_cpu(el->l_next_free_rec) - 1;
		rec = &el->l_recs[i];

		mlog(0, "Extent list before: record %d: (%u, %u, %llu), "
		     "next = %u\n", i, le32_to_cpu(rec->e_cpos),
		     le32_to_cpu(rec->e_clusters),
		     (unsigned long long)le64_to_cpu(rec->e_blkno),
		     le16_to_cpu(el->l_next_free_rec));

		BUG_ON(le32_to_cpu(rec->e_clusters) < clusters_to_del);

		if (le16_to_cpu(el->l_tree_depth) == 0) {
			/*
			 * If the leaf block contains a single empty
			 * extent and no records, we can just remove
			 * the block.
			 */
			if (i == 0 && ocfs2_is_empty_extent(rec)) {
				memset(rec, 0,
				       sizeof(struct ocfs2_extent_rec));
				el->l_next_free_rec = cpu_to_le16(0);

				goto delete;
			}

			/*
			 * Remove any empty extents by shifting things
			 * left. That should make life much easier on
			 * the code below. This condition is rare
			 * enough that we shouldn't see a performance
			 * hit.
			 */
			if (ocfs2_is_empty_extent(&el->l_recs[0])) {
				le16_add_cpu(&el->l_next_free_rec, -1);

				for(i = 0;
				    i < le16_to_cpu(el->l_next_free_rec); i++)
					el->l_recs[i] = el->l_recs[i + 1];

				memset(&el->l_recs[i], 0,
				       sizeof(struct ocfs2_extent_rec));

				/*
				 * We've modified our extent list. The
				 * simplest way to handle this change
				 * is to begin the search from the
				 * start again.
				 */
				goto find_tail_record;
			}

			le32_add_cpu(&rec->e_clusters, -clusters_to_del);

			/*
			 * We'll use "new_edge" on our way back up the
			 * tree to know what our rightmost cpos is.
			 */
			new_edge = le32_to_cpu(rec->e_clusters);
			new_edge += le32_to_cpu(rec->e_cpos);

			/*
			 * The caller will use this to delete data blocks.
			 */
			*delete_start = le64_to_cpu(rec->e_blkno)
				+ ocfs2_clusters_to_blocks(inode->i_sb,
					le32_to_cpu(rec->e_clusters));

			/*
			 * If it's now empty, remove this record.
			 */
			if (le32_to_cpu(rec->e_clusters) == 0) {
				memset(rec, 0,
				       sizeof(struct ocfs2_extent_rec));
				le16_add_cpu(&el->l_next_free_rec, -1);
			}
		} else {
			/* Interior level: the tail record points at the
			 * child block we just finished processing. */
			if (le64_to_cpu(rec->e_blkno) == deleted_eb) {
				memset(rec, 0,
				       sizeof(struct ocfs2_extent_rec));
				le16_add_cpu(&el->l_next_free_rec, -1);

				goto delete;
			}

			/* Can this actually happen? */
			if (le16_to_cpu(el->l_next_free_rec) == 0)
				goto delete;

			/*
			 * We never actually deleted any clusters
			 * because our leaf was empty. There's no
			 * reason to adjust the rightmost edge then.
			 */
			if (new_edge == 0)
				goto delete;

			/* Shrink this record to end at the new rightmost
			 * cpos computed at the leaf. */
			rec->e_clusters = cpu_to_le32(new_edge);
			le32_add_cpu(&rec->e_clusters,
				     -le32_to_cpu(rec->e_cpos));

			/*
			 * A deleted child record should have been
			 * caught above.
			 */
			BUG_ON(le32_to_cpu(rec->e_clusters) == 0);
		}

delete:
		ret = ocfs2_journal_dirty(handle, bh);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		mlog(0, "extent list container %llu, after: record %d: "
		     "(%u, %u, %llu), next = %u.\n",
		     (unsigned long long)bh->b_blocknr, i,
		     le32_to_cpu(rec->e_cpos), le32_to_cpu(rec->e_clusters),
		     (unsigned long long)le64_to_cpu(rec->e_blkno),
		     le16_to_cpu(el->l_next_free_rec));

		/*
		 * We must be careful to only attempt delete of an
		 * extent block (and not the root inode block).
		 */
		if (index > 0 && le16_to_cpu(el->l_next_free_rec) == 0) {
			struct ocfs2_extent_block *eb =
				(struct ocfs2_extent_block *)bh->b_data;

			/*
			 * Save this for use when processing the
			 * parent block.
			 */
			deleted_eb = le64_to_cpu(eb->h_blkno);

			mlog(0, "deleting this extent block.\n");

			ocfs2_remove_from_cache(inode, bh);

			/* An emptied list must be all zeros. */
			BUG_ON(le32_to_cpu(el->l_recs[0].e_clusters));
			BUG_ON(le32_to_cpu(el->l_recs[0].e_cpos));
			BUG_ON(le64_to_cpu(el->l_recs[0].e_blkno));

			if (le16_to_cpu(eb->h_suballoc_slot) == 0) {
				/*
				 * This code only understands how to
				 * lock the suballocator in slot 0,
				 * which is fine because allocation is
				 * only ever done out of that
				 * suballocator too. A future version
				 * might change that however, so avoid
				 * a free if we don't know how to
				 * handle it. This way an fs incompat
				 * bit will not be necessary.
				 */
				ret = ocfs2_free_extent_block(handle,
							      tc->tc_ext_alloc_inode,
							      tc->tc_ext_alloc_bh,
							      eb);

				/* An error here is not fatal. */
				if (ret < 0)
					mlog_errno(ret);
			}
		} else {
			deleted_eb = 0;
		}

		index--;
	}

	ret = 0;
out:
	return ret;
}
  2767. static int ocfs2_do_truncate(struct ocfs2_super *osb,
  2768. unsigned int clusters_to_del,
  2769. struct inode *inode,
  2770. struct buffer_head *fe_bh,
  2771. handle_t *handle,
  2772. struct ocfs2_truncate_context *tc,
  2773. struct ocfs2_path *path)
  2774. {
  2775. int status;
  2776. struct ocfs2_dinode *fe;
  2777. struct ocfs2_extent_block *last_eb = NULL;
  2778. struct ocfs2_extent_list *el;
  2779. struct buffer_head *last_eb_bh = NULL;
  2780. u64 delete_blk = 0;
  2781. fe = (struct ocfs2_dinode *) fe_bh->b_data;
  2782. status = ocfs2_find_new_last_ext_blk(inode, clusters_to_del,
  2783. path, &last_eb_bh);
  2784. if (status < 0) {
  2785. mlog_errno(status);
  2786. goto bail;
  2787. }
  2788. /*
  2789. * Each component will be touched, so we might as well journal
  2790. * here to avoid having to handle errors later.
  2791. */
  2792. status = ocfs2_journal_access_path(inode, handle, path);
  2793. if (status < 0) {
  2794. mlog_errno(status);
  2795. goto bail;
  2796. }
  2797. if (last_eb_bh) {
  2798. status = ocfs2_journal_access(handle, inode, last_eb_bh,
  2799. OCFS2_JOURNAL_ACCESS_WRITE);
  2800. if (status < 0) {
  2801. mlog_errno(status);
  2802. goto bail;
  2803. }
  2804. last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
  2805. }
  2806. el = &(fe->id2.i_list);
  2807. /*
  2808. * Lower levels depend on this never happening, but it's best
  2809. * to check it up here before changing the tree.
  2810. */
  2811. if (el->l_tree_depth && ocfs2_is_empty_extent(&el->l_recs[0])) {
  2812. ocfs2_error(inode->i_sb,
  2813. "Inode %lu has an empty extent record, depth %u\n",
  2814. inode->i_ino, le16_to_cpu(el->l_tree_depth));
  2815. status = -EROFS;
  2816. goto bail;
  2817. }
  2818. spin_lock(&OCFS2_I(inode)->ip_lock);
  2819. OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) -
  2820. clusters_to_del;
  2821. spin_unlock(&OCFS2_I(inode)->ip_lock);
  2822. le32_add_cpu(&fe->i_clusters, -clusters_to_del);
  2823. status = ocfs2_trim_tree(inode, path, handle, tc,
  2824. clusters_to_del, &delete_blk);
  2825. if (status) {
  2826. mlog_errno(status);
  2827. goto bail;
  2828. }
  2829. if (le32_to_cpu(fe->i_clusters) == 0) {
  2830. /* trunc to zero is a special case. */
  2831. el->l_tree_depth = 0;
  2832. fe->i_last_eb_blk = 0;
  2833. } else if (last_eb)
  2834. fe->i_last_eb_blk = last_eb->h_blkno;
  2835. status = ocfs2_journal_dirty(handle, fe_bh);
  2836. if (status < 0) {
  2837. mlog_errno(status);
  2838. goto bail;
  2839. }
  2840. if (last_eb) {
  2841. /* If there will be a new last extent block, then by
  2842. * definition, there cannot be any leaves to the right of
  2843. * him. */
  2844. last_eb->h_next_leaf_blk = 0;
  2845. status = ocfs2_journal_dirty(handle, last_eb_bh);
  2846. if (status < 0) {
  2847. mlog_errno(status);
  2848. goto bail;
  2849. }
  2850. }
  2851. if (delete_blk) {
  2852. status = ocfs2_truncate_log_append(osb, handle, delete_blk,
  2853. clusters_to_del);
  2854. if (status < 0) {
  2855. mlog_errno(status);
  2856. goto bail;
  2857. }
  2858. }
  2859. status = 0;
  2860. bail:
  2861. mlog_exit(status);
  2862. return status;
  2863. }
/*
 * It is expected, that by the time you call this function,
 * inode->i_size and fe->i_size have been adjusted.
 *
 * WARNING: This will kfree the truncate context
 */
int ocfs2_commit_truncate(struct ocfs2_super *osb,
			  struct inode *inode,
			  struct buffer_head *fe_bh,
			  struct ocfs2_truncate_context *tc)
{
	int status, i, credits, tl_sem = 0;
	u32 clusters_to_del, new_highest_cpos, range;
	struct ocfs2_extent_list *el;
	handle_t *handle = NULL;
	struct inode *tl_inode = osb->osb_tl_inode;
	struct ocfs2_path *path = NULL;

	mlog_entry_void();

	down_write(&OCFS2_I(inode)->ip_alloc_sem);

	/* Target allocation size in clusters, derived from the
	 * already-updated i_size. */
	new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb,
						    i_size_read(inode));

	path = ocfs2_new_inode_path(fe_bh);
	if (!path) {
		status = -ENOMEM;
		mlog_errno(status);
		goto bail;
	}

start:
	/*
	 * Check that we still have allocation to delete.
	 */
	if (OCFS2_I(inode)->ip_clusters == 0) {
		status = 0;
		goto bail;
	}

	/*
	 * Truncate always works against the rightmost tree branch.
	 */
	status = ocfs2_find_path(inode, path, UINT_MAX);
	if (status) {
		mlog_errno(status);
		goto bail;
	}

	mlog(0, "inode->ip_clusters = %u, tree_depth = %u\n",
	     OCFS2_I(inode)->ip_clusters, path->p_tree_depth);

	/*
	 * By now, el will point to the extent list on the bottom most
	 * portion of this tree. Only the tail record is considered in
	 * each pass.
	 *
	 * We handle the following cases, in order:
	 * - empty extent: delete the remaining branch
	 * - remove the entire record
	 * - remove a partial record
	 * - no record needs to be removed (truncate has completed)
	 */
	el = path_leaf_el(path);
	if (le16_to_cpu(el->l_next_free_rec) == 0) {
		ocfs2_error(inode->i_sb,
			    "Inode %llu has empty extent block at %llu\n",
			    (unsigned long long)OCFS2_I(inode)->ip_blkno,
			    (unsigned long long)path_leaf_bh(path)->b_blocknr);
		status = -EROFS;
		goto bail;
	}

	/* Decide how much of the tail record goes in this pass. */
	i = le16_to_cpu(el->l_next_free_rec) - 1;
	range = le32_to_cpu(el->l_recs[i].e_cpos) +
		le32_to_cpu(el->l_recs[i].e_clusters);
	if (i == 0 && ocfs2_is_empty_extent(&el->l_recs[i])) {
		clusters_to_del = 0;
	} else if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_highest_cpos) {
		clusters_to_del = le32_to_cpu(el->l_recs[i].e_clusters);
	} else if (range > new_highest_cpos) {
		clusters_to_del = (le32_to_cpu(el->l_recs[i].e_clusters) +
				   le32_to_cpu(el->l_recs[i].e_cpos)) -
				  new_highest_cpos;
	} else {
		status = 0;
		goto bail;
	}

	mlog(0, "clusters_to_del = %u in this pass, tail blk=%llu\n",
	     clusters_to_del, (unsigned long long)path_leaf_bh(path)->b_blocknr);

	BUG_ON(clusters_to_del == 0);

	mutex_lock(&tl_inode->i_mutex);
	tl_sem = 1;
	/* ocfs2_truncate_log_needs_flush guarantees us at least one
	 * record is free for use. If there isn't any, we flush to get
	 * an empty truncate log. */
	if (ocfs2_truncate_log_needs_flush(osb)) {
		status = __ocfs2_flush_truncate_log(osb);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}
	}

	credits = ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del,
						(struct ocfs2_dinode *)fe_bh->b_data,
						el);
	handle = ocfs2_start_trans(osb, credits);
	if (IS_ERR(handle)) {
		status = PTR_ERR(handle);
		handle = NULL;
		mlog_errno(status);
		goto bail;
	}

	status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh, handle,
				   tc, path);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	mutex_unlock(&tl_inode->i_mutex);
	tl_sem = 0;

	ocfs2_commit_trans(osb, handle);
	handle = NULL;

	/* Drop the cached path and walk the (now shorter) tree again
	 * for the next pass. */
	ocfs2_reinit_path(path, 1);

	/*
	 * The check above will catch the case where we've truncated
	 * away all allocation.
	 */
	goto start;

bail:
	up_write(&OCFS2_I(inode)->ip_alloc_sem);

	ocfs2_schedule_truncate_log_flush(osb, 1);

	if (tl_sem)
		mutex_unlock(&tl_inode->i_mutex);

	if (handle)
		ocfs2_commit_trans(osb, handle);

	ocfs2_free_path(path);

	/* This will drop the ext_alloc cluster lock for us */
	ocfs2_free_truncate_context(tc);

	mlog_exit(status);
	return status;
}
/*
 * Expects the inode to already be locked. This will figure out which
 * inodes need to be locked and will put them on the returned truncate
 * context.
 */
int ocfs2_prepare_truncate(struct ocfs2_super *osb,
			   struct inode *inode,
			   struct buffer_head *fe_bh,
			   struct ocfs2_truncate_context **tc)
{
	int status, metadata_delete, i;
	unsigned int new_i_clusters;
	struct ocfs2_dinode *fe;
	struct ocfs2_extent_block *eb;
	struct ocfs2_extent_list *el;
	struct buffer_head *last_eb_bh = NULL;
	struct inode *ext_alloc_inode = NULL;
	struct buffer_head *ext_alloc_bh = NULL;

	mlog_entry_void();

	*tc = NULL;

	new_i_clusters = ocfs2_clusters_for_bytes(osb->sb,
						  i_size_read(inode));
	fe = (struct ocfs2_dinode *) fe_bh->b_data;

	mlog(0, "fe->i_clusters = %u, new_i_clusters = %u, fe->i_size ="
	     "%llu\n", fe->i_clusters, new_i_clusters,
	     (unsigned long long)fe->i_size);

	*tc = kzalloc(sizeof(struct ocfs2_truncate_context), GFP_KERNEL);
	if (!(*tc)) {
		status = -ENOMEM;
		mlog_errno(status);
		goto bail;
	}

	metadata_delete = 0;
	if (fe->id2.i_list.l_tree_depth) {
		/* If we have a tree, then the truncate may result in
		 * metadata deletes. Figure this out from the
		 * rightmost leaf block.*/
		status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
					  &last_eb_bh, OCFS2_BH_CACHED, inode);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}

		eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
		if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);

			brelse(last_eb_bh);
			status = -EIO;
			goto bail;
		}

		el = &(eb->h_list);

		/* Skip a leading empty extent record, if present. */
		i = 0;
		if (ocfs2_is_empty_extent(&el->l_recs[0]))
			i = 1;

		/*
		 * XXX: Should we check that next_free_rec contains
		 * the extent?
		 */
		if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_i_clusters)
			metadata_delete = 1;
	}

	(*tc)->tc_last_eb_bh = last_eb_bh;

	if (metadata_delete) {
		mlog(0, "Will have to delete metadata for this trunc. "
		     "locking allocator.\n");
		/* NOTE(review): -ENOMEM below looks like an odd errno for a
		 * failed system-inode lookup - confirm intent. */
		ext_alloc_inode = ocfs2_get_system_file_inode(osb, EXTENT_ALLOC_SYSTEM_INODE, 0);
		if (!ext_alloc_inode) {
			status = -ENOMEM;
			mlog_errno(status);
			goto bail;
		}

		/* Stash the inode on the context before taking the cluster
		 * lock so the error path can unwind both. */
		mutex_lock(&ext_alloc_inode->i_mutex);
		(*tc)->tc_ext_alloc_inode = ext_alloc_inode;

		status = ocfs2_meta_lock(ext_alloc_inode, &ext_alloc_bh, 1);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}
		(*tc)->tc_ext_alloc_bh = ext_alloc_bh;
		(*tc)->tc_ext_alloc_locked = 1;
	}

	status = 0;
bail:
	/* On error, release everything we attached to the context. */
	if (status < 0) {
		if (*tc)
			ocfs2_free_truncate_context(*tc);
		*tc = NULL;
	}
	mlog_exit_void();
	return status;
}
  3089. static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc)
  3090. {
  3091. if (tc->tc_ext_alloc_inode) {
  3092. if (tc->tc_ext_alloc_locked)
  3093. ocfs2_meta_unlock(tc->tc_ext_alloc_inode, 1);
  3094. mutex_unlock(&tc->tc_ext_alloc_inode->i_mutex);
  3095. iput(tc->tc_ext_alloc_inode);
  3096. }
  3097. if (tc->tc_ext_alloc_bh)
  3098. brelse(tc->tc_ext_alloc_bh);
  3099. if (tc->tc_last_eb_bh)
  3100. brelse(tc->tc_last_eb_bh);
  3101. kfree(tc);
  3102. }