fib_trie.c 60 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
7227822792280228122822283228422852286228722882289229022912292229322942295229622972298229923002301230223032304230523062307230823092310231123122313231423152316231723182319232023212322232323242325232623272328232923302331233223332334233523362337233823392340234123422343234423452346234723482349235023512352235323542355235623572358235923602361236223632364236523662367236823692370237123722373237423752376237723782379238023812382238323842385238623872388238923902391239223932394239523962397239823992400240124022403240424052406240724082409241024112412241324142415241624172418241924202421242224232424242524262427242824292430243124322433243424352436243724382439244024412442244324442445244624472448244924502451245224532454245524562457245824592460246124622463246424652466246724682469247024712472247324742475247624772478247924802481248224832484248524862487248824892490249124922493249424952496249724982499250025012502250325042505250625072508250925102511251225132514251525162517251825192520252125222523252425252526252725282529253025312532253325342535253625372538253925402541254225432544254525462547254825492550255125522553255425552556255725582559256025612562256325642565256625672568256925702571257225732574257525762577257825792580258125822583258425852586258725882589259025912592259325942595259625972598259926002601260226032604260526062607260826092610
  1. /*
  2. * This program is free software; you can redistribute it and/or
  3. * modify it under the terms of the GNU General Public License
  4. * as published by the Free Software Foundation; either version
  5. * 2 of the License, or (at your option) any later version.
  6. *
  7. * Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
  8. * & Swedish University of Agricultural Sciences.
  9. *
  10. * Jens Laas <jens.laas@data.slu.se> Swedish University of
  11. * Agricultural Sciences.
  12. *
  13. * Hans Liss <hans.liss@its.uu.se> Uppsala Universitet
  14. *
  15. * This work is based on the LPC-trie which is originally described in:
  16. *
  17. * An experimental study of compression methods for dynamic tries
  18. * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
  19. * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
  20. *
  21. *
  22. * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
  23. * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
  24. *
  25. * Version: $Id: fib_trie.c,v 1.3 2005/06/08 14:20:01 robert Exp $
  26. *
  27. *
  28. * Code from fib_hash has been reused which includes the following header:
  29. *
  30. *
  31. * INET An implementation of the TCP/IP protocol suite for the LINUX
  32. * operating system. INET is implemented using the BSD Socket
  33. * interface as the means of communication with the user level.
  34. *
  35. * IPv4 FIB: lookup engine and maintenance routines.
  36. *
  37. *
  38. * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  39. *
  40. * This program is free software; you can redistribute it and/or
  41. * modify it under the terms of the GNU General Public License
  42. * as published by the Free Software Foundation; either version
  43. * 2 of the License, or (at your option) any later version.
  44. */
  45. #define VERSION "0.325"
  46. #include <linux/config.h>
  47. #include <asm/uaccess.h>
  48. #include <asm/system.h>
  49. #include <asm/bitops.h>
  50. #include <linux/types.h>
  51. #include <linux/kernel.h>
  52. #include <linux/sched.h>
  53. #include <linux/mm.h>
  54. #include <linux/string.h>
  55. #include <linux/socket.h>
  56. #include <linux/sockios.h>
  57. #include <linux/errno.h>
  58. #include <linux/in.h>
  59. #include <linux/inet.h>
  60. #include <linux/netdevice.h>
  61. #include <linux/if_arp.h>
  62. #include <linux/proc_fs.h>
  63. #include <linux/skbuff.h>
  64. #include <linux/netlink.h>
  65. #include <linux/init.h>
  66. #include <linux/list.h>
  67. #include <net/ip.h>
  68. #include <net/protocol.h>
  69. #include <net/route.h>
  70. #include <net/tcp.h>
  71. #include <net/sock.h>
  72. #include <net/ip_fib.h>
  73. #include "fib_lookup.h"
  74. #undef CONFIG_IP_FIB_TRIE_STATS
  /* Number of slots in trie_stat.nodesizes[]. */
  #define MAX_CHILDS 16384

  /*
   * Extract n bits of str starting p bits from the most significant end.
   * Assumes a 32-bit operand (the 32 is hard-coded) and n != 0.
   */
  #define EXTRACT(p, n, str) ((str)<<(p)>>(32-(n)))

  /* Width of a key in bits. */
  #define KEYLENGTH (8*sizeof(t_key))

  /*
   * Keep the l most significant bits of k and clear the rest; l == 0 yields 0.
   * Every argument is parenthesized so that expressions such as "a + b" can be
   * passed safely.
   */
  #define MASK_PFX(k, l) \
      (((l)==0) ? 0 : ((k) >> (KEYLENGTH-(l))) << (KEYLENGTH-(l)))

  /*
   * Mask with 'bits' consecutive one-bits starting 'offset' bits from the most
   * significant end of a key; bits == 0 yields 0.
   */
  #define TKEY_GET_MASK(offset, bits) \
      (((bits)==0) ? 0 : ((t_key)(-1) << (KEYLENGTH - (bits)) >> (offset)))
  /*
   * Read/write lock; in this file the write side is taken (with bottom halves
   * disabled) around updates of child pointers and child counters.
   */
  static DEFINE_RWLOCK(fib_lock);

  /* Key type of the trie; KEYLENGTH is derived from its size. */
  typedef unsigned int t_key;
  /*
   * Node type tags. The tag lives in bit 0 of a node's _parent word; the
   * remaining bits hold the parent pointer.
   */
  #define T_TNODE 0
  #define T_LEAF 1
  #define NODE_TYPE_MASK 0x1UL

  /* Parent pointer of _node with the type tag stripped. */
  #define NODE_PARENT(_node) \
      ((struct tnode *)((_node)->_parent & ~NODE_TYPE_MASK))

  /* Store _ptr as the parent of _node while preserving the node's type tag. */
  #define NODE_SET_PARENT(_node, _ptr) \
      ((_node)->_parent = (((unsigned long)(_ptr)) | \
                           ((_node)->_parent & NODE_TYPE_MASK)))

  /* Overwrite the whole _parent word with just a type tag (no parent). */
  #define NODE_INIT_PARENT(_node, _type) \
      ((_node)->_parent = (_type))

  /* Type tag of _node (T_TNODE or T_LEAF). */
  #define NODE_TYPE(_node) \
      ((_node)->_parent & NODE_TYPE_MASK)

  /*
   * Type predicates. The argument is parenthesized so that any pointer
   * expression (e.g. "&x" or a cast) can be passed.
   */
  #define IS_TNODE(n) (!((n)->_parent & T_LEAF))
  #define IS_LEAF(n) ((n)->_parent & T_LEAF)
  96. struct node {
  97. t_key key;
  98. unsigned long _parent;
  99. };
  100. struct leaf {
  101. t_key key;
  102. unsigned long _parent;
  103. struct hlist_head list;
  104. };
  105. struct leaf_info {
  106. struct hlist_node hlist;
  107. int plen;
  108. struct list_head falh;
  109. };
  110. struct tnode {
  111. t_key key;
  112. unsigned long _parent;
  113. unsigned short pos:5; /* 2log(KEYLENGTH) bits needed */
  114. unsigned short bits:5; /* 2log(KEYLENGTH) bits needed */
  115. unsigned short full_children; /* KEYLENGTH bits needed */
  116. unsigned short empty_children; /* KEYLENGTH bits needed */
  117. struct node *child[0];
  118. };
  #ifdef CONFIG_IP_FIB_TRIE_STATS
  /*
   * Usage counters, only built when CONFIG_IP_FIB_TRIE_STATS is defined.
   * In the code visible here only resize_node_skipped is updated (by resize()
   * when an inflate or halve attempt reports an error).
   */
  struct trie_use_stats {
      unsigned int gets;
      unsigned int backtrack;
      unsigned int semantic_match_passed;
      unsigned int semantic_match_miss;
      unsigned int null_node_hit;
      unsigned int resize_node_skipped;
  };
  #endif
  129. struct trie_stat {
  130. unsigned int totdepth;
  131. unsigned int maxdepth;
  132. unsigned int tnodes;
  133. unsigned int leaves;
  134. unsigned int nullpointers;
  135. unsigned int nodesizes[MAX_CHILDS];
  136. };
  /*
   * One trie instance (the file keeps two: trie_local and trie_main).
   */
  struct trie {
      struct node *trie;              /* root node */
  #ifdef CONFIG_IP_FIB_TRIE_STATS
      struct trie_use_stats stats;    /* usage counters, optional */
  #endif
      int size;
      unsigned int revision;
  };
  145. static int trie_debug = 0;
  146. static int tnode_full(struct tnode *tn, struct node *n);
  147. static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
  148. static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull);
  149. static int tnode_child_length(struct tnode *tn);
  150. static struct node *resize(struct trie *t, struct tnode *tn);
  151. static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err);
  152. static struct tnode *halve(struct trie *t, struct tnode *tn, int *err);
  153. static void tnode_free(struct tnode *tn);
  154. static void trie_dump_seq(struct seq_file *seq, struct trie *t);
  155. extern struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio);
  156. extern int fib_detect_death(struct fib_info *fi, int order,
  157. struct fib_info **last_resort, int *last_idx, int *dflt);
  158. extern void rtmsg_fib(int event, u32 key, struct fib_alias *fa, int z, int tb_id,
  159. struct nlmsghdr *n, struct netlink_skb_parms *req);
  160. static kmem_cache_t *fn_alias_kmem;
  161. static struct trie *trie_local = NULL, *trie_main = NULL;
  /*
   * Report an internal inconsistency: log "Trie Bug: <err>" and then hit
   * BUG().
   */
  static void trie_bug(char *err)
  {
      printk("Trie Bug: %s\n", err);
      BUG();
  }
  167. static inline struct node *tnode_get_child(struct tnode *tn, int i)
  168. {
  169. if (i >= 1<<tn->bits)
  170. trie_bug("tnode_get_child");
  171. return tn->child[i];
  172. }
  173. static inline int tnode_child_length(struct tnode *tn)
  174. {
  175. return 1<<tn->bits;
  176. }
  177. /*
  178. _________________________________________________________________
  179. | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
  180. ----------------------------------------------------------------
  181. 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
  182. _________________________________________________________________
  183. | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
  184. -----------------------------------------------------------------
  185. 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
  186. tp->pos = 7
  187. tp->bits = 3
  188. n->pos = 15
  189. n->bits=4
  190. KEYLENGTH=32
  191. */
  192. static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
  193. {
  194. if (offset < KEYLENGTH)
  195. return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
  196. else
  197. return 0;
  198. }
  199. static inline int tkey_equals(t_key a, t_key b)
  200. {
  201. return a == b;
  202. }
  203. static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
  204. {
  205. if (bits == 0 || offset >= KEYLENGTH)
  206. return 1;
  207. bits = bits > KEYLENGTH ? KEYLENGTH : bits;
  208. return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
  209. }
  210. static inline int tkey_mismatch(t_key a, int offset, t_key b)
  211. {
  212. t_key diff = a ^ b;
  213. int i = offset;
  214. if(!diff)
  215. return 0;
  216. while((diff << i) >> (KEYLENGTH-1) == 0)
  217. i++;
  218. return i;
  219. }
  220. /* Candiate for fib_semantics */
  221. static void fn_free_alias(struct fib_alias *fa)
  222. {
  223. fib_release_info(fa->fa_info);
  224. kmem_cache_free(fn_alias_kmem, fa);
  225. }
  226. /*
  227. To understand this stuff, an understanding of keys and all their bits is
  228. necessary. Every node in the trie has a key associated with it, but not
  229. all of the bits in that key are significant.
  230. Consider a node 'n' and its parent 'tp'.
  231. If n is a leaf, every bit in its key is significant. Its presence is
  232. necessitated by path compression, since during a tree traversal (when
  233. searching for a leaf - unless we are doing an insertion) we will completely
  234. ignore all skipped bits we encounter. Thus we need to verify, at the end of
  235. a potentially successful search, that we have indeed been walking the
  236. correct key path.
  237. Note that we can never "miss" the correct key in the tree if present by
  238. following the wrong path. Path compression ensures that segments of the key
  239. that are the same for all keys with a given prefix are skipped, but the
  240. skipped part *is* identical for each node in the subtrie below the skipped
  241. bit! trie_insert() in this implementation takes care of that - note the
  242. call to tkey_sub_equals() in trie_insert().
  243. if n is an internal node - a 'tnode' here, the various parts of its key
  244. have many different meanings.
  245. Example:
  246. _________________________________________________________________
  247. | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
  248. -----------------------------------------------------------------
  249. 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
  250. _________________________________________________________________
  251. | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
  252. -----------------------------------------------------------------
  253. 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
  254. tp->pos = 7
  255. tp->bits = 3
  256. n->pos = 15
  257. n->bits=4
  258. First, let's just ignore the bits that come before the parent tp, that is
  259. the bits from 0 to (tp->pos-1). They are *known* but at this point we do
  260. not use them for anything.
  261. The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
  262. index into the parent's child array. That is, they will be used to find
  263. 'n' among tp's children.
  264. The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
  265. for the node n.
  266. All the bits we have seen so far are significant to the node n. The rest
  267. of the bits are really not needed or indeed known in n->key.
  268. The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
  269. n's child array, and will of course be different for each child.
  270. The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
  271. at this point.
  272. */
  273. static void check_tnode(struct tnode *tn)
  274. {
  275. if(tn && tn->pos+tn->bits > 32) {
  276. printk("TNODE ERROR tn=%p, pos=%d, bits=%d\n", tn, tn->pos, tn->bits);
  277. }
  278. }
  /*
   * Resize thresholds, in percent, as used by resize():
   *  - a node is halved while fewer than halve_threshold percent of its
   *    child slots are in use;
   *  - a node is doubled while the doubled node would still be at least
   *    inflate_threshold percent in use.
   */
  static int halve_threshold = 25;
  static int inflate_threshold = 50;
  281. static struct leaf *leaf_new(void)
  282. {
  283. struct leaf *l = kmalloc(sizeof(struct leaf), GFP_KERNEL);
  284. if(l) {
  285. NODE_INIT_PARENT(l, T_LEAF);
  286. INIT_HLIST_HEAD(&l->list);
  287. }
  288. return l;
  289. }
  290. static struct leaf_info *leaf_info_new(int plen)
  291. {
  292. struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL);
  293. if(li) {
  294. li->plen = plen;
  295. INIT_LIST_HEAD(&li->falh);
  296. }
  297. return li;
  298. }
  /* Give the memory of a leaf back with kfree(). */
  static inline void free_leaf(struct leaf *l)
  {
      kfree(l);
  }
  /* Give the memory of a leaf_info back with kfree(). */
  static inline void free_leaf_info(struct leaf_info *li)
  {
      kfree(li);
  }
  307. static struct tnode *tnode_alloc(unsigned int size)
  308. {
  309. if (size <= PAGE_SIZE) {
  310. return kmalloc(size, GFP_KERNEL);
  311. } else {
  312. return (struct tnode *)
  313. __get_free_pages(GFP_KERNEL, get_order(size));
  314. }
  315. }
  316. static void __tnode_free(struct tnode *tn)
  317. {
  318. unsigned int size = sizeof(struct tnode) +
  319. (1<<tn->bits) * sizeof(struct node *);
  320. if (size <= PAGE_SIZE)
  321. kfree(tn);
  322. else
  323. free_pages((unsigned long)tn, get_order(size));
  324. }
  325. static struct tnode* tnode_new(t_key key, int pos, int bits)
  326. {
  327. int nchildren = 1<<bits;
  328. int sz = sizeof(struct tnode) + nchildren * sizeof(struct node *);
  329. struct tnode *tn = tnode_alloc(sz);
  330. if(tn) {
  331. memset(tn, 0, sz);
  332. NODE_INIT_PARENT(tn, T_TNODE);
  333. tn->pos = pos;
  334. tn->bits = bits;
  335. tn->key = key;
  336. tn->full_children = 0;
  337. tn->empty_children = 1<<bits;
  338. }
  339. if(trie_debug > 0)
  340. printk("AT %p s=%u %u\n", tn, (unsigned int) sizeof(struct tnode),
  341. (unsigned int) (sizeof(struct node) * 1<<bits));
  342. return tn;
  343. }
  344. static void tnode_free(struct tnode *tn)
  345. {
  346. if(!tn) {
  347. trie_bug("tnode_free\n");
  348. }
  349. if(IS_LEAF(tn)) {
  350. free_leaf((struct leaf *)tn);
  351. if(trie_debug > 0 )
  352. printk("FL %p \n", tn);
  353. }
  354. else if(IS_TNODE(tn)) {
  355. __tnode_free(tn);
  356. if(trie_debug > 0 )
  357. printk("FT %p \n", tn);
  358. }
  359. else {
  360. trie_bug("tnode_free\n");
  361. }
  362. }
  363. /*
  364. * Check whether a tnode 'n' is "full", i.e. it is an internal node
  365. * and no bits are skipped. See discussion in dyntree paper p. 6
  366. */
  367. static inline int tnode_full(struct tnode *tn, struct node *n)
  368. {
  369. if(n == NULL || IS_LEAF(n))
  370. return 0;
  371. return ((struct tnode *) n)->pos == tn->pos + tn->bits;
  372. }
  /*
   * Store n in child slot i of tn. Passing -1 as 'wasfull' makes
   * tnode_put_child_reorg() work out the old slot's fullness itself.
   * The trie argument is not used.
   */
  static inline void put_child(struct trie *t, struct tnode *tn, int i, struct node *n)
  {
      const int wasfull_unknown = -1;

      tnode_put_child_reorg(tn, i, n, wasfull_unknown);
  }
  377. /*
  378. * Add a child at position i overwriting the old value.
  379. * Update the value of full_children and empty_children.
  380. */
  381. static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull)
  382. {
  383. struct node *chi;
  384. int isfull;
  385. if(i >= 1<<tn->bits) {
  386. printk("bits=%d, i=%d\n", tn->bits, i);
  387. trie_bug("tnode_put_child_reorg bits");
  388. }
  389. write_lock_bh(&fib_lock);
  390. chi = tn->child[i];
  391. /* update emptyChildren */
  392. if (n == NULL && chi != NULL)
  393. tn->empty_children++;
  394. else if (n != NULL && chi == NULL)
  395. tn->empty_children--;
  396. /* update fullChildren */
  397. if (wasfull == -1)
  398. wasfull = tnode_full(tn, chi);
  399. isfull = tnode_full(tn, n);
  400. if (wasfull && !isfull)
  401. tn->full_children--;
  402. else if (!wasfull && isfull)
  403. tn->full_children++;
  404. if(n)
  405. NODE_SET_PARENT(n, tn);
  406. tn->child[i] = n;
  407. write_unlock_bh(&fib_lock);
  408. }
  409. static struct node *resize(struct trie *t, struct tnode *tn)
  410. {
  411. int i;
  412. int err = 0;
  413. if (!tn)
  414. return NULL;
  415. if(trie_debug)
  416. printk("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
  417. tn, inflate_threshold, halve_threshold);
  418. /* No children */
  419. if (tn->empty_children == tnode_child_length(tn)) {
  420. tnode_free(tn);
  421. return NULL;
  422. }
  423. /* One child */
  424. if (tn->empty_children == tnode_child_length(tn) - 1)
  425. for (i = 0; i < tnode_child_length(tn); i++) {
  426. write_lock_bh(&fib_lock);
  427. if (tn->child[i] != NULL) {
  428. /* compress one level */
  429. struct node *n = tn->child[i];
  430. if(n)
  431. NODE_INIT_PARENT(n, NODE_TYPE(n));
  432. write_unlock_bh(&fib_lock);
  433. tnode_free(tn);
  434. return n;
  435. }
  436. write_unlock_bh(&fib_lock);
  437. }
  438. /*
  439. * Double as long as the resulting node has a number of
  440. * nonempty nodes that are above the threshold.
  441. */
  442. /*
  443. * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
  444. * the Helsinki University of Technology and Matti Tikkanen of Nokia
  445. * Telecommunications, page 6:
  446. * "A node is doubled if the ratio of non-empty children to all
  447. * children in the *doubled* node is at least 'high'."
  448. *
  449. * 'high' in this instance is the variable 'inflate_threshold'. It
  450. * is expressed as a percentage, so we multiply it with
  451. * tnode_child_length() and instead of multiplying by 2 (since the
  452. * child array will be doubled by inflate()) and multiplying
  453. * the left-hand side by 100 (to handle the percentage thing) we
  454. * multiply the left-hand side by 50.
  455. *
  456. * The left-hand side may look a bit weird: tnode_child_length(tn)
  457. * - tn->empty_children is of course the number of non-null children
  458. * in the current node. tn->full_children is the number of "full"
  459. * children, that is non-null tnodes with a skip value of 0.
  460. * All of those will be doubled in the resulting inflated tnode, so
  461. * we just count them one extra time here.
  462. *
  463. * A clearer way to write this would be:
  464. *
  465. * to_be_doubled = tn->full_children;
  466. * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
  467. * tn->full_children;
  468. *
  469. * new_child_length = tnode_child_length(tn) * 2;
  470. *
  471. * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
  472. * new_child_length;
  473. * if (new_fill_factor >= inflate_threshold)
  474. *
  475. * ...and so on, tho it would mess up the while() loop.
  476. *
  477. * anyway,
  478. * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
  479. * inflate_threshold
  480. *
  481. * avoid a division:
  482. * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
  483. * inflate_threshold * new_child_length
  484. *
  485. * expand not_to_be_doubled and to_be_doubled, and shorten:
  486. * 100 * (tnode_child_length(tn) - tn->empty_children +
  487. * tn->full_children ) >= inflate_threshold * new_child_length
  488. *
  489. * expand new_child_length:
  490. * 100 * (tnode_child_length(tn) - tn->empty_children +
  491. * tn->full_children ) >=
  492. * inflate_threshold * tnode_child_length(tn) * 2
  493. *
  494. * shorten again:
  495. * 50 * (tn->full_children + tnode_child_length(tn) -
  496. * tn->empty_children ) >= inflate_threshold *
  497. * tnode_child_length(tn)
  498. *
  499. */
  500. check_tnode(tn);
  501. err = 0;
  502. while ((tn->full_children > 0 &&
  503. 50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >=
  504. inflate_threshold * tnode_child_length(tn))) {
  505. tn = inflate(t, tn, &err);
  506. if(err) {
  507. #ifdef CONFIG_IP_FIB_TRIE_STATS
  508. t->stats.resize_node_skipped++;
  509. #endif
  510. break;
  511. }
  512. }
  513. check_tnode(tn);
  514. /*
  515. * Halve as long as the number of empty children in this
  516. * node is above threshold.
  517. */
  518. err = 0;
  519. while (tn->bits > 1 &&
  520. 100 * (tnode_child_length(tn) - tn->empty_children) <
  521. halve_threshold * tnode_child_length(tn)) {
  522. tn = halve(t, tn, &err);
  523. if(err) {
  524. #ifdef CONFIG_IP_FIB_TRIE_STATS
  525. t->stats.resize_node_skipped++;
  526. #endif
  527. break;
  528. }
  529. }
  530. /* Only one child remains */
  531. if (tn->empty_children == tnode_child_length(tn) - 1)
  532. for (i = 0; i < tnode_child_length(tn); i++) {
  533. write_lock_bh(&fib_lock);
  534. if (tn->child[i] != NULL) {
  535. /* compress one level */
  536. struct node *n = tn->child[i];
  537. if(n)
  538. NODE_INIT_PARENT(n, NODE_TYPE(n));
  539. write_unlock_bh(&fib_lock);
  540. tnode_free(tn);
  541. return n;
  542. }
  543. write_unlock_bh(&fib_lock);
  544. }
  545. return (struct node *) tn;
  546. }
  547. static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err)
  548. {
  549. struct tnode *inode;
  550. struct tnode *oldtnode = tn;
  551. int olen = tnode_child_length(tn);
  552. int i;
  553. if(trie_debug)
  554. printk("In inflate\n");
  555. tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
  556. if (!tn) {
  557. *err = -ENOMEM;
  558. return oldtnode;
  559. }
  560. /*
  561. * Preallocate and store tnodes before the actual work so we
  562. * don't get into an inconsistent state if memory allocation
  563. * fails. In case of failure we return the oldnode and inflate
  564. * of tnode is ignored.
  565. */
  566. for(i = 0; i < olen; i++) {
  567. struct tnode *inode = (struct tnode *) tnode_get_child(oldtnode, i);
  568. if (inode &&
  569. IS_TNODE(inode) &&
  570. inode->pos == oldtnode->pos + oldtnode->bits &&
  571. inode->bits > 1) {
  572. struct tnode *left, *right;
  573. t_key m = TKEY_GET_MASK(inode->pos, 1);
  574. left = tnode_new(inode->key&(~m), inode->pos + 1,
  575. inode->bits - 1);
  576. if(!left) {
  577. *err = -ENOMEM;
  578. break;
  579. }
  580. right = tnode_new(inode->key|m, inode->pos + 1,
  581. inode->bits - 1);
  582. if(!right) {
  583. *err = -ENOMEM;
  584. break;
  585. }
  586. put_child(t, tn, 2*i, (struct node *) left);
  587. put_child(t, tn, 2*i+1, (struct node *) right);
  588. }
  589. }
  590. if(*err) {
  591. int size = tnode_child_length(tn);
  592. int j;
  593. for(j = 0; j < size; j++)
  594. if( tn->child[j])
  595. tnode_free((struct tnode *)tn->child[j]);
  596. tnode_free(tn);
  597. *err = -ENOMEM;
  598. return oldtnode;
  599. }
  600. for(i = 0; i < olen; i++) {
  601. struct node *node = tnode_get_child(oldtnode, i);
  602. /* An empty child */
  603. if (node == NULL)
  604. continue;
  605. /* A leaf or an internal node with skipped bits */
  606. if(IS_LEAF(node) || ((struct tnode *) node)->pos >
  607. tn->pos + tn->bits - 1) {
  608. if(tkey_extract_bits(node->key, oldtnode->pos + oldtnode->bits,
  609. 1) == 0)
  610. put_child(t, tn, 2*i, node);
  611. else
  612. put_child(t, tn, 2*i+1, node);
  613. continue;
  614. }
  615. /* An internal node with two children */
  616. inode = (struct tnode *) node;
  617. if (inode->bits == 1) {
  618. put_child(t, tn, 2*i, inode->child[0]);
  619. put_child(t, tn, 2*i+1, inode->child[1]);
  620. tnode_free(inode);
  621. }
  622. /* An internal node with more than two children */
  623. else {
  624. struct tnode *left, *right;
  625. int size, j;
  626. /* We will replace this node 'inode' with two new
  627. * ones, 'left' and 'right', each with half of the
  628. * original children. The two new nodes will have
  629. * a position one bit further down the key and this
  630. * means that the "significant" part of their keys
  631. * (see the discussion near the top of this file)
  632. * will differ by one bit, which will be "0" in
  633. * left's key and "1" in right's key. Since we are
  634. * moving the key position by one step, the bit that
  635. * we are moving away from - the bit at position
  636. * (inode->pos) - is the one that will differ between
  637. * left and right. So... we synthesize that bit in the
  638. * two new keys.
  639. * The mask 'm' below will be a single "one" bit at
  640. * the position (inode->pos)
  641. */
  642. /* Use the old key, but set the new significant
  643. * bit to zero.
  644. */
  645. left = (struct tnode *) tnode_get_child(tn, 2*i);
  646. put_child(t, tn, 2*i, NULL);
  647. if(!left)
  648. BUG();
  649. right = (struct tnode *) tnode_get_child(tn, 2*i+1);
  650. put_child(t, tn, 2*i+1, NULL);
  651. if(!right)
  652. BUG();
  653. size = tnode_child_length(left);
  654. for(j = 0; j < size; j++) {
  655. put_child(t, left, j, inode->child[j]);
  656. put_child(t, right, j, inode->child[j + size]);
  657. }
  658. put_child(t, tn, 2*i, resize(t, left));
  659. put_child(t, tn, 2*i+1, resize(t, right));
  660. tnode_free(inode);
  661. }
  662. }
  663. tnode_free(oldtnode);
  664. return tn;
  665. }
  666. static struct tnode *halve(struct trie *t, struct tnode *tn, int *err)
  667. {
  668. struct tnode *oldtnode = tn;
  669. struct node *left, *right;
  670. int i;
  671. int olen = tnode_child_length(tn);
  672. if(trie_debug) printk("In halve\n");
  673. tn=tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
  674. if (!tn) {
  675. *err = -ENOMEM;
  676. return oldtnode;
  677. }
  678. /*
  679. * Preallocate and store tnodes before the actual work so we
  680. * don't get into an inconsistent state if memory allocation
  681. * fails. In case of failure we return the oldnode and halve
  682. * of tnode is ignored.
  683. */
  684. for(i = 0; i < olen; i += 2) {
  685. left = tnode_get_child(oldtnode, i);
  686. right = tnode_get_child(oldtnode, i+1);
  687. /* Two nonempty children */
  688. if( left && right) {
  689. struct tnode *newBinNode =
  690. tnode_new(left->key, tn->pos + tn->bits, 1);
  691. if(!newBinNode) {
  692. *err = -ENOMEM;
  693. break;
  694. }
  695. put_child(t, tn, i/2, (struct node *)newBinNode);
  696. }
  697. }
  698. if(*err) {
  699. int size = tnode_child_length(tn);
  700. int j;
  701. for(j = 0; j < size; j++)
  702. if( tn->child[j])
  703. tnode_free((struct tnode *)tn->child[j]);
  704. tnode_free(tn);
  705. *err = -ENOMEM;
  706. return oldtnode;
  707. }
  708. for(i = 0; i < olen; i += 2) {
  709. left = tnode_get_child(oldtnode, i);
  710. right = tnode_get_child(oldtnode, i+1);
  711. /* At least one of the children is empty */
  712. if (left == NULL) {
  713. if (right == NULL) /* Both are empty */
  714. continue;
  715. put_child(t, tn, i/2, right);
  716. } else if (right == NULL)
  717. put_child(t, tn, i/2, left);
  718. /* Two nonempty children */
  719. else {
  720. struct tnode *newBinNode =
  721. (struct tnode *) tnode_get_child(tn, i/2);
  722. put_child(t, tn, i/2, NULL);
  723. if(!newBinNode)
  724. BUG();
  725. put_child(t, newBinNode, 0, left);
  726. put_child(t, newBinNode, 1, right);
  727. put_child(t, tn, i/2, resize(t, newBinNode));
  728. }
  729. }
  730. tnode_free(oldtnode);
  731. return tn;
  732. }
  733. static void *trie_init(struct trie *t)
  734. {
  735. if(t) {
  736. t->size = 0;
  737. t->trie = NULL;
  738. t->revision = 0;
  739. #ifdef CONFIG_IP_FIB_TRIE_STATS
  740. memset(&t->stats, 0, sizeof(struct trie_use_stats));
  741. #endif
  742. }
  743. return t;
  744. }
  745. static struct leaf_info *find_leaf_info(struct hlist_head *head, int plen)
  746. {
  747. struct hlist_node *node;
  748. struct leaf_info *li;
  749. hlist_for_each_entry(li, node, head, hlist) {
  750. if ( li->plen == plen )
  751. return li;
  752. }
  753. return NULL;
  754. }
  755. static inline struct list_head * get_fa_head(struct leaf *l, int plen)
  756. {
  757. struct list_head *fa_head=NULL;
  758. struct leaf_info *li = find_leaf_info(&l->list, plen);
  759. if(li)
  760. fa_head = &li->falh;
  761. return fa_head;
  762. }
  763. static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
  764. {
  765. struct leaf_info *li=NULL, *last=NULL;
  766. struct hlist_node *node, *tmp;
  767. write_lock_bh(&fib_lock);
  768. if(hlist_empty(head))
  769. hlist_add_head(&new->hlist, head);
  770. else {
  771. hlist_for_each_entry_safe(li, node, tmp, head, hlist) {
  772. if (new->plen > li->plen)
  773. break;
  774. last = li;
  775. }
  776. if(last)
  777. hlist_add_after(&last->hlist, &new->hlist);
  778. else
  779. hlist_add_before(&new->hlist, &li->hlist);
  780. }
  781. write_unlock_bh(&fib_lock);
  782. }
  783. static struct leaf *
  784. fib_find_node(struct trie *t, u32 key)
  785. {
  786. int pos;
  787. struct tnode *tn;
  788. struct node *n;
  789. pos = 0;
  790. n=t->trie;
  791. while (n != NULL && NODE_TYPE(n) == T_TNODE) {
  792. tn = (struct tnode *) n;
  793. check_tnode(tn);
  794. if(tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
  795. pos=tn->pos + tn->bits;
  796. n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
  797. }
  798. else
  799. break;
  800. }
  801. /* Case we have found a leaf. Compare prefixes */
  802. if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
  803. struct leaf *l = (struct leaf *) n;
  804. return l;
  805. }
  806. return NULL;
  807. }
  808. static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
  809. {
  810. int i = 0;
  811. int wasfull;
  812. t_key cindex, key;
  813. struct tnode *tp = NULL;
  814. if(!tn)
  815. BUG();
  816. key = tn->key;
  817. i = 0;
  818. while (tn != NULL && NODE_PARENT(tn) != NULL) {
  819. if( i > 10 ) {
  820. printk("Rebalance tn=%p \n", tn);
  821. if(tn) printk("tn->parent=%p \n", NODE_PARENT(tn));
  822. printk("Rebalance tp=%p \n", tp);
  823. if(tp) printk("tp->parent=%p \n", NODE_PARENT(tp));
  824. }
  825. if( i > 12 ) BUG();
  826. i++;
  827. tp = NODE_PARENT(tn);
  828. cindex = tkey_extract_bits(key, tp->pos, tp->bits);
  829. wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
  830. tn = (struct tnode *) resize (t, (struct tnode *)tn);
  831. tnode_put_child_reorg((struct tnode *)tp, cindex,(struct node*)tn, wasfull);
  832. if(!NODE_PARENT(tn))
  833. break;
  834. tn = NODE_PARENT(tn);
  835. }
  836. /* Handle last (top) tnode */
  837. if (IS_TNODE(tn))
  838. tn = (struct tnode*) resize(t, (struct tnode *)tn);
  839. return (struct node*) tn;
  840. }
  841. static struct list_head *
  842. fib_insert_node(struct trie *t, int *err, u32 key, int plen)
  843. {
  844. int pos, newpos;
  845. struct tnode *tp = NULL, *tn = NULL;
  846. struct node *n;
  847. struct leaf *l;
  848. int missbit;
  849. struct list_head *fa_head=NULL;
  850. struct leaf_info *li;
  851. t_key cindex;
  852. pos = 0;
  853. n=t->trie;
  854. /* If we point to NULL, stop. Either the tree is empty and we should
  855. * just put a new leaf in if, or we have reached an empty child slot,
  856. * and we should just put our new leaf in that.
  857. * If we point to a T_TNODE, check if it matches our key. Note that
  858. * a T_TNODE might be skipping any number of bits - its 'pos' need
  859. * not be the parent's 'pos'+'bits'!
  860. *
  861. * If it does match the current key, get pos/bits from it, extract
  862. * the index from our key, push the T_TNODE and walk the tree.
  863. *
  864. * If it doesn't, we have to replace it with a new T_TNODE.
  865. *
  866. * If we point to a T_LEAF, it might or might not have the same key
  867. * as we do. If it does, just change the value, update the T_LEAF's
  868. * value, and return it.
  869. * If it doesn't, we need to replace it with a T_TNODE.
  870. */
  871. while (n != NULL && NODE_TYPE(n) == T_TNODE) {
  872. tn = (struct tnode *) n;
  873. check_tnode(tn);
  874. if(tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
  875. tp = tn;
  876. pos=tn->pos + tn->bits;
  877. n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
  878. if(n && NODE_PARENT(n) != tn) {
  879. printk("BUG tn=%p, n->parent=%p\n", tn, NODE_PARENT(n));
  880. BUG();
  881. }
  882. }
  883. else
  884. break;
  885. }
  886. /*
  887. * n ----> NULL, LEAF or TNODE
  888. *
  889. * tp is n's (parent) ----> NULL or TNODE
  890. */
  891. if(tp && IS_LEAF(tp))
  892. BUG();
  893. /* Case 1: n is a leaf. Compare prefixes */
  894. if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
  895. struct leaf *l = ( struct leaf *) n;
  896. li = leaf_info_new(plen);
  897. if(! li) {
  898. *err = -ENOMEM;
  899. goto err;
  900. }
  901. fa_head = &li->falh;
  902. insert_leaf_info(&l->list, li);
  903. goto done;
  904. }
  905. t->size++;
  906. l = leaf_new();
  907. if(! l) {
  908. *err = -ENOMEM;
  909. goto err;
  910. }
  911. l->key = key;
  912. li = leaf_info_new(plen);
  913. if(! li) {
  914. tnode_free((struct tnode *) l);
  915. *err = -ENOMEM;
  916. goto err;
  917. }
  918. fa_head = &li->falh;
  919. insert_leaf_info(&l->list, li);
  920. /* Case 2: n is NULL, and will just insert a new leaf */
  921. if (t->trie && n == NULL) {
  922. NODE_SET_PARENT(l, tp);
  923. if (!tp)
  924. BUG();
  925. else {
  926. cindex = tkey_extract_bits(key, tp->pos, tp->bits);
  927. put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
  928. }
  929. }
  930. /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
  931. else {
  932. /*
  933. * Add a new tnode here
  934. * first tnode need some special handling
  935. */
  936. if (tp)
  937. pos=tp->pos+tp->bits;
  938. else
  939. pos=0;
  940. if(n) {
  941. newpos = tkey_mismatch(key, pos, n->key);
  942. tn = tnode_new(n->key, newpos, 1);
  943. }
  944. else {
  945. newpos = 0;
  946. tn = tnode_new(key, newpos, 1); /* First tnode */
  947. }
  948. if(!tn) {
  949. free_leaf_info(li);
  950. tnode_free((struct tnode *) l);
  951. *err = -ENOMEM;
  952. goto err;
  953. }
  954. NODE_SET_PARENT(tn, tp);
  955. missbit=tkey_extract_bits(key, newpos, 1);
  956. put_child(t, tn, missbit, (struct node *)l);
  957. put_child(t, tn, 1-missbit, n);
  958. if(tp) {
  959. cindex = tkey_extract_bits(key, tp->pos, tp->bits);
  960. put_child(t, (struct tnode *)tp, cindex, (struct node *)tn);
  961. }
  962. else {
  963. t->trie = (struct node*) tn; /* First tnode */
  964. tp = tn;
  965. }
  966. }
  967. if(tp && tp->pos+tp->bits > 32) {
  968. printk("ERROR tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
  969. tp, tp->pos, tp->bits, key, plen);
  970. }
  971. /* Rebalance the trie */
  972. t->trie = trie_rebalance(t, tp);
  973. done:
  974. t->revision++;
  975. err:;
  976. return fa_head;
  977. }
  978. static int
  979. fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
  980. struct nlmsghdr *nlhdr, struct netlink_skb_parms *req)
  981. {
  982. struct trie *t = (struct trie *) tb->tb_data;
  983. struct fib_alias *fa, *new_fa;
  984. struct list_head *fa_head=NULL;
  985. struct fib_info *fi;
  986. int plen = r->rtm_dst_len;
  987. int type = r->rtm_type;
  988. u8 tos = r->rtm_tos;
  989. u32 key, mask;
  990. int err;
  991. struct leaf *l;
  992. if (plen > 32)
  993. return -EINVAL;
  994. key = 0;
  995. if (rta->rta_dst)
  996. memcpy(&key, rta->rta_dst, 4);
  997. key = ntohl(key);
  998. if(trie_debug)
  999. printk("Insert table=%d %08x/%d\n", tb->tb_id, key, plen);
  1000. mask = ntohl( inet_make_mask(plen) );
  1001. if(key & ~mask)
  1002. return -EINVAL;
  1003. key = key & mask;
  1004. if ((fi = fib_create_info(r, rta, nlhdr, &err)) == NULL)
  1005. goto err;
  1006. l = fib_find_node(t, key);
  1007. fa = NULL;
  1008. if(l) {
  1009. fa_head = get_fa_head(l, plen);
  1010. fa = fib_find_alias(fa_head, tos, fi->fib_priority);
  1011. }
  1012. /* Now fa, if non-NULL, points to the first fib alias
  1013. * with the same keys [prefix,tos,priority], if such key already
  1014. * exists or to the node before which we will insert new one.
  1015. *
  1016. * If fa is NULL, we will need to allocate a new one and
  1017. * insert to the head of f.
  1018. *
  1019. * If f is NULL, no fib node matched the destination key
  1020. * and we need to allocate a new one of those as well.
  1021. */
  1022. if (fa &&
  1023. fa->fa_info->fib_priority == fi->fib_priority) {
  1024. struct fib_alias *fa_orig;
  1025. err = -EEXIST;
  1026. if (nlhdr->nlmsg_flags & NLM_F_EXCL)
  1027. goto out;
  1028. if (nlhdr->nlmsg_flags & NLM_F_REPLACE) {
  1029. struct fib_info *fi_drop;
  1030. u8 state;
  1031. write_lock_bh(&fib_lock);
  1032. fi_drop = fa->fa_info;
  1033. fa->fa_info = fi;
  1034. fa->fa_type = type;
  1035. fa->fa_scope = r->rtm_scope;
  1036. state = fa->fa_state;
  1037. fa->fa_state &= ~FA_S_ACCESSED;
  1038. write_unlock_bh(&fib_lock);
  1039. fib_release_info(fi_drop);
  1040. if (state & FA_S_ACCESSED)
  1041. rt_cache_flush(-1);
  1042. goto succeeded;
  1043. }
  1044. /* Error if we find a perfect match which
  1045. * uses the same scope, type, and nexthop
  1046. * information.
  1047. */
  1048. fa_orig = fa;
  1049. list_for_each_entry(fa, fa_orig->fa_list.prev, fa_list) {
  1050. if (fa->fa_tos != tos)
  1051. break;
  1052. if (fa->fa_info->fib_priority != fi->fib_priority)
  1053. break;
  1054. if (fa->fa_type == type &&
  1055. fa->fa_scope == r->rtm_scope &&
  1056. fa->fa_info == fi) {
  1057. goto out;
  1058. }
  1059. }
  1060. if (!(nlhdr->nlmsg_flags & NLM_F_APPEND))
  1061. fa = fa_orig;
  1062. }
  1063. err = -ENOENT;
  1064. if (!(nlhdr->nlmsg_flags&NLM_F_CREATE))
  1065. goto out;
  1066. err = -ENOBUFS;
  1067. new_fa = kmem_cache_alloc(fn_alias_kmem, SLAB_KERNEL);
  1068. if (new_fa == NULL)
  1069. goto out;
  1070. new_fa->fa_info = fi;
  1071. new_fa->fa_tos = tos;
  1072. new_fa->fa_type = type;
  1073. new_fa->fa_scope = r->rtm_scope;
  1074. new_fa->fa_state = 0;
  1075. #if 0
  1076. new_fa->dst = NULL;
  1077. #endif
  1078. /*
  1079. * Insert new entry to the list.
  1080. */
  1081. if(!fa_head) {
  1082. fa_head = fib_insert_node(t, &err, key, plen);
  1083. err = 0;
  1084. if(err)
  1085. goto out_free_new_fa;
  1086. }
  1087. write_lock_bh(&fib_lock);
  1088. list_add_tail(&new_fa->fa_list,
  1089. (fa ? &fa->fa_list : fa_head));
  1090. write_unlock_bh(&fib_lock);
  1091. rt_cache_flush(-1);
  1092. rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, nlhdr, req);
  1093. succeeded:
  1094. return 0;
  1095. out_free_new_fa:
  1096. kmem_cache_free(fn_alias_kmem, new_fa);
  1097. out:
  1098. fib_release_info(fi);
  1099. err:;
  1100. return err;
  1101. }
  1102. static inline int check_leaf(struct trie *t, struct leaf *l, t_key key, int *plen, const struct flowi *flp,
  1103. struct fib_result *res, int *err)
  1104. {
  1105. int i;
  1106. t_key mask;
  1107. struct leaf_info *li;
  1108. struct hlist_head *hhead = &l->list;
  1109. struct hlist_node *node;
  1110. hlist_for_each_entry(li, node, hhead, hlist) {
  1111. i = li->plen;
  1112. mask = ntohl(inet_make_mask(i));
  1113. if (l->key != (key & mask))
  1114. continue;
  1115. if (((*err) = fib_semantic_match(&li->falh, flp, res, l->key, mask, i)) == 0) {
  1116. *plen = i;
  1117. #ifdef CONFIG_IP_FIB_TRIE_STATS
  1118. t->stats.semantic_match_passed++;
  1119. #endif
  1120. return 1;
  1121. }
  1122. #ifdef CONFIG_IP_FIB_TRIE_STATS
  1123. t->stats.semantic_match_miss++;
  1124. #endif
  1125. }
  1126. return 0;
  1127. }
  1128. static int
  1129. fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
  1130. {
  1131. struct trie *t = (struct trie *) tb->tb_data;
  1132. int plen, ret = 0;
  1133. struct node *n;
  1134. struct tnode *pn;
  1135. int pos, bits;
  1136. t_key key=ntohl(flp->fl4_dst);
  1137. int chopped_off;
  1138. t_key cindex = 0;
  1139. int current_prefix_length = KEYLENGTH;
  1140. n = t->trie;
  1141. read_lock(&fib_lock);
  1142. if(!n)
  1143. goto failed;
  1144. #ifdef CONFIG_IP_FIB_TRIE_STATS
  1145. t->stats.gets++;
  1146. #endif
  1147. /* Just a leaf? */
  1148. if (IS_LEAF(n)) {
  1149. if( check_leaf(t, (struct leaf *)n, key, &plen, flp, res, &ret) )
  1150. goto found;
  1151. goto failed;
  1152. }
  1153. pn = (struct tnode *) n;
  1154. chopped_off = 0;
  1155. while (pn) {
  1156. pos = pn->pos;
  1157. bits = pn->bits;
  1158. if(!chopped_off)
  1159. cindex = tkey_extract_bits(MASK_PFX(key, current_prefix_length), pos, bits);
  1160. n = tnode_get_child(pn, cindex);
  1161. if (n == NULL) {
  1162. #ifdef CONFIG_IP_FIB_TRIE_STATS
  1163. t->stats.null_node_hit++;
  1164. #endif
  1165. goto backtrace;
  1166. }
  1167. if (IS_TNODE(n)) {
  1168. #define HL_OPTIMIZE
  1169. #ifdef HL_OPTIMIZE
  1170. struct tnode *cn = (struct tnode *)n;
  1171. t_key node_prefix, key_prefix, pref_mismatch;
  1172. int mp;
  1173. /*
  1174. * It's a tnode, and we can do some extra checks here if we
  1175. * like, to avoid descending into a dead-end branch.
  1176. * This tnode is in the parent's child array at index
  1177. * key[p_pos..p_pos+p_bits] but potentially with some bits
  1178. * chopped off, so in reality the index may be just a
  1179. * subprefix, padded with zero at the end.
  1180. * We can also take a look at any skipped bits in this
  1181. * tnode - everything up to p_pos is supposed to be ok,
  1182. * and the non-chopped bits of the index (se previous
  1183. * paragraph) are also guaranteed ok, but the rest is
  1184. * considered unknown.
  1185. *
  1186. * The skipped bits are key[pos+bits..cn->pos].
  1187. */
  1188. /* If current_prefix_length < pos+bits, we are already doing
  1189. * actual prefix matching, which means everything from
  1190. * pos+(bits-chopped_off) onward must be zero along some
  1191. * branch of this subtree - otherwise there is *no* valid
  1192. * prefix present. Here we can only check the skipped
  1193. * bits. Remember, since we have already indexed into the
  1194. * parent's child array, we know that the bits we chopped of
  1195. * *are* zero.
  1196. */
  1197. /* NOTA BENE: CHECKING ONLY SKIPPED BITS FOR THE NEW NODE HERE */
  1198. if (current_prefix_length < pos+bits) {
  1199. if (tkey_extract_bits(cn->key, current_prefix_length,
  1200. cn->pos - current_prefix_length) != 0 ||
  1201. !(cn->child[0]))
  1202. goto backtrace;
  1203. }
  1204. /*
  1205. * If chopped_off=0, the index is fully validated and we
  1206. * only need to look at the skipped bits for this, the new,
  1207. * tnode. What we actually want to do is to find out if
  1208. * these skipped bits match our key perfectly, or if we will
  1209. * have to count on finding a matching prefix further down,
  1210. * because if we do, we would like to have some way of
  1211. * verifying the existence of such a prefix at this point.
  1212. */
  1213. /* The only thing we can do at this point is to verify that
  1214. * any such matching prefix can indeed be a prefix to our
  1215. * key, and if the bits in the node we are inspecting that
  1216. * do not match our key are not ZERO, this cannot be true.
  1217. * Thus, find out where there is a mismatch (before cn->pos)
  1218. * and verify that all the mismatching bits are zero in the
  1219. * new tnode's key.
  1220. */
  1221. /* Note: We aren't very concerned about the piece of the key
  1222. * that precede pn->pos+pn->bits, since these have already been
  1223. * checked. The bits after cn->pos aren't checked since these are
  1224. * by definition "unknown" at this point. Thus, what we want to
  1225. * see is if we are about to enter the "prefix matching" state,
  1226. * and in that case verify that the skipped bits that will prevail
  1227. * throughout this subtree are zero, as they have to be if we are
  1228. * to find a matching prefix.
  1229. */
  1230. node_prefix = MASK_PFX(cn->key, cn->pos);
  1231. key_prefix = MASK_PFX(key, cn->pos);
  1232. pref_mismatch = key_prefix^node_prefix;
  1233. mp = 0;
  1234. /* In short: If skipped bits in this node do not match the search
  1235. * key, enter the "prefix matching" state.directly.
  1236. */
  1237. if (pref_mismatch) {
  1238. while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
  1239. mp++;
  1240. pref_mismatch = pref_mismatch <<1;
  1241. }
  1242. key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
  1243. if (key_prefix != 0)
  1244. goto backtrace;
  1245. if (current_prefix_length >= cn->pos)
  1246. current_prefix_length=mp;
  1247. }
  1248. #endif
  1249. pn = (struct tnode *)n; /* Descend */
  1250. chopped_off = 0;
  1251. continue;
  1252. }
  1253. if (IS_LEAF(n)) {
  1254. if( check_leaf(t, (struct leaf *)n, key, &plen, flp, res, &ret))
  1255. goto found;
  1256. }
  1257. backtrace:
  1258. chopped_off++;
  1259. /* As zero don't change the child key (cindex) */
  1260. while ((chopped_off <= pn->bits) && !(cindex & (1<<(chopped_off-1)))) {
  1261. chopped_off++;
  1262. }
  1263. /* Decrease current_... with bits chopped off */
  1264. if (current_prefix_length > pn->pos + pn->bits - chopped_off)
  1265. current_prefix_length = pn->pos + pn->bits - chopped_off;
  1266. /*
  1267. * Either we do the actual chop off according or if we have
  1268. * chopped off all bits in this tnode walk up to our parent.
  1269. */
  1270. if(chopped_off <= pn->bits)
  1271. cindex &= ~(1 << (chopped_off-1));
  1272. else {
  1273. if( NODE_PARENT(pn) == NULL)
  1274. goto failed;
  1275. /* Get Child's index */
  1276. cindex = tkey_extract_bits(pn->key, NODE_PARENT(pn)->pos, NODE_PARENT(pn)->bits);
  1277. pn = NODE_PARENT(pn);
  1278. chopped_off = 0;
  1279. #ifdef CONFIG_IP_FIB_TRIE_STATS
  1280. t->stats.backtrack++;
  1281. #endif
  1282. goto backtrace;
  1283. }
  1284. }
  1285. failed:
  1286. ret = 1;
  1287. found:
  1288. read_unlock(&fib_lock);
  1289. return ret;
  1290. }
  1291. static int trie_leaf_remove(struct trie *t, t_key key)
  1292. {
  1293. t_key cindex;
  1294. struct tnode *tp = NULL;
  1295. struct node *n = t->trie;
  1296. struct leaf *l;
  1297. if(trie_debug)
  1298. printk("entering trie_leaf_remove(%p)\n", n);
  1299. /* Note that in the case skipped bits, those bits are *not* checked!
  1300. * When we finish this, we will have NULL or a T_LEAF, and the
  1301. * T_LEAF may or may not match our key.
  1302. */
  1303. while (n != NULL && IS_TNODE(n)) {
  1304. struct tnode *tn = (struct tnode *) n;
  1305. check_tnode(tn);
  1306. n = tnode_get_child(tn ,tkey_extract_bits(key, tn->pos, tn->bits));
  1307. if(n && NODE_PARENT(n) != tn) {
  1308. printk("BUG tn=%p, n->parent=%p\n", tn, NODE_PARENT(n));
  1309. BUG();
  1310. }
  1311. }
  1312. l = (struct leaf *) n;
  1313. if(!n || !tkey_equals(l->key, key))
  1314. return 0;
  1315. /*
  1316. * Key found.
  1317. * Remove the leaf and rebalance the tree
  1318. */
  1319. t->revision++;
  1320. t->size--;
  1321. tp = NODE_PARENT(n);
  1322. tnode_free((struct tnode *) n);
  1323. if(tp) {
  1324. cindex = tkey_extract_bits(key, tp->pos, tp->bits);
  1325. put_child(t, (struct tnode *)tp, cindex, NULL);
  1326. t->trie = trie_rebalance(t, tp);
  1327. }
  1328. else
  1329. t->trie = NULL;
  1330. return 1;
  1331. }
  1332. static int
  1333. fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
  1334. struct nlmsghdr *nlhdr, struct netlink_skb_parms *req)
  1335. {
  1336. struct trie *t = (struct trie *) tb->tb_data;
  1337. u32 key, mask;
  1338. int plen = r->rtm_dst_len;
  1339. u8 tos = r->rtm_tos;
  1340. struct fib_alias *fa, *fa_to_delete;
  1341. struct list_head *fa_head;
  1342. struct leaf *l;
  1343. if (plen > 32)
  1344. return -EINVAL;
  1345. key = 0;
  1346. if (rta->rta_dst)
  1347. memcpy(&key, rta->rta_dst, 4);
  1348. key = ntohl(key);
  1349. mask = ntohl( inet_make_mask(plen) );
  1350. if(key & ~mask)
  1351. return -EINVAL;
  1352. key = key & mask;
  1353. l = fib_find_node(t, key);
  1354. if(!l)
  1355. return -ESRCH;
  1356. fa_head = get_fa_head(l, plen);
  1357. fa = fib_find_alias(fa_head, tos, 0);
  1358. if (!fa)
  1359. return -ESRCH;
  1360. if (trie_debug)
  1361. printk("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
  1362. fa_to_delete = NULL;
  1363. fa_head = fa->fa_list.prev;
  1364. list_for_each_entry(fa, fa_head, fa_list) {
  1365. struct fib_info *fi = fa->fa_info;
  1366. if (fa->fa_tos != tos)
  1367. break;
  1368. if ((!r->rtm_type ||
  1369. fa->fa_type == r->rtm_type) &&
  1370. (r->rtm_scope == RT_SCOPE_NOWHERE ||
  1371. fa->fa_scope == r->rtm_scope) &&
  1372. (!r->rtm_protocol ||
  1373. fi->fib_protocol == r->rtm_protocol) &&
  1374. fib_nh_match(r, nlhdr, rta, fi) == 0) {
  1375. fa_to_delete = fa;
  1376. break;
  1377. }
  1378. }
  1379. if (fa_to_delete) {
  1380. int kill_li = 0;
  1381. struct leaf_info *li;
  1382. fa = fa_to_delete;
  1383. rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id, nlhdr, req);
  1384. l = fib_find_node(t, key);
  1385. li = find_leaf_info(&l->list, plen);
  1386. write_lock_bh(&fib_lock);
  1387. list_del(&fa->fa_list);
  1388. if(list_empty(fa_head)) {
  1389. hlist_del(&li->hlist);
  1390. kill_li = 1;
  1391. }
  1392. write_unlock_bh(&fib_lock);
  1393. if(kill_li)
  1394. free_leaf_info(li);
  1395. if(hlist_empty(&l->list))
  1396. trie_leaf_remove(t, key);
  1397. if (fa->fa_state & FA_S_ACCESSED)
  1398. rt_cache_flush(-1);
  1399. fn_free_alias(fa);
  1400. return 0;
  1401. }
  1402. return -ESRCH;
  1403. }
  1404. static int trie_flush_list(struct trie *t, struct list_head *head)
  1405. {
  1406. struct fib_alias *fa, *fa_node;
  1407. int found = 0;
  1408. list_for_each_entry_safe(fa, fa_node, head, fa_list) {
  1409. struct fib_info *fi = fa->fa_info;
  1410. if (fi && (fi->fib_flags&RTNH_F_DEAD)) {
  1411. write_lock_bh(&fib_lock);
  1412. list_del(&fa->fa_list);
  1413. write_unlock_bh(&fib_lock);
  1414. fn_free_alias(fa);
  1415. found++;
  1416. }
  1417. }
  1418. return found;
  1419. }
  1420. static int trie_flush_leaf(struct trie *t, struct leaf *l)
  1421. {
  1422. int found = 0;
  1423. struct hlist_head *lih = &l->list;
  1424. struct hlist_node *node, *tmp;
  1425. struct leaf_info *li = NULL;
  1426. hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
  1427. found += trie_flush_list(t, &li->falh);
  1428. if (list_empty(&li->falh)) {
  1429. write_lock_bh(&fib_lock);
  1430. hlist_del(&li->hlist);
  1431. write_unlock_bh(&fib_lock);
  1432. free_leaf_info(li);
  1433. }
  1434. }
  1435. return found;
  1436. }
  1437. static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf)
  1438. {
  1439. struct node *c = (struct node *) thisleaf;
  1440. struct tnode *p;
  1441. int idx;
  1442. if(c == NULL) {
  1443. if(t->trie == NULL)
  1444. return NULL;
  1445. if (IS_LEAF(t->trie)) /* trie w. just a leaf */
  1446. return (struct leaf *) t->trie;
  1447. p = (struct tnode*) t->trie; /* Start */
  1448. }
  1449. else
  1450. p = (struct tnode *) NODE_PARENT(c);
  1451. while (p) {
  1452. int pos, last;
  1453. /* Find the next child of the parent */
  1454. if(c)
  1455. pos = 1 + tkey_extract_bits(c->key, p->pos, p->bits);
  1456. else
  1457. pos = 0;
  1458. last = 1 << p->bits;
  1459. for(idx = pos; idx < last ; idx++) {
  1460. if( p->child[idx]) {
  1461. /* Decend if tnode */
  1462. while (IS_TNODE(p->child[idx])) {
  1463. p = (struct tnode*) p->child[idx];
  1464. idx = 0;
  1465. /* Rightmost non-NULL branch */
  1466. if( p && IS_TNODE(p) )
  1467. while ( p->child[idx] == NULL && idx < (1 << p->bits) ) idx++;
  1468. /* Done with this tnode? */
  1469. if( idx >= (1 << p->bits) || p->child[idx] == NULL )
  1470. goto up;
  1471. }
  1472. return (struct leaf*) p->child[idx];
  1473. }
  1474. }
  1475. up:
  1476. /* No more children go up one step */
  1477. c = (struct node*) p;
  1478. p = (struct tnode *) NODE_PARENT(p);
  1479. }
  1480. return NULL; /* Ready. Root of trie */
  1481. }
  1482. static int fn_trie_flush(struct fib_table *tb)
  1483. {
  1484. struct trie *t = (struct trie *) tb->tb_data;
  1485. struct leaf *ll = NULL, *l = NULL;
  1486. int found = 0, h;
  1487. t->revision++;
  1488. for (h=0; (l = nextleaf(t, l)) != NULL; h++) {
  1489. found += trie_flush_leaf(t, l);
  1490. if (ll && hlist_empty(&ll->list))
  1491. trie_leaf_remove(t, ll->key);
  1492. ll = l;
  1493. }
  1494. if (ll && hlist_empty(&ll->list))
  1495. trie_leaf_remove(t, ll->key);
  1496. if(trie_debug)
  1497. printk("trie_flush found=%d\n", found);
  1498. return found;
  1499. }
  1500. static int trie_last_dflt=-1;
  1501. static void
  1502. fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
  1503. {
  1504. struct trie *t = (struct trie *) tb->tb_data;
  1505. int order, last_idx;
  1506. struct fib_info *fi = NULL;
  1507. struct fib_info *last_resort;
  1508. struct fib_alias *fa = NULL;
  1509. struct list_head *fa_head;
  1510. struct leaf *l;
  1511. last_idx = -1;
  1512. last_resort = NULL;
  1513. order = -1;
  1514. read_lock(&fib_lock);
  1515. l = fib_find_node(t, 0);
  1516. if(!l)
  1517. goto out;
  1518. fa_head = get_fa_head(l, 0);
  1519. if(!fa_head)
  1520. goto out;
  1521. if (list_empty(fa_head))
  1522. goto out;
  1523. list_for_each_entry(fa, fa_head, fa_list) {
  1524. struct fib_info *next_fi = fa->fa_info;
  1525. if (fa->fa_scope != res->scope ||
  1526. fa->fa_type != RTN_UNICAST)
  1527. continue;
  1528. if (next_fi->fib_priority > res->fi->fib_priority)
  1529. break;
  1530. if (!next_fi->fib_nh[0].nh_gw ||
  1531. next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
  1532. continue;
  1533. fa->fa_state |= FA_S_ACCESSED;
  1534. if (fi == NULL) {
  1535. if (next_fi != res->fi)
  1536. break;
  1537. } else if (!fib_detect_death(fi, order, &last_resort,
  1538. &last_idx, &trie_last_dflt)) {
  1539. if (res->fi)
  1540. fib_info_put(res->fi);
  1541. res->fi = fi;
  1542. atomic_inc(&fi->fib_clntref);
  1543. trie_last_dflt = order;
  1544. goto out;
  1545. }
  1546. fi = next_fi;
  1547. order++;
  1548. }
  1549. if (order <= 0 || fi == NULL) {
  1550. trie_last_dflt = -1;
  1551. goto out;
  1552. }
  1553. if (!fib_detect_death(fi, order, &last_resort, &last_idx, &trie_last_dflt)) {
  1554. if (res->fi)
  1555. fib_info_put(res->fi);
  1556. res->fi = fi;
  1557. atomic_inc(&fi->fib_clntref);
  1558. trie_last_dflt = order;
  1559. goto out;
  1560. }
  1561. if (last_idx >= 0) {
  1562. if (res->fi)
  1563. fib_info_put(res->fi);
  1564. res->fi = last_resort;
  1565. if (last_resort)
  1566. atomic_inc(&last_resort->fib_clntref);
  1567. }
  1568. trie_last_dflt = last_idx;
  1569. out:;
  1570. read_unlock(&fib_lock);
  1571. }
  1572. static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fib_table *tb,
  1573. struct sk_buff *skb, struct netlink_callback *cb)
  1574. {
  1575. int i, s_i;
  1576. struct fib_alias *fa;
  1577. u32 xkey=htonl(key);
  1578. s_i=cb->args[3];
  1579. i = 0;
  1580. list_for_each_entry(fa, fah, fa_list) {
  1581. if (i < s_i) {
  1582. i++;
  1583. continue;
  1584. }
  1585. if (fa->fa_info->fib_nh == NULL) {
  1586. printk("Trie error _fib_nh=NULL in fa[%d] k=%08x plen=%d\n", i, key, plen);
  1587. i++;
  1588. continue;
  1589. }
  1590. if (fa->fa_info == NULL) {
  1591. printk("Trie error fa_info=NULL in fa[%d] k=%08x plen=%d\n", i, key, plen);
  1592. i++;
  1593. continue;
  1594. }
  1595. if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
  1596. cb->nlh->nlmsg_seq,
  1597. RTM_NEWROUTE,
  1598. tb->tb_id,
  1599. fa->fa_type,
  1600. fa->fa_scope,
  1601. &xkey,
  1602. plen,
  1603. fa->fa_tos,
  1604. fa->fa_info, 0) < 0) {
  1605. cb->args[3] = i;
  1606. return -1;
  1607. }
  1608. i++;
  1609. }
  1610. cb->args[3]=i;
  1611. return skb->len;
  1612. }
  1613. static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, struct sk_buff *skb,
  1614. struct netlink_callback *cb)
  1615. {
  1616. int h, s_h;
  1617. struct list_head *fa_head;
  1618. struct leaf *l = NULL;
  1619. s_h=cb->args[2];
  1620. for (h=0; (l = nextleaf(t, l)) != NULL; h++) {
  1621. if (h < s_h)
  1622. continue;
  1623. if (h > s_h)
  1624. memset(&cb->args[3], 0,
  1625. sizeof(cb->args) - 3*sizeof(cb->args[0]));
  1626. fa_head = get_fa_head(l, plen);
  1627. if(!fa_head)
  1628. continue;
  1629. if(list_empty(fa_head))
  1630. continue;
  1631. if (fn_trie_dump_fa(l->key, plen, fa_head, tb, skb, cb)<0) {
  1632. cb->args[2]=h;
  1633. return -1;
  1634. }
  1635. }
  1636. cb->args[2]=h;
  1637. return skb->len;
  1638. }
  1639. static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb)
  1640. {
  1641. int m, s_m;
  1642. struct trie *t = (struct trie *) tb->tb_data;
  1643. s_m = cb->args[1];
  1644. read_lock(&fib_lock);
  1645. for (m=0; m<=32; m++) {
  1646. if (m < s_m)
  1647. continue;
  1648. if (m > s_m)
  1649. memset(&cb->args[2], 0,
  1650. sizeof(cb->args) - 2*sizeof(cb->args[0]));
  1651. if (fn_trie_dump_plen(t, 32-m, tb, skb, cb)<0) {
  1652. cb->args[1] = m;
  1653. goto out;
  1654. }
  1655. }
  1656. read_unlock(&fib_lock);
  1657. cb->args[1] = m;
  1658. return skb->len;
  1659. out:
  1660. read_unlock(&fib_lock);
  1661. return -1;
  1662. }
  1663. /* Fix more generic FIB names for init later */
  1664. #ifdef CONFIG_IP_MULTIPLE_TABLES
  1665. struct fib_table * fib_hash_init(int id)
  1666. #else
  1667. struct fib_table * __init fib_hash_init(int id)
  1668. #endif
  1669. {
  1670. struct fib_table *tb;
  1671. struct trie *t;
  1672. if (fn_alias_kmem == NULL)
  1673. fn_alias_kmem = kmem_cache_create("ip_fib_alias",
  1674. sizeof(struct fib_alias),
  1675. 0, SLAB_HWCACHE_ALIGN,
  1676. NULL, NULL);
  1677. tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
  1678. GFP_KERNEL);
  1679. if (tb == NULL)
  1680. return NULL;
  1681. tb->tb_id = id;
  1682. tb->tb_lookup = fn_trie_lookup;
  1683. tb->tb_insert = fn_trie_insert;
  1684. tb->tb_delete = fn_trie_delete;
  1685. tb->tb_flush = fn_trie_flush;
  1686. tb->tb_select_default = fn_trie_select_default;
  1687. tb->tb_dump = fn_trie_dump;
  1688. memset(tb->tb_data, 0, sizeof(struct trie));
  1689. t = (struct trie *) tb->tb_data;
  1690. trie_init(t);
  1691. if (id == RT_TABLE_LOCAL)
  1692. trie_local=t;
  1693. else if (id == RT_TABLE_MAIN)
  1694. trie_main=t;
  1695. if (id == RT_TABLE_LOCAL)
  1696. printk("IPv4 FIB: Using LC-trie version %s\n", VERSION);
  1697. return tb;
  1698. }
  1699. /* Trie dump functions */
  /*
   * Write @n blanks to @seq.  A zero or negative @n writes nothing; the
   * explicit "> 0" keeps a negative count from counting down through the
   * whole int range.
   */
  static void putspace_seq(struct seq_file *seq, int n)
  {
      while (n-- > 0)
          seq_printf(seq, " ");
  }
  /*
   * Write the low @bits bits of @v to @seq as '0'/'1' characters, most
   * significant bit first.  A zero or negative @bits writes nothing.
   *
   * The mask is built from an unsigned constant so that bit 31 can be
   * tested without a signed-overflow shift; positions at or beyond the
   * width of @v are printed as '0' instead of shifting out of range.
   */
  static void printbin_seq(struct seq_file *seq, unsigned int v, int bits)
  {
      const int width = sizeof(v) * 8;

      while (bits-- > 0) {
          int set = bits < width && (v & (1U << bits)) != 0;

          seq_printf(seq, "%s", set ? "1" : "0");
      }
  }
  1709. static void printnode_seq(struct seq_file *seq, int indent, struct node *n,
  1710. int pend, int cindex, int bits)
  1711. {
  1712. putspace_seq(seq, indent);
  1713. if (IS_LEAF(n))
  1714. seq_printf(seq, "|");
  1715. else
  1716. seq_printf(seq, "+");
  1717. if (bits) {
  1718. seq_printf(seq, "%d/", cindex);
  1719. printbin_seq(seq, cindex, bits);
  1720. seq_printf(seq, ": ");
  1721. }
  1722. else
  1723. seq_printf(seq, "<root>: ");
  1724. seq_printf(seq, "%s:%p ", IS_LEAF(n)?"Leaf":"Internal node", n);
  1725. if (IS_LEAF(n))
  1726. seq_printf(seq, "key=%d.%d.%d.%d\n",
  1727. n->key >> 24, (n->key >> 16) % 256, (n->key >> 8) % 256, n->key % 256);
  1728. else {
  1729. int plen=((struct tnode *)n)->pos;
  1730. t_key prf=MASK_PFX(n->key, plen);
  1731. seq_printf(seq, "key=%d.%d.%d.%d/%d\n",
  1732. prf >> 24, (prf >> 16) % 256, (prf >> 8) % 256, prf % 256, plen);
  1733. }
  1734. if (IS_LEAF(n)) {
  1735. struct leaf *l=(struct leaf *)n;
  1736. struct fib_alias *fa;
  1737. int i;
  1738. for (i=32; i>=0; i--)
  1739. if(find_leaf_info(&l->list, i)) {
  1740. struct list_head *fa_head = get_fa_head(l, i);
  1741. if(!fa_head)
  1742. continue;
  1743. if(list_empty(fa_head))
  1744. continue;
  1745. putspace_seq(seq, indent+2);
  1746. seq_printf(seq, "{/%d...dumping}\n", i);
  1747. list_for_each_entry(fa, fa_head, fa_list) {
  1748. putspace_seq(seq, indent+2);
  1749. if (fa->fa_info->fib_nh == NULL) {
  1750. seq_printf(seq, "Error _fib_nh=NULL\n");
  1751. continue;
  1752. }
  1753. if (fa->fa_info == NULL) {
  1754. seq_printf(seq, "Error fa_info=NULL\n");
  1755. continue;
  1756. }
  1757. seq_printf(seq, "{type=%d scope=%d TOS=%d}\n",
  1758. fa->fa_type,
  1759. fa->fa_scope,
  1760. fa->fa_tos);
  1761. }
  1762. }
  1763. }
  1764. else if (IS_TNODE(n)) {
  1765. struct tnode *tn=(struct tnode *)n;
  1766. putspace_seq(seq, indent); seq_printf(seq, "| ");
  1767. seq_printf(seq, "{key prefix=%08x/", tn->key&TKEY_GET_MASK(0, tn->pos));
  1768. printbin_seq(seq, tkey_extract_bits(tn->key, 0, tn->pos), tn->pos);
  1769. seq_printf(seq, "}\n");
  1770. putspace_seq(seq, indent); seq_printf(seq, "| ");
  1771. seq_printf(seq, "{pos=%d", tn->pos);
  1772. seq_printf(seq, " (skip=%d bits)", tn->pos - pend);
  1773. seq_printf(seq, " bits=%d (%u children)}\n", tn->bits, (1 << tn->bits));
  1774. putspace_seq(seq, indent); seq_printf(seq, "| ");
  1775. seq_printf(seq, "{empty=%d full=%d}\n", tn->empty_children, tn->full_children);
  1776. }
  1777. }
  1778. static void trie_dump_seq(struct seq_file *seq, struct trie *t)
  1779. {
  1780. struct node *n=t->trie;
  1781. int cindex=0;
  1782. int indent=1;
  1783. int pend=0;
  1784. int depth = 0;
  1785. read_lock(&fib_lock);
  1786. seq_printf(seq, "------ trie_dump of t=%p ------\n", t);
  1787. if (n) {
  1788. printnode_seq(seq, indent, n, pend, cindex, 0);
  1789. if (IS_TNODE(n)) {
  1790. struct tnode *tn=(struct tnode *)n;
  1791. pend = tn->pos+tn->bits;
  1792. putspace_seq(seq, indent); seq_printf(seq, "\\--\n");
  1793. indent += 3;
  1794. depth++;
  1795. while (tn && cindex < (1 << tn->bits)) {
  1796. if (tn->child[cindex]) {
  1797. /* Got a child */
  1798. printnode_seq(seq, indent, tn->child[cindex], pend, cindex, tn->bits);
  1799. if (IS_LEAF(tn->child[cindex])) {
  1800. cindex++;
  1801. }
  1802. else {
  1803. /*
  1804. * New tnode. Decend one level
  1805. */
  1806. depth++;
  1807. n=tn->child[cindex];
  1808. tn=(struct tnode *)n;
  1809. pend=tn->pos+tn->bits;
  1810. putspace_seq(seq, indent); seq_printf(seq, "\\--\n");
  1811. indent+=3;
  1812. cindex=0;
  1813. }
  1814. }
  1815. else
  1816. cindex++;
  1817. /*
  1818. * Test if we are done
  1819. */
  1820. while (cindex >= (1 << tn->bits)) {
  1821. /*
  1822. * Move upwards and test for root
  1823. * pop off all traversed nodes
  1824. */
  1825. if (NODE_PARENT(tn) == NULL) {
  1826. tn = NULL;
  1827. n = NULL;
  1828. break;
  1829. }
  1830. else {
  1831. cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits);
  1832. tn = NODE_PARENT(tn);
  1833. cindex++;
  1834. n=(struct node *)tn;
  1835. pend=tn->pos+tn->bits;
  1836. indent-=3;
  1837. depth--;
  1838. }
  1839. }
  1840. }
  1841. }
  1842. else n = NULL;
  1843. }
  1844. else seq_printf(seq, "------ trie is empty\n");
  1845. read_unlock(&fib_lock);
  1846. }
  1847. static struct trie_stat *trie_stat_new(void)
  1848. {
  1849. struct trie_stat *s = kmalloc(sizeof(struct trie_stat), GFP_KERNEL);
  1850. int i;
  1851. if(s) {
  1852. s->totdepth = 0;
  1853. s->maxdepth = 0;
  1854. s->tnodes = 0;
  1855. s->leaves = 0;
  1856. s->nullpointers = 0;
  1857. for(i=0; i< MAX_CHILDS; i++)
  1858. s->nodesizes[i] = 0;
  1859. }
  1860. return s;
  1861. }
  1862. static struct trie_stat *trie_collect_stats(struct trie *t)
  1863. {
  1864. struct node *n=t->trie;
  1865. struct trie_stat *s = trie_stat_new();
  1866. int cindex = 0;
  1867. int indent = 1;
  1868. int pend = 0;
  1869. int depth = 0;
  1870. read_lock(&fib_lock);
  1871. if (s) {
  1872. if (n) {
  1873. if (IS_TNODE(n)) {
  1874. struct tnode *tn = (struct tnode *)n;
  1875. pend=tn->pos+tn->bits;
  1876. indent += 3;
  1877. s->nodesizes[tn->bits]++;
  1878. depth++;
  1879. while (tn && cindex < (1 << tn->bits)) {
  1880. if (tn->child[cindex]) {
  1881. /* Got a child */
  1882. if (IS_LEAF(tn->child[cindex])) {
  1883. cindex++;
  1884. /* stats */
  1885. if (depth > s->maxdepth)
  1886. s->maxdepth = depth;
  1887. s->totdepth += depth;
  1888. s->leaves++;
  1889. }
  1890. else {
  1891. /*
  1892. * New tnode. Decend one level
  1893. */
  1894. s->tnodes++;
  1895. s->nodesizes[tn->bits]++;
  1896. depth++;
  1897. n = tn->child[cindex];
  1898. tn = (struct tnode *)n;
  1899. pend = tn->pos+tn->bits;
  1900. indent += 3;
  1901. cindex = 0;
  1902. }
  1903. }
  1904. else {
  1905. cindex++;
  1906. s->nullpointers++;
  1907. }
  1908. /*
  1909. * Test if we are done
  1910. */
  1911. while (cindex >= (1 << tn->bits)) {
  1912. /*
  1913. * Move upwards and test for root
  1914. * pop off all traversed nodes
  1915. */
  1916. if (NODE_PARENT(tn) == NULL) {
  1917. tn = NULL;
  1918. n = NULL;
  1919. break;
  1920. }
  1921. else {
  1922. cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits);
  1923. tn = NODE_PARENT(tn);
  1924. cindex++;
  1925. n = (struct node *)tn;
  1926. pend=tn->pos+tn->bits;
  1927. indent -= 3;
  1928. depth--;
  1929. }
  1930. }
  1931. }
  1932. }
  1933. else n = NULL;
  1934. }
  1935. }
  1936. read_unlock(&fib_lock);
  1937. return s;
  1938. }
  1939. #ifdef CONFIG_PROC_FS
  /*
   * First record after the start token.  Always NULL: @seq is not used and
   * no per-entry records are produced.
   */
  static struct fib_alias *fib_triestat_get_first(struct seq_file *seq)
  {
      struct fib_alias *none = NULL;

      return none;
  }
  /*
   * Record following the current one.  Always NULL: @seq is not used and
   * no per-entry records are produced.
   */
  static struct fib_alias *fib_triestat_get_next(struct seq_file *seq)
  {
      struct fib_alias *none = NULL;

      return none;
  }
  1948. static void *fib_triestat_seq_start(struct seq_file *seq, loff_t *pos)
  1949. {
  1950. void *v = NULL;
  1951. if (ip_fib_main_table)
  1952. v = *pos ? fib_triestat_get_next(seq) : SEQ_START_TOKEN;
  1953. return v;
  1954. }
  1955. static void *fib_triestat_seq_next(struct seq_file *seq, void *v, loff_t *pos)
  1956. {
  1957. ++*pos;
  1958. return v == SEQ_START_TOKEN ? fib_triestat_get_first(seq) : fib_triestat_get_next(seq);
  1959. }
  /* seq_file ->stop: nothing to release, so the body is empty. */
  static void fib_triestat_seq_stop(struct seq_file *seq, void *v)
  {
      return;
  }
  /*
   * This outputs /proc/net/fib_triestat
   *
   * It always works in backward compatibility mode.
   * The format of the file is not supposed to be changed.
   */
  1969. static void collect_and_show(struct trie *t, struct seq_file *seq)
  1970. {
  1971. int bytes = 0; /* How many bytes are used, a ref is 4 bytes */
  1972. int i, max, pointers;
  1973. struct trie_stat *stat;
  1974. int avdepth;
  1975. stat = trie_collect_stats(t);
  1976. bytes=0;
  1977. seq_printf(seq, "trie=%p\n", t);
  1978. if (stat) {
  1979. if (stat->leaves)
  1980. avdepth=stat->totdepth*100 / stat->leaves;
  1981. else
  1982. avdepth=0;
  1983. seq_printf(seq, "Aver depth: %d.%02d\n", avdepth / 100, avdepth % 100 );
  1984. seq_printf(seq, "Max depth: %4d\n", stat->maxdepth);
  1985. seq_printf(seq, "Leaves: %d\n", stat->leaves);
  1986. bytes += sizeof(struct leaf) * stat->leaves;
  1987. seq_printf(seq, "Internal nodes: %d\n", stat->tnodes);
  1988. bytes += sizeof(struct tnode) * stat->tnodes;
  1989. max = MAX_CHILDS-1;
  1990. while (max >= 0 && stat->nodesizes[max] == 0)
  1991. max--;
  1992. pointers = 0;
  1993. for (i = 1; i <= max; i++)
  1994. if (stat->nodesizes[i] != 0) {
  1995. seq_printf(seq, " %d: %d", i, stat->nodesizes[i]);
  1996. pointers += (1<<i) * stat->nodesizes[i];
  1997. }
  1998. seq_printf(seq, "\n");
  1999. seq_printf(seq, "Pointers: %d\n", pointers);
  2000. bytes += sizeof(struct node *) * pointers;
  2001. seq_printf(seq, "Null ptrs: %d\n", stat->nullpointers);
  2002. seq_printf(seq, "Total size: %d kB\n", bytes / 1024);
  2003. kfree(stat);
  2004. }
  2005. #ifdef CONFIG_IP_FIB_TRIE_STATS
  2006. seq_printf(seq, "Counters:\n---------\n");
  2007. seq_printf(seq,"gets = %d\n", t->stats.gets);
  2008. seq_printf(seq,"backtracks = %d\n", t->stats.backtrack);
  2009. seq_printf(seq,"semantic match passed = %d\n", t->stats.semantic_match_passed);
  2010. seq_printf(seq,"semantic match miss = %d\n", t->stats.semantic_match_miss);
  2011. seq_printf(seq,"null node hit= %d\n", t->stats.null_node_hit);
  2012. seq_printf(seq,"skipped node resize = %d\n", t->stats.resize_node_skipped);
  2013. #ifdef CLEAR_STATS
  2014. memset(&(t->stats), 0, sizeof(t->stats));
  2015. #endif
  2016. #endif /* CONFIG_IP_FIB_TRIE_STATS */
  2017. }
  2018. static int fib_triestat_seq_show(struct seq_file *seq, void *v)
  2019. {
  2020. char bf[128];
  2021. if (v == SEQ_START_TOKEN) {
  2022. seq_printf(seq, "Basic info: size of leaf: %Zd bytes, size of tnode: %Zd bytes.\n",
  2023. sizeof(struct leaf), sizeof(struct tnode));
  2024. if (trie_local)
  2025. collect_and_show(trie_local, seq);
  2026. if (trie_main)
  2027. collect_and_show(trie_main, seq);
  2028. }
  2029. else {
  2030. snprintf(bf, sizeof(bf),
  2031. "*\t%08X\t%08X", 200, 400);
  2032. seq_printf(seq, "%-127s\n", bf);
  2033. }
  2034. return 0;
  2035. }
  2036. static struct seq_operations fib_triestat_seq_ops = {
  2037. .start = fib_triestat_seq_start,
  2038. .next = fib_triestat_seq_next,
  2039. .stop = fib_triestat_seq_stop,
  2040. .show = fib_triestat_seq_show,
  2041. };
  2042. static int fib_triestat_seq_open(struct inode *inode, struct file *file)
  2043. {
  2044. struct seq_file *seq;
  2045. int rc = -ENOMEM;
  2046. rc = seq_open(file, &fib_triestat_seq_ops);
  2047. if (rc)
  2048. goto out_kfree;
  2049. seq = file->private_data;
  2050. out:
  2051. return rc;
  2052. out_kfree:
  2053. goto out;
  2054. }
  2055. static struct file_operations fib_triestat_seq_fops = {
  2056. .owner = THIS_MODULE,
  2057. .open = fib_triestat_seq_open,
  2058. .read = seq_read,
  2059. .llseek = seq_lseek,
  2060. .release = seq_release_private,
  2061. };
  2062. int __init fib_stat_proc_init(void)
  2063. {
  2064. if (!proc_net_fops_create("fib_triestat", S_IRUGO, &fib_triestat_seq_fops))
  2065. return -ENOMEM;
  2066. return 0;
  2067. }
  2068. void __init fib_stat_proc_exit(void)
  2069. {
  2070. proc_net_remove("fib_triestat");
  2071. }
  /*
   * First record after the start token.  Always NULL: @seq is not used and
   * no per-entry records are produced.
   */
  static struct fib_alias *fib_trie_get_first(struct seq_file *seq)
  {
      struct fib_alias *none = NULL;

      return none;
  }
  /*
   * Record following the current one.  Always NULL: @seq is not used and
   * no per-entry records are produced.
   */
  static struct fib_alias *fib_trie_get_next(struct seq_file *seq)
  {
      struct fib_alias *none = NULL;

      return none;
  }
  2080. static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
  2081. {
  2082. void *v = NULL;
  2083. if (ip_fib_main_table)
  2084. v = *pos ? fib_trie_get_next(seq) : SEQ_START_TOKEN;
  2085. return v;
  2086. }
  2087. static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
  2088. {
  2089. ++*pos;
  2090. return v == SEQ_START_TOKEN ? fib_trie_get_first(seq) : fib_trie_get_next(seq);
  2091. }
  /* seq_file ->stop: nothing to release, so the body is empty. */
  static void fib_trie_seq_stop(struct seq_file *seq, void *v)
  {
      return;
  }
  2095. /*
  2096. * This outputs /proc/net/fib_trie.
  2097. *
  2098. * It always works in backward compatibility mode.
  2099. * The format of the file is not supposed to be changed.
  2100. */
  2101. static int fib_trie_seq_show(struct seq_file *seq, void *v)
  2102. {
  2103. char bf[128];
  2104. if (v == SEQ_START_TOKEN) {
  2105. if (trie_local)
  2106. trie_dump_seq(seq, trie_local);
  2107. if (trie_main)
  2108. trie_dump_seq(seq, trie_main);
  2109. }
  2110. else {
  2111. snprintf(bf, sizeof(bf),
  2112. "*\t%08X\t%08X", 200, 400);
  2113. seq_printf(seq, "%-127s\n", bf);
  2114. }
  2115. return 0;
  2116. }
  2117. static struct seq_operations fib_trie_seq_ops = {
  2118. .start = fib_trie_seq_start,
  2119. .next = fib_trie_seq_next,
  2120. .stop = fib_trie_seq_stop,
  2121. .show = fib_trie_seq_show,
  2122. };
  2123. static int fib_trie_seq_open(struct inode *inode, struct file *file)
  2124. {
  2125. struct seq_file *seq;
  2126. int rc = -ENOMEM;
  2127. rc = seq_open(file, &fib_trie_seq_ops);
  2128. if (rc)
  2129. goto out_kfree;
  2130. seq = file->private_data;
  2131. out:
  2132. return rc;
  2133. out_kfree:
  2134. goto out;
  2135. }
  2136. static struct file_operations fib_trie_seq_fops = {
  2137. .owner = THIS_MODULE,
  2138. .open = fib_trie_seq_open,
  2139. .read = seq_read,
  2140. .llseek = seq_lseek,
  2141. .release = seq_release_private,
  2142. };
  2143. int __init fib_proc_init(void)
  2144. {
  2145. if (!proc_net_fops_create("fib_trie", S_IRUGO, &fib_trie_seq_fops))
  2146. return -ENOMEM;
  2147. return 0;
  2148. }
  2149. void __init fib_proc_exit(void)
  2150. {
  2151. proc_net_remove("fib_trie");
  2152. }
  2153. #endif /* CONFIG_PROC_FS */