- /*
- * net/sched/sch_api.c Packet scheduler API.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
- *
- * Fixes:
- *
- * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
- * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
- * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
- */
- #include <linux/config.h>
- #include <linux/module.h>
- #include <linux/types.h>
- #include <linux/kernel.h>
- #include <linux/sched.h>
- #include <linux/string.h>
- #include <linux/mm.h>
- #include <linux/socket.h>
- #include <linux/sockios.h>
- #include <linux/in.h>
- #include <linux/errno.h>
- #include <linux/interrupt.h>
- #include <linux/netdevice.h>
- #include <linux/skbuff.h>
- #include <linux/rtnetlink.h>
- #include <linux/init.h>
- #include <linux/proc_fs.h>
- #include <linux/seq_file.h>
- #include <linux/kmod.h>
- #include <linux/list.h>
- #include <linux/bitops.h>
- #include <net/sock.h>
- #include <net/pkt_sched.h>
- #include <asm/processor.h>
- #include <asm/uaccess.h>
- #include <asm/system.h>
- static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, u32 clid,
- struct Qdisc *old, struct Qdisc *new);
- static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
- struct Qdisc *q, unsigned long cl, int event);
- /*
- Short review.
- -------------
- This file consists of two interrelated parts:
- 1. queueing disciplines manager frontend.
- 2. traffic classes manager frontend.
- Generally, a queueing discipline ("qdisc") is a black box,
- which is able to enqueue packets and to dequeue them (when
- the device is ready to send something) in the order and at the times
- determined by the algorithm hidden inside it.
- qdiscs are divided into two categories:
- - "queues", which have no internal structure visible from outside.
- - "schedulers", which split all the packets into "traffic classes",
- using "packet classifiers" (look at cls_api.c)
- In turn, classes may have child qdiscs (as a rule, queues)
- attached to them etc. etc. etc.
- The goal of the routines in this file is to translate
- the information supplied by the user in the form of handles
- into a form more intelligible to the kernel, to perform some sanity
- checks and the part of the work that is common to all qdiscs,
- and to provide rtnetlink notifications.
- All real intelligent work is done inside qdisc modules.
- Every discipline has two major routines: enqueue and dequeue.
- ---dequeue
- dequeue usually returns an skb to send. It is allowed to return NULL,
- but that does not mean the queue is empty; it just means that the
- discipline does not want to send anything at this time.
- The queue is really empty only if q->q.qlen == 0.
- For complicated disciplines with multiple internal queues, q->q is not
- the real packet queue, but q->q.qlen must still be valid.
- ---enqueue
- enqueue returns 0 if the packet was enqueued successfully.
- If a packet (this one or another one) was dropped, it returns
- a non-zero error code:
- NET_XMIT_DROP - this packet was dropped.
- Expected action: do not back off, but wait until the queue clears.
- NET_XMIT_CN - this packet was probably enqueued, but another one was dropped.
- Expected action: back off or ignore.
- NET_XMIT_POLICED - dropped by the policer.
- Expected action: back off or report an error to real-time apps.
- Auxiliary routines:
- ---requeue
- requeues a packet that was dequeued earlier. It is used for non-standard
- or just buggy devices, which can defer output even if dev->tbusy == 0.
- ---reset
- returns the qdisc to its initial state: purges all buffers, clears all
- timers, counters (except statistics), etc.
- ---init
- initializes a newly created qdisc.
- ---destroy
- destroys the resources allocated by init and during the lifetime of the qdisc.
- ---change
- changes qdisc parameters.
- */
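/*
 * A minimal, pfifo-style sketch of the contract described above. It is
 * illustrative only (the "example_" names are not part of this file) and
 * assumes dev->tx_queue_len as the queue limit.
 */
static int example_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	if (likely(sch->q.qlen < sch->dev->tx_queue_len)) {
		__skb_queue_tail(&sch->q, skb);
		sch->qstats.backlog += skb->len;
		sch->bstats.bytes += skb->len;
		sch->bstats.packets++;
		return NET_XMIT_SUCCESS;
	}
	/* Queue is full: drop this packet and account for it. */
	sch->qstats.drops++;
	kfree_skb(skb);
	return NET_XMIT_DROP;
}

static struct sk_buff *example_dequeue(struct Qdisc *sch)
{
	struct sk_buff *skb = __skb_dequeue(&sch->q);

	/* NULL only means "nothing to send now"; emptiness is q.qlen == 0. */
	if (skb)
		sch->qstats.backlog -= skb->len;
	return skb;
}

static int example_requeue(struct sk_buff *skb, struct Qdisc *sch)
{
	__skb_queue_head(&sch->q, skb);
	sch->qstats.backlog += skb->len;
	sch->qstats.requeues++;
	return 0;
}

static struct Qdisc_ops example_qdisc_ops = {
	.id		= "example",
	.priv_size	= 0,
	.enqueue	= example_enqueue,
	.dequeue	= example_dequeue,
	.requeue	= example_requeue,
	.owner		= THIS_MODULE,
};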
- /* Protects list of registered TC modules. It is pure SMP lock. */
- static DEFINE_RWLOCK(qdisc_mod_lock);
- /************************************************
- * Queueing disciplines manipulation. *
- ************************************************/
- /* The list of all installed queueing disciplines. */
- static struct Qdisc_ops *qdisc_base;
- /* Register/unregister queueing discipline */
- int register_qdisc(struct Qdisc_ops *qops)
- {
- struct Qdisc_ops *q, **qp;
- int rc = -EEXIST;
- write_lock(&qdisc_mod_lock);
- for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
- if (!strcmp(qops->id, q->id))
- goto out;
- if (qops->enqueue == NULL)
- qops->enqueue = noop_qdisc_ops.enqueue;
- if (qops->requeue == NULL)
- qops->requeue = noop_qdisc_ops.requeue;
- if (qops->dequeue == NULL)
- qops->dequeue = noop_qdisc_ops.dequeue;
- qops->next = NULL;
- *qp = qops;
- rc = 0;
- out:
- write_unlock(&qdisc_mod_lock);
- return rc;
- }
- int unregister_qdisc(struct Qdisc_ops *qops)
- {
- struct Qdisc_ops *q, **qp;
- int err = -ENOENT;
- write_lock(&qdisc_mod_lock);
- for (qp = &qdisc_base; (q=*qp)!=NULL; qp = &q->next)
- if (q == qops)
- break;
- if (q) {
- *qp = q->next;
- q->next = NULL;
- err = 0;
- }
- write_unlock(&qdisc_mod_lock);
- return err;
- }
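/*
 * A hedged sketch of how a scheduler module would use register_qdisc()
 * and unregister_qdisc(); "example_qdisc_ops" refers to the illustrative
 * ops sketched earlier, and the module boilerplate is not part of this file.
 */
static int __init example_module_init(void)
{
	/* register_qdisc() returns -EEXIST if the id is already taken. */
	return register_qdisc(&example_qdisc_ops);
}

static void __exit example_module_exit(void)
{
	unregister_qdisc(&example_qdisc_ops);
}

module_init(example_module_init);
module_exit(example_module_exit);
MODULE_LICENSE("GPL");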
- /* We know the handle. Find the qdisc among all the qdiscs attached to the
- device (the root qdisc, all its children, children of children, etc.)
- */
- struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
- {
- struct Qdisc *q;
- read_lock_bh(&qdisc_tree_lock);
- list_for_each_entry(q, &dev->qdisc_list, list) {
- if (q->handle == handle) {
- read_unlock_bh(&qdisc_tree_lock);
- return q;
- }
- }
- read_unlock_bh(&qdisc_tree_lock);
- return NULL;
- }
- static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
- {
- unsigned long cl;
- struct Qdisc *leaf;
- struct Qdisc_class_ops *cops = p->ops->cl_ops;
- if (cops == NULL)
- return NULL;
- cl = cops->get(p, classid);
- if (cl == 0)
- return NULL;
- leaf = cops->leaf(p, cl);
- cops->put(p, cl);
- return leaf;
- }
- /* Find queueing discipline by name */
- static struct Qdisc_ops *qdisc_lookup_ops(struct rtattr *kind)
- {
- struct Qdisc_ops *q = NULL;
- if (kind) {
- read_lock(&qdisc_mod_lock);
- for (q = qdisc_base; q; q = q->next) {
- if (rtattr_strcmp(kind, q->id) == 0) {
- if (!try_module_get(q->owner))
- q = NULL;
- break;
- }
- }
- read_unlock(&qdisc_mod_lock);
- }
- return q;
- }
- static struct qdisc_rate_table *qdisc_rtab_list;
- struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct rtattr *tab)
- {
- struct qdisc_rate_table *rtab;
- for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
- if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {
- rtab->refcnt++;
- return rtab;
- }
- }
- if (tab == NULL || r->rate == 0 || r->cell_log == 0 || RTA_PAYLOAD(tab) != 1024)
- return NULL;
- rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
- if (rtab) {
- rtab->rate = *r;
- rtab->refcnt = 1;
- memcpy(rtab->data, RTA_DATA(tab), 1024);
- rtab->next = qdisc_rtab_list;
- qdisc_rtab_list = rtab;
- }
- return rtab;
- }
- void qdisc_put_rtab(struct qdisc_rate_table *tab)
- {
- struct qdisc_rate_table *rtab, **rtabp;
- if (!tab || --tab->refcnt)
- return;
- for (rtabp = &qdisc_rtab_list; (rtab=*rtabp) != NULL; rtabp = &rtab->next) {
- if (rtab == tab) {
- *rtabp = rtab->next;
- kfree(rtab);
- return;
- }
- }
- }
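/*
 * A sketch of the usual get/put pairing, modelled on how shaping qdiscs
 * (e.g. TBF/CBQ) consume a tc_ratespec plus a 1024-byte table from user
 * space. The attribute enum and struct tc_example_qopt are hypothetical
 * stand-ins, not real UAPI.
 */
enum {
	TCA_EXAMPLE_UNSPEC,
	TCA_EXAMPLE_PARMS,
	TCA_EXAMPLE_RTAB,
};

struct tc_example_qopt {
	struct tc_ratespec	rate;
	__u32			limit;
};

static struct qdisc_rate_table *example_parse_rtab(struct rtattr **tb)
{
	struct tc_example_qopt *qopt;

	if (tb[TCA_EXAMPLE_PARMS - 1] == NULL ||
	    RTA_PAYLOAD(tb[TCA_EXAMPLE_PARMS - 1]) < sizeof(*qopt))
		return NULL;

	qopt = RTA_DATA(tb[TCA_EXAMPLE_PARMS - 1]);
	/*
	 * qdisc_get_rtab() rejects the request unless the companion
	 * attribute carries the full 1024-byte transmission-time table.
	 * The reference taken here must later be dropped with
	 * qdisc_put_rtab() in the qdisc's ->destroy().
	 */
	return qdisc_get_rtab(&qopt->rate, tb[TCA_EXAMPLE_RTAB - 1]);
}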
- /* Allocate a unique handle from the space managed by the kernel */
- static u32 qdisc_alloc_handle(struct net_device *dev)
- {
- int i = 0x10000;
- static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
- do {
- autohandle += TC_H_MAKE(0x10000U, 0);
- if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
- autohandle = TC_H_MAKE(0x80000000U, 0);
- } while (qdisc_lookup(dev, autohandle) && --i > 0);
- return i>0 ? autohandle : 0;
- }
- /* Attach toplevel qdisc to device dev */
- static struct Qdisc *
- dev_graft_qdisc(struct net_device *dev, struct Qdisc *qdisc)
- {
- struct Qdisc *oqdisc;
- if (dev->flags & IFF_UP)
- dev_deactivate(dev);
- qdisc_lock_tree(dev);
- if (qdisc && qdisc->flags&TCQ_F_INGRESS) {
- oqdisc = dev->qdisc_ingress;
- /* Prune old scheduler */
- if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1) {
- /* delete */
- qdisc_reset(oqdisc);
- dev->qdisc_ingress = NULL;
- } else { /* new */
- dev->qdisc_ingress = qdisc;
- }
- } else {
- oqdisc = dev->qdisc_sleeping;
- /* Prune old scheduler */
- if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1)
- qdisc_reset(oqdisc);
- /* ... and graft new one */
- if (qdisc == NULL)
- qdisc = &noop_qdisc;
- dev->qdisc_sleeping = qdisc;
- dev->qdisc = &noop_qdisc;
- }
- qdisc_unlock_tree(dev);
- if (dev->flags & IFF_UP)
- dev_activate(dev);
- return oqdisc;
- }
- /* Graft qdisc "new" to class "classid" of qdisc "parent" or
- to device "dev".
- The old qdisc is not destroyed but is returned in *old.
- */
- static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
- u32 classid,
- struct Qdisc *new, struct Qdisc **old)
- {
- int err = 0;
- struct Qdisc *q = *old;
- if (parent == NULL) {
- if (q && q->flags&TCQ_F_INGRESS) {
- *old = dev_graft_qdisc(dev, q);
- } else {
- *old = dev_graft_qdisc(dev, new);
- }
- } else {
- struct Qdisc_class_ops *cops = parent->ops->cl_ops;
- err = -EINVAL;
- if (cops) {
- unsigned long cl = cops->get(parent, classid);
- if (cl) {
- err = cops->graft(parent, cl, new, old);
- if (new)
- new->parent = classid;
- cops->put(parent, cl);
- }
- }
- }
- return err;
- }
- /*
- Allocate and initialize a new qdisc.
- Parameters are passed via the tca attribute array.
- */
- static struct Qdisc *
- qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp)
- {
- int err;
- struct rtattr *kind = tca[TCA_KIND-1];
- struct Qdisc *sch;
- struct Qdisc_ops *ops;
- ops = qdisc_lookup_ops(kind);
- #ifdef CONFIG_KMOD
- if (ops == NULL && kind != NULL) {
- char name[IFNAMSIZ];
- if (rtattr_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
- /* We dropped the RTNL semaphore in order to
- * perform the module load. So, even if we
- * succeeded in loading the module we have to
- * tell the caller to replay the request. We
- * indicate this using -EAGAIN.
- * We replay the request because the device may
- * go away in the mean time.
- */
- rtnl_unlock();
- request_module("sch_%s", name);
- rtnl_lock();
- ops = qdisc_lookup_ops(kind);
- if (ops != NULL) {
- /* The replayed request will call qdisc_lookup_ops
- * again, so don't keep a reference here.
- */
- module_put(ops->owner);
- err = -EAGAIN;
- goto err_out;
- }
- }
- }
- #endif
- err = -EINVAL;
- if (ops == NULL)
- goto err_out;
- sch = qdisc_alloc(dev, ops);
- if (IS_ERR(sch)) {
- err = PTR_ERR(sch);
- goto err_out2;
- }
- if (handle == TC_H_INGRESS) {
- sch->flags |= TCQ_F_INGRESS;
- handle = TC_H_MAKE(TC_H_INGRESS, 0);
- } else if (handle == 0) {
- handle = qdisc_alloc_handle(dev);
- err = -ENOMEM;
- if (handle == 0)
- goto err_out3;
- }
- sch->handle = handle;
- if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS-1])) == 0) {
- #ifdef CONFIG_NET_ESTIMATOR
- if (tca[TCA_RATE-1]) {
- err = gen_new_estimator(&sch->bstats, &sch->rate_est,
- sch->stats_lock,
- tca[TCA_RATE-1]);
- if (err) {
- /*
- * Any broken qdiscs that would require
- * an ops->reset() here? The qdisc was never
- * in action, so it shouldn't be necessary.
- */
- if (ops->destroy)
- ops->destroy(sch);
- goto err_out3;
- }
- }
- #endif
- qdisc_lock_tree(dev);
- list_add_tail(&sch->list, &dev->qdisc_list);
- qdisc_unlock_tree(dev);
- return sch;
- }
- err_out3:
- dev_put(dev);
- kfree((char *) sch - sch->padded);
- err_out2:
- module_put(ops->owner);
- err_out:
- *errp = err;
- return NULL;
- }
- static int qdisc_change(struct Qdisc *sch, struct rtattr **tca)
- {
- if (tca[TCA_OPTIONS-1]) {
- int err;
- if (sch->ops->change == NULL)
- return -EINVAL;
- err = sch->ops->change(sch, tca[TCA_OPTIONS-1]);
- if (err)
- return err;
- }
- #ifdef CONFIG_NET_ESTIMATOR
- if (tca[TCA_RATE-1])
- gen_replace_estimator(&sch->bstats, &sch->rate_est,
- sch->stats_lock, tca[TCA_RATE-1]);
- #endif
- return 0;
- }
- struct check_loop_arg
- {
- struct qdisc_walker w;
- struct Qdisc *p;
- int depth;
- };
- static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);
- static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
- {
- struct check_loop_arg arg;
- if (q->ops->cl_ops == NULL)
- return 0;
- arg.w.stop = arg.w.skip = arg.w.count = 0;
- arg.w.fn = check_loop_fn;
- arg.depth = depth;
- arg.p = p;
- q->ops->cl_ops->walk(q, &arg.w);
- return arg.w.stop ? -ELOOP : 0;
- }
- static int
- check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
- {
- struct Qdisc *leaf;
- struct Qdisc_class_ops *cops = q->ops->cl_ops;
- struct check_loop_arg *arg = (struct check_loop_arg *)w;
- leaf = cops->leaf(q, cl);
- if (leaf) {
- if (leaf == arg->p || arg->depth > 7)
- return -ELOOP;
- return check_loop(leaf, arg->p, arg->depth + 1);
- }
- return 0;
- }
- /*
- * Delete/get qdisc.
- */
- static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
- {
- struct tcmsg *tcm = NLMSG_DATA(n);
- struct rtattr **tca = arg;
- struct net_device *dev;
- u32 clid = tcm->tcm_parent;
- struct Qdisc *q = NULL;
- struct Qdisc *p = NULL;
- int err;
- if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
- return -ENODEV;
- if (clid) {
- if (clid != TC_H_ROOT) {
- if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
- if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
- return -ENOENT;
- q = qdisc_leaf(p, clid);
- } else { /* ingress */
- q = dev->qdisc_ingress;
- }
- } else {
- q = dev->qdisc_sleeping;
- }
- if (!q)
- return -ENOENT;
- if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
- return -EINVAL;
- } else {
- if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
- return -ENOENT;
- }
- if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
- return -EINVAL;
- if (n->nlmsg_type == RTM_DELQDISC) {
- if (!clid)
- return -EINVAL;
- if (q->handle == 0)
- return -ENOENT;
- if ((err = qdisc_graft(dev, p, clid, NULL, &q)) != 0)
- return err;
- if (q) {
- qdisc_notify(skb, n, clid, q, NULL);
- spin_lock_bh(&dev->queue_lock);
- qdisc_destroy(q);
- spin_unlock_bh(&dev->queue_lock);
- }
- } else {
- qdisc_notify(skb, n, clid, NULL, q);
- }
- return 0;
- }
- /*
- Create/change qdisc.
- */
- static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
- {
- struct tcmsg *tcm;
- struct rtattr **tca;
- struct net_device *dev;
- u32 clid;
- struct Qdisc *q, *p;
- int err;
- replay:
- /* Reinit, just in case something touches this. */
- tcm = NLMSG_DATA(n);
- tca = arg;
- clid = tcm->tcm_parent;
- q = p = NULL;
- if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
- return -ENODEV;
- if (clid) {
- if (clid != TC_H_ROOT) {
- if (clid != TC_H_INGRESS) {
- if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
- return -ENOENT;
- q = qdisc_leaf(p, clid);
- } else { /* ingress */
- q = dev->qdisc_ingress;
- }
- } else {
- q = dev->qdisc_sleeping;
- }
- /* It may be the default qdisc; ignore it */
- if (q && q->handle == 0)
- q = NULL;
- if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
- if (tcm->tcm_handle) {
- if (q && !(n->nlmsg_flags&NLM_F_REPLACE))
- return -EEXIST;
- if (TC_H_MIN(tcm->tcm_handle))
- return -EINVAL;
- if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
- goto create_n_graft;
- if (n->nlmsg_flags&NLM_F_EXCL)
- return -EEXIST;
- if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
- return -EINVAL;
- if (q == p ||
- (p && check_loop(q, p, 0)))
- return -ELOOP;
- atomic_inc(&q->refcnt);
- goto graft;
- } else {
- if (q == NULL)
- goto create_n_graft;
- /* This magic test requires explanation.
- *
- * We know that some child qdisc is already
- * attached to this parent and we have a choice:
- * either to change it or to create/graft a new one.
- *
- * 1. We are allowed to create/graft only
- * if the CREATE and REPLACE flags are set.
- *
- * 2. If EXCL is set, the requestor meant that a qdisc
- * with handle tcm_handle is not expected to exist,
- * so we choose create/graft too.
- *
- * 3. The last case is when no flags are set.
- * Alas, this is a sort of hole in the API; we
- * cannot decide what to do unambiguously.
- * For now we select create/graft if the
- * user supplied a KIND that does not match the existing one.
- */
- if ((n->nlmsg_flags&NLM_F_CREATE) &&
- (n->nlmsg_flags&NLM_F_REPLACE) &&
- ((n->nlmsg_flags&NLM_F_EXCL) ||
- (tca[TCA_KIND-1] &&
- rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))))
- goto create_n_graft;
- }
- }
- } else {
- if (!tcm->tcm_handle)
- return -EINVAL;
- q = qdisc_lookup(dev, tcm->tcm_handle);
- }
- /* Change qdisc parameters */
- if (q == NULL)
- return -ENOENT;
- if (n->nlmsg_flags&NLM_F_EXCL)
- return -EEXIST;
- if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
- return -EINVAL;
- err = qdisc_change(q, tca);
- if (err == 0)
- qdisc_notify(skb, n, clid, NULL, q);
- return err;
- create_n_graft:
- if (!(n->nlmsg_flags&NLM_F_CREATE))
- return -ENOENT;
- if (clid == TC_H_INGRESS)
- q = qdisc_create(dev, tcm->tcm_parent, tca, &err);
- else
- q = qdisc_create(dev, tcm->tcm_handle, tca, &err);
- if (q == NULL) {
- if (err == -EAGAIN)
- goto replay;
- return err;
- }
- graft:
- if (1) {
- struct Qdisc *old_q = NULL;
- err = qdisc_graft(dev, p, clid, q, &old_q);
- if (err) {
- if (q) {
- spin_lock_bh(&dev->queue_lock);
- qdisc_destroy(q);
- spin_unlock_bh(&dev->queue_lock);
- }
- return err;
- }
- qdisc_notify(skb, n, clid, old_q, q);
- if (old_q) {
- spin_lock_bh(&dev->queue_lock);
- qdisc_destroy(old_q);
- spin_unlock_bh(&dev->queue_lock);
- }
- }
- return 0;
- }
- static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
- u32 pid, u32 seq, u16 flags, int event)
- {
- struct tcmsg *tcm;
- struct nlmsghdr *nlh;
- unsigned char *b = skb->tail;
- struct gnet_dump d;
- nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
- tcm = NLMSG_DATA(nlh);
- tcm->tcm_family = AF_UNSPEC;
- tcm->tcm__pad1 = 0;
- tcm->tcm__pad2 = 0;
- tcm->tcm_ifindex = q->dev->ifindex;
- tcm->tcm_parent = clid;
- tcm->tcm_handle = q->handle;
- tcm->tcm_info = atomic_read(&q->refcnt);
- RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id);
- if (q->ops->dump && q->ops->dump(q, skb) < 0)
- goto rtattr_failure;
- q->qstats.qlen = q->q.qlen;
- if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS,
- TCA_XSTATS, q->stats_lock, &d) < 0)
- goto rtattr_failure;
- if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
- goto rtattr_failure;
- if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
- #ifdef CONFIG_NET_ESTIMATOR
- gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
- #endif
- gnet_stats_copy_queue(&d, &q->qstats) < 0)
- goto rtattr_failure;
-
- if (gnet_stats_finish_copy(&d) < 0)
- goto rtattr_failure;
-
- nlh->nlmsg_len = skb->tail - b;
- return skb->len;
- nlmsg_failure:
- rtattr_failure:
- skb_trim(skb, b - skb->data);
- return -1;
- }
- static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n,
- u32 clid, struct Qdisc *old, struct Qdisc *new)
- {
- struct sk_buff *skb;
- u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
- skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
- if (!skb)
- return -ENOBUFS;
- if (old && old->handle) {
- if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0)
- goto err_out;
- }
- if (new) {
- if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
- goto err_out;
- }
- if (skb->len)
- return rtnetlink_send(skb, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
- err_out:
- kfree_skb(skb);
- return -EINVAL;
- }
- static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
- {
- int idx, q_idx;
- int s_idx, s_q_idx;
- struct net_device *dev;
- struct Qdisc *q;
- s_idx = cb->args[0];
- s_q_idx = q_idx = cb->args[1];
- read_lock(&dev_base_lock);
- for (dev=dev_base, idx=0; dev; dev = dev->next, idx++) {
- if (idx < s_idx)
- continue;
- if (idx > s_idx)
- s_q_idx = 0;
- read_lock_bh(&qdisc_tree_lock);
- q_idx = 0;
- list_for_each_entry(q, &dev->qdisc_list, list) {
- if (q_idx < s_q_idx) {
- q_idx++;
- continue;
- }
- if (tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
- cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0) {
- read_unlock_bh(&qdisc_tree_lock);
- goto done;
- }
- q_idx++;
- }
- read_unlock_bh(&qdisc_tree_lock);
- }
- done:
- read_unlock(&dev_base_lock);
- cb->args[0] = idx;
- cb->args[1] = q_idx;
- return skb->len;
- }
- /************************************************
- * Traffic classes manipulation. *
- ************************************************/
- static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
- {
- struct tcmsg *tcm = NLMSG_DATA(n);
- struct rtattr **tca = arg;
- struct net_device *dev;
- struct Qdisc *q = NULL;
- struct Qdisc_class_ops *cops;
- unsigned long cl = 0;
- unsigned long new_cl;
- u32 pid = tcm->tcm_parent;
- u32 clid = tcm->tcm_handle;
- u32 qid = TC_H_MAJ(clid);
- int err;
- if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
- return -ENODEV;
- /*
- parent == TC_H_UNSPEC - unspecified parent.
- parent == TC_H_ROOT - class is root, which has no parent.
- parent == X:0 - parent is root class.
- parent == X:Y - parent is a node in hierarchy.
- parent == 0:Y - parent is X:Y, where X:0 is qdisc.
- handle == 0:0 - generate handle from kernel pool.
- handle == 0:Y - class is X:Y, where X:0 is qdisc.
- handle == X:Y - the class is exactly X:Y.
- handle == X:0 - root class.
- */
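/*
 * Worked example (values chosen for illustration): a request with
 * tcm_parent == 0x00010000 ("1:0", i.e. the qdisc itself) and
 * tcm_handle == 0x00010010 ("1:10") gives
 * qid == TC_H_MAJ(tcm_handle) == 0x00010000 and
 * TC_H_MIN(tcm_handle) == 0x10, so the class being addressed is 1:10
 * under qdisc 1:.
 */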
- /* Step 1. Determine qdisc handle X:0 */
- if (pid != TC_H_ROOT) {
- u32 qid1 = TC_H_MAJ(pid);
- if (qid && qid1) {
- /* If both majors are known, they must be identical. */
- if (qid != qid1)
- return -EINVAL;
- } else if (qid1) {
- qid = qid1;
- } else if (qid == 0)
- qid = dev->qdisc_sleeping->handle;
- /* Now qid is a genuine qdisc handle consistent with
- both parent and child.
- TC_H_MAJ(pid) may still be unspecified; complete it now.
- */
- if (pid)
- pid = TC_H_MAKE(qid, pid);
- } else {
- if (qid == 0)
- qid = dev->qdisc_sleeping->handle;
- }
- /* OK. Locate qdisc */
- if ((q = qdisc_lookup(dev, qid)) == NULL)
- return -ENOENT;
- /* And check that it supports classes */
- cops = q->ops->cl_ops;
- if (cops == NULL)
- return -EINVAL;
- /* Now try to get class */
- if (clid == 0) {
- if (pid == TC_H_ROOT)
- clid = qid;
- } else
- clid = TC_H_MAKE(qid, clid);
- if (clid)
- cl = cops->get(q, clid);
- if (cl == 0) {
- err = -ENOENT;
- if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags&NLM_F_CREATE))
- goto out;
- } else {
- switch (n->nlmsg_type) {
- case RTM_NEWTCLASS:
- err = -EEXIST;
- if (n->nlmsg_flags&NLM_F_EXCL)
- goto out;
- break;
- case RTM_DELTCLASS:
- err = cops->delete(q, cl);
- if (err == 0)
- tclass_notify(skb, n, q, cl, RTM_DELTCLASS);
- goto out;
- case RTM_GETTCLASS:
- err = tclass_notify(skb, n, q, cl, RTM_NEWTCLASS);
- goto out;
- default:
- err = -EINVAL;
- goto out;
- }
- }
- new_cl = cl;
- err = cops->change(q, clid, pid, tca, &new_cl);
- if (err == 0)
- tclass_notify(skb, n, q, new_cl, RTM_NEWTCLASS);
- out:
- if (cl)
- cops->put(q, cl);
- return err;
- }
- static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
- unsigned long cl,
- u32 pid, u32 seq, u16 flags, int event)
- {
- struct tcmsg *tcm;
- struct nlmsghdr *nlh;
- unsigned char *b = skb->tail;
- struct gnet_dump d;
- struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
- nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
- tcm = NLMSG_DATA(nlh);
- tcm->tcm_family = AF_UNSPEC;
- tcm->tcm_ifindex = q->dev->ifindex;
- tcm->tcm_parent = q->handle;
- tcm->tcm_handle = q->handle;
- tcm->tcm_info = 0;
- RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id);
- if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
- goto rtattr_failure;
- if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS,
- TCA_XSTATS, q->stats_lock, &d) < 0)
- goto rtattr_failure;
- if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
- goto rtattr_failure;
- if (gnet_stats_finish_copy(&d) < 0)
- goto rtattr_failure;
- nlh->nlmsg_len = skb->tail - b;
- return skb->len;
- nlmsg_failure:
- rtattr_failure:
- skb_trim(skb, b - skb->data);
- return -1;
- }
- static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
- struct Qdisc *q, unsigned long cl, int event)
- {
- struct sk_buff *skb;
- u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
- skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
- if (!skb)
- return -ENOBUFS;
- if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) {
- kfree_skb(skb);
- return -EINVAL;
- }
- return rtnetlink_send(skb, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
- }
- struct qdisc_dump_args
- {
- struct qdisc_walker w;
- struct sk_buff *skb;
- struct netlink_callback *cb;
- };
- static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
- {
- struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
- return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid,
- a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
- }
- static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
- {
- int t;
- int s_t;
- struct net_device *dev;
- struct Qdisc *q;
- struct tcmsg *tcm = (struct tcmsg*)NLMSG_DATA(cb->nlh);
- struct qdisc_dump_args arg;
- if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
- return 0;
- if ((dev = dev_get_by_index(tcm->tcm_ifindex)) == NULL)
- return 0;
- s_t = cb->args[0];
- t = 0;
- read_lock_bh(&qdisc_tree_lock);
- list_for_each_entry(q, &dev->qdisc_list, list) {
- if (t < s_t || !q->ops->cl_ops ||
- (tcm->tcm_parent &&
- TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
- t++;
- continue;
- }
- if (t > s_t)
- memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
- arg.w.fn = qdisc_class_dump;
- arg.skb = skb;
- arg.cb = cb;
- arg.w.stop = 0;
- arg.w.skip = cb->args[1];
- arg.w.count = 0;
- q->ops->cl_ops->walk(q, &arg.w);
- cb->args[1] = arg.w.count;
- if (arg.w.stop)
- break;
- t++;
- }
- read_unlock_bh(&qdisc_tree_lock);
- cb->args[0] = t;
- dev_put(dev);
- return skb->len;
- }
- /* Main classifier routine: scans the classifier chain attached
- to this qdisc, (optionally) tests for the protocol and asks the
- specific classifiers.
- */
- int tc_classify(struct sk_buff *skb, struct tcf_proto *tp,
- struct tcf_result *res)
- {
- int err = 0;
- u32 protocol = skb->protocol;
- #ifdef CONFIG_NET_CLS_ACT
- struct tcf_proto *otp = tp;
- reclassify:
- #endif
- protocol = skb->protocol;
- for ( ; tp; tp = tp->next) {
- if ((tp->protocol == protocol ||
- tp->protocol == __constant_htons(ETH_P_ALL)) &&
- (err = tp->classify(skb, tp, res)) >= 0) {
- #ifdef CONFIG_NET_CLS_ACT
- if (TC_ACT_RECLASSIFY == err) {
- __u32 verd = (__u32) G_TC_VERD(skb->tc_verd);
- tp = otp;
- if (MAX_REC_LOOP < verd++) {
- printk("rule prio %d protocol %02x reclassify is buggy packet dropped\n",
- tp->prio&0xffff, ntohs(tp->protocol));
- return TC_ACT_SHOT;
- }
- skb->tc_verd = SET_TC_VERD(skb->tc_verd,verd);
- goto reclassify;
- } else {
- if (skb->tc_verd)
- skb->tc_verd = SET_TC_VERD(skb->tc_verd,0);
- return err;
- }
- #else
- return err;
- #endif
- }
- }
- return -1;
- }
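/*
 * A hedged sketch (modelled on prio-style schedulers, with assumed
 * "example_" names) of how a classful qdisc's ->enqueue maps the
 * tc_classify() verdict to one of its child qdiscs.
 */
static struct Qdisc *example_pick_child(struct sk_buff *skb,
					struct tcf_proto *filter_list,
					struct Qdisc **children,
					unsigned int nchildren)
{
	struct tcf_result res;
	int verdict = tc_classify(skb, filter_list, &res);

	if (verdict < 0)
		return children[0];	/* no filter matched: default child */
#ifdef CONFIG_NET_CLS_ACT
	switch (verdict) {
	case TC_ACT_SHOT:
	case TC_ACT_STOLEN:
	case TC_ACT_QUEUED:
		return NULL;		/* caller drops or forgets the skb */
	}
#endif
	/* Filters return a classid; use its minor number as the child index. */
	if (TC_H_MIN(res.classid) >= 1 && TC_H_MIN(res.classid) <= nchildren)
		return children[TC_H_MIN(res.classid) - 1];
	return children[0];
}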
- static int psched_us_per_tick = 1;
- static int psched_tick_per_us = 1;
- #ifdef CONFIG_PROC_FS
- static int psched_show(struct seq_file *seq, void *v)
- {
- seq_printf(seq, "%08x %08x %08x %08x\n",
- psched_tick_per_us, psched_us_per_tick,
- 1000000, HZ);
- return 0;
- }
- static int psched_open(struct inode *inode, struct file *file)
- {
- return single_open(file, psched_show, PDE(inode)->data);
- }
- static struct file_operations psched_fops = {
- .owner = THIS_MODULE,
- .open = psched_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
- };
- #endif
- #ifdef CONFIG_NET_SCH_CLK_CPU
- psched_tdiff_t psched_clock_per_hz;
- int psched_clock_scale;
- EXPORT_SYMBOL(psched_clock_per_hz);
- EXPORT_SYMBOL(psched_clock_scale);
- psched_time_t psched_time_base;
- cycles_t psched_time_mark;
- EXPORT_SYMBOL(psched_time_mark);
- EXPORT_SYMBOL(psched_time_base);
- /*
- * Periodically adjust psched_time_base to avoid overflow
- * with 32-bit get_cycles(). Safe up to 4GHz CPU.
- */
- static void psched_tick(unsigned long);
- static DEFINE_TIMER(psched_timer, psched_tick, 0, 0);
- static void psched_tick(unsigned long dummy)
- {
- if (sizeof(cycles_t) == sizeof(u32)) {
- psched_time_t dummy_stamp;
- PSCHED_GET_TIME(dummy_stamp);
- psched_timer.expires = jiffies + 1*HZ;
- add_timer(&psched_timer);
- }
- }
- int __init psched_calibrate_clock(void)
- {
- psched_time_t stamp, stamp1;
- struct timeval tv, tv1;
- psched_tdiff_t delay;
- long rdelay;
- unsigned long stop;
- psched_tick(0);
- stop = jiffies + HZ/10;
- PSCHED_GET_TIME(stamp);
- do_gettimeofday(&tv);
- while (time_before(jiffies, stop)) {
- barrier();
- cpu_relax();
- }
- PSCHED_GET_TIME(stamp1);
- do_gettimeofday(&tv1);
- delay = PSCHED_TDIFF(stamp1, stamp);
- rdelay = tv1.tv_usec - tv.tv_usec;
- rdelay += (tv1.tv_sec - tv.tv_sec)*1000000;
- if (rdelay > delay)
- return -1;
- delay /= rdelay;
- psched_tick_per_us = delay;
- while ((delay>>=1) != 0)
- psched_clock_scale++;
- psched_us_per_tick = 1<<psched_clock_scale;
- psched_clock_per_hz = (psched_tick_per_us*(1000000/HZ))>>psched_clock_scale;
- return 0;
- }
- #endif
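/*
 * A small illustrative helper (with an assumed name, not part of this
 * file) showing how qdisc code measures an interval with whichever
 * clock source was configured above.
 */
static inline psched_tdiff_t example_elapsed(psched_time_t since)
{
	psched_time_t now;

	PSCHED_GET_TIME(now);
	return PSCHED_TDIFF(now, since);
}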
- static int __init pktsched_init(void)
- {
- struct rtnetlink_link *link_p;
- #ifdef CONFIG_NET_SCH_CLK_CPU
- if (psched_calibrate_clock() < 0)
- return -1;
- #elif defined(CONFIG_NET_SCH_CLK_JIFFIES)
- psched_tick_per_us = HZ<<PSCHED_JSCALE;
- psched_us_per_tick = 1000000;
- #endif
- link_p = rtnetlink_links[PF_UNSPEC];
- /* Set up the rtnetlink links. This is done here to avoid
- exporting a large number of public symbols.
- */
- if (link_p) {
- link_p[RTM_NEWQDISC-RTM_BASE].doit = tc_modify_qdisc;
- link_p[RTM_DELQDISC-RTM_BASE].doit = tc_get_qdisc;
- link_p[RTM_GETQDISC-RTM_BASE].doit = tc_get_qdisc;
- link_p[RTM_GETQDISC-RTM_BASE].dumpit = tc_dump_qdisc;
- link_p[RTM_NEWTCLASS-RTM_BASE].doit = tc_ctl_tclass;
- link_p[RTM_DELTCLASS-RTM_BASE].doit = tc_ctl_tclass;
- link_p[RTM_GETTCLASS-RTM_BASE].doit = tc_ctl_tclass;
- link_p[RTM_GETTCLASS-RTM_BASE].dumpit = tc_dump_tclass;
- }
- register_qdisc(&pfifo_qdisc_ops);
- register_qdisc(&bfifo_qdisc_ops);
- proc_net_fops_create("psched", 0, &psched_fops);
- return 0;
- }
- subsys_initcall(pktsched_init);
- EXPORT_SYMBOL(qdisc_lookup);
- EXPORT_SYMBOL(qdisc_get_rtab);
- EXPORT_SYMBOL(qdisc_put_rtab);
- EXPORT_SYMBOL(register_qdisc);
- EXPORT_SYMBOL(unregister_qdisc);
- EXPORT_SYMBOL(tc_classify);
|