]> git.proxmox.com Git - mirror_ubuntu-artful-kernel.git/blame - net/sched/sch_api.c
[NETEM]: avoid excessive requeues
[mirror_ubuntu-artful-kernel.git] / net / sched / sch_api.c
CommitLineData
1da177e4
LT
1/*
2 * net/sched/sch_api.c Packet scheduler API.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10 *
11 * Fixes:
12 *
13 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
14 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
15 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
16 */
17
1da177e4
LT
18#include <linux/module.h>
19#include <linux/types.h>
20#include <linux/kernel.h>
1da177e4
LT
21#include <linux/string.h>
22#include <linux/mm.h>
23#include <linux/socket.h>
24#include <linux/sockios.h>
25#include <linux/in.h>
26#include <linux/errno.h>
27#include <linux/interrupt.h>
28#include <linux/netdevice.h>
29#include <linux/skbuff.h>
1da177e4
LT
30#include <linux/init.h>
31#include <linux/proc_fs.h>
32#include <linux/seq_file.h>
33#include <linux/kmod.h>
34#include <linux/list.h>
35#include <linux/bitops.h>
4179477f 36#include <linux/hrtimer.h>
1da177e4 37
dc5fc579 38#include <net/netlink.h>
1da177e4
LT
39#include <net/sock.h>
40#include <net/pkt_sched.h>
41
42#include <asm/processor.h>
43#include <asm/uaccess.h>
44#include <asm/system.h>
45
46static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, u32 clid,
47 struct Qdisc *old, struct Qdisc *new);
48static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
49 struct Qdisc *q, unsigned long cl, int event);
50
51/*
52
53 Short review.
54 -------------
55
56 This file consists of two interrelated parts:
57
58 1. queueing disciplines manager frontend.
59 2. traffic classes manager frontend.
60
61 Generally, queueing discipline ("qdisc") is a black box,
62 which is able to enqueue packets and to dequeue them (when
63 device is ready to send something) in order and at times
64 determined by algorithm hidden in it.
65
66   qdiscs are divided into two categories:
67 - "queues", which have no internal structure visible from outside.
68 - "schedulers", which split all the packets to "traffic classes",
69 using "packet classifiers" (look at cls_api.c)
70
71 In turn, classes may have child qdiscs (as rule, queues)
72 attached to them etc. etc. etc.
73
74 The goal of the routines in this file is to translate
75 information supplied by user in the form of handles
76 to more intelligible for kernel form, to make some sanity
77 checks and part of work, which is common to all qdiscs
78 and to provide rtnetlink notifications.
79
80 All real intelligent work is done inside qdisc modules.
81
82
83
84 Every discipline has two major routines: enqueue and dequeue.
85
86 ---dequeue
87
88 dequeue usually returns a skb to send. It is allowed to return NULL,
89 but it does not mean that queue is empty, it just means that
90 discipline does not want to send anything this time.
91 Queue is really empty if q->q.qlen == 0.
92 For complicated disciplines with multiple queues q->q is not
93 real packet queue, but however q->q.qlen must be valid.
94
95 ---enqueue
96
97 enqueue returns 0, if packet was enqueued successfully.
98 If packet (this one or another one) was dropped, it returns
99 not zero error code.
100 NET_XMIT_DROP - this packet dropped
101 Expected action: do not backoff, but wait until queue will clear.
102 NET_XMIT_CN - probably this packet enqueued, but another one dropped.
103 Expected action: backoff or ignore
104 NET_XMIT_POLICED - dropped by police.
105 Expected action: backoff or error to real-time apps.
106
107 Auxiliary routines:
108
109 ---requeue
110
111 requeues once dequeued packet. It is used for non-standard or
112 just buggy devices, which can defer output even if dev->tbusy=0.
113
114 ---reset
115
116 returns qdisc to initial state: purge all buffers, clear all
117 timers, counters (except for statistics) etc.
118
119 ---init
120
121 initializes newly created qdisc.
122
123 ---destroy
124
125 destroys resources allocated by init and during lifetime of qdisc.
126
127 ---change
128
129 changes qdisc parameters.
130 */
131
132/* Protects list of registered TC modules. It is pure SMP lock. */
133static DEFINE_RWLOCK(qdisc_mod_lock);
134
135
136/************************************************
137 * Queueing disciplines manipulation. *
138 ************************************************/
139
140
141/* The list of all installed queueing disciplines. */
142
143static struct Qdisc_ops *qdisc_base;
144
145/* Register/uregister queueing discipline */
146
147int register_qdisc(struct Qdisc_ops *qops)
148{
149 struct Qdisc_ops *q, **qp;
150 int rc = -EEXIST;
151
152 write_lock(&qdisc_mod_lock);
153 for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
154 if (!strcmp(qops->id, q->id))
155 goto out;
156
157 if (qops->enqueue == NULL)
158 qops->enqueue = noop_qdisc_ops.enqueue;
159 if (qops->requeue == NULL)
160 qops->requeue = noop_qdisc_ops.requeue;
161 if (qops->dequeue == NULL)
162 qops->dequeue = noop_qdisc_ops.dequeue;
163
164 qops->next = NULL;
165 *qp = qops;
166 rc = 0;
167out:
168 write_unlock(&qdisc_mod_lock);
169 return rc;
170}
171
172int unregister_qdisc(struct Qdisc_ops *qops)
173{
174 struct Qdisc_ops *q, **qp;
175 int err = -ENOENT;
176
177 write_lock(&qdisc_mod_lock);
178 for (qp = &qdisc_base; (q=*qp)!=NULL; qp = &q->next)
179 if (q == qops)
180 break;
181 if (q) {
182 *qp = q->next;
183 q->next = NULL;
184 err = 0;
185 }
186 write_unlock(&qdisc_mod_lock);
187 return err;
188}
189
190/* We know handle. Find qdisc among all qdisc's attached to device
191 (root qdisc, all its children, children of children etc.)
192 */
193
43effa1e 194static struct Qdisc *__qdisc_lookup(struct net_device *dev, u32 handle)
1da177e4
LT
195{
196 struct Qdisc *q;
197
1da177e4 198 list_for_each_entry(q, &dev->qdisc_list, list) {
43effa1e 199 if (q->handle == handle)
1da177e4 200 return q;
1da177e4 201 }
1da177e4
LT
202 return NULL;
203}
204
43effa1e
PM
205struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
206{
207 struct Qdisc *q;
208
209 read_lock(&qdisc_tree_lock);
210 q = __qdisc_lookup(dev, handle);
211 read_unlock(&qdisc_tree_lock);
212 return q;
213}
214
1da177e4
LT
215static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
216{
217 unsigned long cl;
218 struct Qdisc *leaf;
219 struct Qdisc_class_ops *cops = p->ops->cl_ops;
220
221 if (cops == NULL)
222 return NULL;
223 cl = cops->get(p, classid);
224
225 if (cl == 0)
226 return NULL;
227 leaf = cops->leaf(p, cl);
228 cops->put(p, cl);
229 return leaf;
230}
231
232/* Find queueing discipline by name */
233
234static struct Qdisc_ops *qdisc_lookup_ops(struct rtattr *kind)
235{
236 struct Qdisc_ops *q = NULL;
237
238 if (kind) {
239 read_lock(&qdisc_mod_lock);
240 for (q = qdisc_base; q; q = q->next) {
241 if (rtattr_strcmp(kind, q->id) == 0) {
242 if (!try_module_get(q->owner))
243 q = NULL;
244 break;
245 }
246 }
247 read_unlock(&qdisc_mod_lock);
248 }
249 return q;
250}
251
252static struct qdisc_rate_table *qdisc_rtab_list;
253
254struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct rtattr *tab)
255{
256 struct qdisc_rate_table *rtab;
257
258 for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
259 if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {
260 rtab->refcnt++;
261 return rtab;
262 }
263 }
264
265 if (tab == NULL || r->rate == 0 || r->cell_log == 0 || RTA_PAYLOAD(tab) != 1024)
266 return NULL;
267
268 rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
269 if (rtab) {
270 rtab->rate = *r;
271 rtab->refcnt = 1;
272 memcpy(rtab->data, RTA_DATA(tab), 1024);
273 rtab->next = qdisc_rtab_list;
274 qdisc_rtab_list = rtab;
275 }
276 return rtab;
277}
278
279void qdisc_put_rtab(struct qdisc_rate_table *tab)
280{
281 struct qdisc_rate_table *rtab, **rtabp;
282
283 if (!tab || --tab->refcnt)
284 return;
285
286 for (rtabp = &qdisc_rtab_list; (rtab=*rtabp) != NULL; rtabp = &rtab->next) {
287 if (rtab == tab) {
288 *rtabp = rtab->next;
289 kfree(rtab);
290 return;
291 }
292 }
293}
294
4179477f
PM
295static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
296{
297 struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
298 timer);
299
300 wd->qdisc->flags &= ~TCQ_F_THROTTLED;
11274e5a 301 smp_wmb();
4179477f
PM
302 netif_schedule(wd->qdisc->dev);
303 return HRTIMER_NORESTART;
304}
305
306void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
307{
308 hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
309 wd->timer.function = qdisc_watchdog;
310 wd->qdisc = qdisc;
311}
312EXPORT_SYMBOL(qdisc_watchdog_init);
313
314void qdisc_watchdog_schedule(struct qdisc_watchdog *wd, psched_time_t expires)
315{
316 ktime_t time;
317
318 wd->qdisc->flags |= TCQ_F_THROTTLED;
11274e5a 319 smp_wmb();
4179477f
PM
320 time = ktime_set(0, 0);
321 time = ktime_add_ns(time, PSCHED_US2NS(expires));
322 hrtimer_start(&wd->timer, time, HRTIMER_MODE_ABS);
323}
324EXPORT_SYMBOL(qdisc_watchdog_schedule);
325
326void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
327{
328 hrtimer_cancel(&wd->timer);
329 wd->qdisc->flags &= ~TCQ_F_THROTTLED;
11274e5a 330 smp_wmb();
4179477f
PM
331}
332EXPORT_SYMBOL(qdisc_watchdog_cancel);
1da177e4
LT
333
334/* Allocate an unique handle from space managed by kernel */
335
336static u32 qdisc_alloc_handle(struct net_device *dev)
337{
338 int i = 0x10000;
339 static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
340
341 do {
342 autohandle += TC_H_MAKE(0x10000U, 0);
343 if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
344 autohandle = TC_H_MAKE(0x80000000U, 0);
345 } while (qdisc_lookup(dev, autohandle) && --i > 0);
346
347 return i>0 ? autohandle : 0;
348}
349
350/* Attach toplevel qdisc to device dev */
351
/* Attach a toplevel qdisc to a device.
 *
 * The device is quiesced (dev_deactivate) around the swap when it is up,
 * and reactivated afterwards. Returns the previous qdisc (which may still
 * hold references) so the caller can notify and destroy it.
 *
 * NOTE(review): for the ingress case a qdisc that still has other
 * references is replaced in dev->qdisc_ingress without being reset —
 * presumably intentional since ingress qdiscs are not owned by the
 * device's tx path; confirm against callers before changing.
 */
static struct Qdisc *
dev_graft_qdisc(struct net_device *dev, struct Qdisc *qdisc)
{
    struct Qdisc *oqdisc;

    if (dev->flags & IFF_UP)
        dev_deactivate(dev);

    qdisc_lock_tree(dev);
    if (qdisc && qdisc->flags&TCQ_F_INGRESS) {
        oqdisc = dev->qdisc_ingress;
        /* Prune old scheduler */
        if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1) {
            /* delete: we hold the only reference */
            qdisc_reset(oqdisc);
            dev->qdisc_ingress = NULL;
        } else {    /* new */
            dev->qdisc_ingress = qdisc;
        }

    } else {

        oqdisc = dev->qdisc_sleeping;

        /* Prune old scheduler */
        if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1)
            qdisc_reset(oqdisc);

        /* ... and graft new one; NULL means "no qdisc", i.e. noop */
        if (qdisc == NULL)
            qdisc = &noop_qdisc;
        dev->qdisc_sleeping = qdisc;
        /* dev->qdisc stays noop until dev_activate() installs it */
        dev->qdisc = &noop_qdisc;
    }

    qdisc_unlock_tree(dev);

    if (dev->flags & IFF_UP)
        dev_activate(dev);

    return oqdisc;
}
394
43effa1e
PM
395void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
396{
397 struct Qdisc_class_ops *cops;
398 unsigned long cl;
399 u32 parentid;
400
401 if (n == 0)
402 return;
403 while ((parentid = sch->parent)) {
404 sch = __qdisc_lookup(sch->dev, TC_H_MAJ(parentid));
405 cops = sch->ops->cl_ops;
406 if (cops->qlen_notify) {
407 cl = cops->get(sch, parentid);
408 cops->qlen_notify(sch, cl);
409 cops->put(sch, cl);
410 }
411 sch->q.qlen -= n;
412 }
413}
414EXPORT_SYMBOL(qdisc_tree_decrease_qlen);
1da177e4
LT
415
416/* Graft qdisc "new" to class "classid" of qdisc "parent" or
417 to device "dev".
418
419 Old qdisc is not destroyed but returned in *old.
420 */
421
/* Graft qdisc @new onto class @classid of qdisc @parent, or directly onto
 * the device when @parent is NULL. The displaced qdisc is returned in *old
 * (not destroyed — the caller notifies and destroys it).
 *
 * NOTE(review): when parent is NULL and the old qdisc is an ingress qdisc,
 * the old qdisc itself is re-grafted instead of @new — looks deliberate
 * (ingress replacement handled via dev_graft_qdisc's ingress branch), but
 * confirm against tc_modify_qdisc before touching.
 */
static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
                       u32 classid,
                       struct Qdisc *new, struct Qdisc **old)
{
    int err = 0;
    struct Qdisc *q = *old;


    if (parent == NULL) {
        if (q && q->flags&TCQ_F_INGRESS) {
            *old = dev_graft_qdisc(dev, q);
        } else {
            *old = dev_graft_qdisc(dev, new);
        }
    } else {
        struct Qdisc_class_ops *cops = parent->ops->cl_ops;

        /* Classless parents cannot accept a child qdisc. */
        err = -EINVAL;

        if (cops) {
            unsigned long cl = cops->get(parent, classid);
            if (cl) {
                err = cops->graft(parent, cl, new, old);
                if (new)
                    new->parent = classid;
                cops->put(parent, cl);
            }
        }
    }
    return err;
}
453
454/*
455 Allocate and initialize new qdisc.
456
457 Parameters are passed via opt.
458 */
459
/* Allocate and initialize a new qdisc of the kind named in tca[TCA_KIND-1].
 *
 * @handle: requested handle; TC_H_INGRESS marks an ingress qdisc, 0 means
 *          "allocate one from the kernel pool".
 * Returns the new qdisc on success, or NULL with *errp set. A special
 * *errp of -EAGAIN tells the caller to replay the whole request (we had
 * to drop the RTNL lock to load a module).
 */
static struct Qdisc *
qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp)
{
    int err;
    struct rtattr *kind = tca[TCA_KIND-1];
    struct Qdisc *sch;
    struct Qdisc_ops *ops;

    ops = qdisc_lookup_ops(kind);
#ifdef CONFIG_KMOD
    if (ops == NULL && kind != NULL) {
        char name[IFNAMSIZ];
        if (rtattr_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
            /* We dropped the RTNL semaphore in order to
             * perform the module load. So, even if we
             * succeeded in loading the module we have to
             * tell the caller to replay the request. We
             * indicate this using -EAGAIN.
             * We replay the request because the device may
             * go away in the mean time.
             */
            rtnl_unlock();
            request_module("sch_%s", name);
            rtnl_lock();
            ops = qdisc_lookup_ops(kind);
            if (ops != NULL) {
                /* We will try again qdisc_lookup_ops,
                 * so don't keep a reference.
                 */
                module_put(ops->owner);
                err = -EAGAIN;
                goto err_out;
            }
        }
    }
#endif

    err = -ENOENT;
    if (ops == NULL)
        goto err_out;

    /* qdisc_alloc takes a reference on dev (dropped on err_out3). */
    sch = qdisc_alloc(dev, ops);
    if (IS_ERR(sch)) {
        err = PTR_ERR(sch);
        goto err_out2;
    }

    if (handle == TC_H_INGRESS) {
        sch->flags |= TCQ_F_INGRESS;
        handle = TC_H_MAKE(TC_H_INGRESS, 0);
    } else if (handle == 0) {
        handle = qdisc_alloc_handle(dev);
        err = -ENOMEM;
        if (handle == 0)
            goto err_out3;
    }

    sch->handle = handle;

    if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS-1])) == 0) {
#ifdef CONFIG_NET_ESTIMATOR
        if (tca[TCA_RATE-1]) {
            err = gen_new_estimator(&sch->bstats, &sch->rate_est,
                                    sch->stats_lock,
                                    tca[TCA_RATE-1]);
            if (err) {
                /*
                 * Any broken qdiscs that would require
                 * a ops->reset() here? The qdisc was never
                 * in action so it shouldn't be necessary.
                 */
                if (ops->destroy)
                    ops->destroy(sch);
                goto err_out3;
            }
        }
#endif
        qdisc_lock_tree(dev);
        list_add_tail(&sch->list, &dev->qdisc_list);
        qdisc_unlock_tree(dev);

        return sch;
    }
err_out3:
    /* Undo qdisc_alloc: release dev and the (padded) allocation. */
    dev_put(dev);
    kfree((char *) sch - sch->padded);
err_out2:
    module_put(ops->owner);
err_out:
    *errp = err;
    return NULL;
}
552
553static int qdisc_change(struct Qdisc *sch, struct rtattr **tca)
554{
555 if (tca[TCA_OPTIONS-1]) {
556 int err;
557
558 if (sch->ops->change == NULL)
559 return -EINVAL;
560 err = sch->ops->change(sch, tca[TCA_OPTIONS-1]);
561 if (err)
562 return err;
563 }
564#ifdef CONFIG_NET_ESTIMATOR
565 if (tca[TCA_RATE-1])
566 gen_replace_estimator(&sch->bstats, &sch->rate_est,
567 sch->stats_lock, tca[TCA_RATE-1]);
568#endif
569 return 0;
570}
571
572struct check_loop_arg
573{
574 struct qdisc_walker w;
575 struct Qdisc *p;
576 int depth;
577};
578
579static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);
580
581static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
582{
583 struct check_loop_arg arg;
584
585 if (q->ops->cl_ops == NULL)
586 return 0;
587
588 arg.w.stop = arg.w.skip = arg.w.count = 0;
589 arg.w.fn = check_loop_fn;
590 arg.depth = depth;
591 arg.p = p;
592 q->ops->cl_ops->walk(q, &arg.w);
593 return arg.w.stop ? -ELOOP : 0;
594}
595
596static int
597check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
598{
599 struct Qdisc *leaf;
600 struct Qdisc_class_ops *cops = q->ops->cl_ops;
601 struct check_loop_arg *arg = (struct check_loop_arg *)w;
602
603 leaf = cops->leaf(q, cl);
604 if (leaf) {
605 if (leaf == arg->p || arg->depth > 7)
606 return -ELOOP;
607 return check_loop(leaf, arg->p, arg->depth + 1);
608 }
609 return 0;
610}
611
612/*
613 * Delete/get qdisc.
614 */
615
/* Netlink handler for RTM_DELQDISC and RTM_GETQDISC.
 *
 * Resolves the target qdisc from tcm_parent (clid) or tcm_handle, then
 * either ungrafts and destroys it (DEL) or just sends a notification (GET).
 */
static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
    struct tcmsg *tcm = NLMSG_DATA(n);
    struct rtattr **tca = arg;
    struct net_device *dev;
    u32 clid = tcm->tcm_parent;
    struct Qdisc *q = NULL;
    struct Qdisc *p = NULL;
    int err;

    if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
        return -ENODEV;

    if (clid) {
        if (clid != TC_H_ROOT) {
            if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
                /* clid names a class of parent qdisc p. */
                if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
                    return -ENOENT;
                q = qdisc_leaf(p, clid);
            } else { /* ingress */
                q = dev->qdisc_ingress;
            }
        } else {
            q = dev->qdisc_sleeping;
        }
        if (!q)
            return -ENOENT;

        /* If a handle was also given, it must match what we found. */
        if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
            return -EINVAL;
    } else {
        if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
            return -ENOENT;
    }

    if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
        return -EINVAL;

    if (n->nlmsg_type == RTM_DELQDISC) {
        if (!clid)
            return -EINVAL;
        /* handle 0 marks a builtin/default qdisc — not deletable. */
        if (q->handle == 0)
            return -ENOENT;
        if ((err = qdisc_graft(dev, p, clid, NULL, &q)) != 0)
            return err;
        if (q) {
            qdisc_notify(skb, n, clid, q, NULL);
            /* qdisc_destroy requires the device queue lock. */
            spin_lock_bh(&dev->queue_lock);
            qdisc_destroy(q);
            spin_unlock_bh(&dev->queue_lock);
        }
    } else {
        qdisc_notify(skb, n, clid, NULL, q);
    }
    return 0;
}
672
673/*
674 Create/change qdisc.
675 */
676
/* Netlink handler for RTM_NEWQDISC: create, replace, or change a qdisc.
 *
 * May restart from scratch (replay label) when qdisc_create() had to drop
 * the RTNL lock to load a module and returned -EAGAIN.
 */
static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
    struct tcmsg *tcm;
    struct rtattr **tca;
    struct net_device *dev;
    u32 clid;
    struct Qdisc *q, *p;
    int err;

replay:
    /* Reinit, just in case something touches this. */
    tcm = NLMSG_DATA(n);
    tca = arg;
    clid = tcm->tcm_parent;
    q = p = NULL;

    if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
        return -ENODEV;

    if (clid) {
        if (clid != TC_H_ROOT) {
            if (clid != TC_H_INGRESS) {
                /* Target is a class of parent qdisc p. */
                if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
                    return -ENOENT;
                q = qdisc_leaf(p, clid);
            } else { /*ingress */
                q = dev->qdisc_ingress;
            }
        } else {
            q = dev->qdisc_sleeping;
        }

        /* It may be default qdisc, ignore it */
        if (q && q->handle == 0)
            q = NULL;

        if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
            if (tcm->tcm_handle) {
                if (q && !(n->nlmsg_flags&NLM_F_REPLACE))
                    return -EEXIST;
                if (TC_H_MIN(tcm->tcm_handle))
                    return -EINVAL;
                if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
                    goto create_n_graft;
                if (n->nlmsg_flags&NLM_F_EXCL)
                    return -EEXIST;
                if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
                    return -EINVAL;
                /* Re-grafting q under p must not create a cycle. */
                if (q == p ||
                    (p && check_loop(q, p, 0)))
                    return -ELOOP;
                atomic_inc(&q->refcnt);
                goto graft;
            } else {
                if (q == NULL)
                    goto create_n_graft;

                /* This magic test requires explanation.
                 *
                 * We know, that some child q is already
                 * attached to this parent and have choice:
                 * either to change it or to create/graft new one.
                 *
                 * 1. We are allowed to create/graft only
                 * if CREATE and REPLACE flags are set.
                 *
                 * 2. If EXCL is set, requestor wanted to say,
                 * that qdisc tcm_handle is not expected
                 * to exist, so that we choose create/graft too.
                 *
                 * 3. The last case is when no flags are set.
                 * Alas, it is sort of hole in API, we
                 * cannot decide what to do unambiguously.
                 * For now we select create/graft, if
                 * user gave KIND, which does not match existing.
                 */
                if ((n->nlmsg_flags&NLM_F_CREATE) &&
                    (n->nlmsg_flags&NLM_F_REPLACE) &&
                    ((n->nlmsg_flags&NLM_F_EXCL) ||
                     (tca[TCA_KIND-1] &&
                      rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))))
                    goto create_n_graft;
            }
        }
    } else {
        if (!tcm->tcm_handle)
            return -EINVAL;
        q = qdisc_lookup(dev, tcm->tcm_handle);
    }

    /* Change qdisc parameters */
    if (q == NULL)
        return -ENOENT;
    if (n->nlmsg_flags&NLM_F_EXCL)
        return -EEXIST;
    if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
        return -EINVAL;
    err = qdisc_change(q, tca);
    if (err == 0)
        qdisc_notify(skb, n, clid, NULL, q);
    return err;

create_n_graft:
    if (!(n->nlmsg_flags&NLM_F_CREATE))
        return -ENOENT;
    if (clid == TC_H_INGRESS)
        q = qdisc_create(dev, tcm->tcm_parent, tca, &err);
    else
        q = qdisc_create(dev, tcm->tcm_handle, tca, &err);
    if (q == NULL) {
        /* -EAGAIN: a module was loaded; the device may have changed,
         * so redo the whole lookup. */
        if (err == -EAGAIN)
            goto replay;
        return err;
    }

graft:
    if (1) {
        struct Qdisc *old_q = NULL;
        err = qdisc_graft(dev, p, clid, q, &old_q);
        if (err) {
            if (q) {
                spin_lock_bh(&dev->queue_lock);
                qdisc_destroy(q);
                spin_unlock_bh(&dev->queue_lock);
            }
            return err;
        }
        qdisc_notify(skb, n, clid, old_q, q);
        if (old_q) {
            spin_lock_bh(&dev->queue_lock);
            qdisc_destroy(old_q);
            spin_unlock_bh(&dev->queue_lock);
        }
    }
    return 0;
}
813
/* Serialize one qdisc into a netlink message on @skb.
 *
 * Returns skb->len on success, -1 if the skb ran out of room (the partial
 * message is trimmed off). Note NLMSG_NEW and RTA_PUT jump to
 * nlmsg_failure/rtattr_failure on overflow.
 */
static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
                         u32 pid, u32 seq, u16 flags, int event)
{
    struct tcmsg *tcm;
    struct nlmsghdr *nlh;
    unsigned char *b = skb_tail_pointer(skb);   /* rollback point */
    struct gnet_dump d;

    nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
    tcm = NLMSG_DATA(nlh);
    tcm->tcm_family = AF_UNSPEC;
    tcm->tcm__pad1 = 0;
    tcm->tcm__pad2 = 0;
    tcm->tcm_ifindex = q->dev->ifindex;
    tcm->tcm_parent = clid;
    tcm->tcm_handle = q->handle;
    /* tcm_info carries the refcount for qdisc dumps. */
    tcm->tcm_info = atomic_read(&q->refcnt);
    RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id);
    if (q->ops->dump && q->ops->dump(q, skb) < 0)
        goto rtattr_failure;
    /* Refresh the queue length snapshot before dumping stats. */
    q->qstats.qlen = q->q.qlen;

    if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS,
                                     TCA_XSTATS, q->stats_lock, &d) < 0)
        goto rtattr_failure;

    if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
        goto rtattr_failure;

    if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
#ifdef CONFIG_NET_ESTIMATOR
        gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
#endif
        gnet_stats_copy_queue(&d, &q->qstats) < 0)
        goto rtattr_failure;

    if (gnet_stats_finish_copy(&d) < 0)
        goto rtattr_failure;

    nlh->nlmsg_len = skb_tail_pointer(skb) - b;
    return skb->len;

nlmsg_failure:
rtattr_failure:
    /* Remove the partially built message. */
    nlmsg_trim(skb, b);
    return -1;
}
861
862static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n,
863 u32 clid, struct Qdisc *old, struct Qdisc *new)
864{
865 struct sk_buff *skb;
866 u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
867
868 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
869 if (!skb)
870 return -ENOBUFS;
871
872 if (old && old->handle) {
873 if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0)
874 goto err_out;
875 }
876 if (new) {
877 if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
878 goto err_out;
879 }
880
881 if (skb->len)
ac6d439d 882 return rtnetlink_send(skb, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
1da177e4
LT
883
884err_out:
885 kfree_skb(skb);
886 return -EINVAL;
887}
888
/* Netlink dump handler for RTM_GETQDISC: enumerate all qdiscs on all
 * devices. Resumable: cb->args[0] is the device index and cb->args[1]
 * the per-device qdisc index reached so far.
 */
static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
{
    int idx, q_idx;
    int s_idx, s_q_idx;
    struct net_device *dev;
    struct Qdisc *q;

    /* Resume cookies from the previous dump invocation. */
    s_idx = cb->args[0];
    s_q_idx = q_idx = cb->args[1];
    read_lock(&dev_base_lock);
    for (dev=dev_base, idx=0; dev; dev = dev->next, idx++) {
        if (idx < s_idx)
            continue;
        /* Past the resume device: start its qdisc list from 0. */
        if (idx > s_idx)
            s_q_idx = 0;
        read_lock(&qdisc_tree_lock);
        q_idx = 0;
        list_for_each_entry(q, &dev->qdisc_list, list) {
            if (q_idx < s_q_idx) {
                q_idx++;
                continue;
            }
            /* skb full: stop here; cookies record where to resume. */
            if (tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
                              cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0) {
                read_unlock(&qdisc_tree_lock);
                goto done;
            }
            q_idx++;
        }
        read_unlock(&qdisc_tree_lock);
    }

done:
    read_unlock(&dev_base_lock);

    cb->args[0] = idx;
    cb->args[1] = q_idx;

    return skb->len;
}
929
930
931
932/************************************************
933 * Traffic classes manipulation. *
934 ************************************************/
935
936
937
/* Netlink handler for RTM_NEWTCLASS / RTM_DELTCLASS / RTM_GETTCLASS:
 * create, change, delete, or fetch a traffic class of a classful qdisc.
 */
static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
    struct tcmsg *tcm = NLMSG_DATA(n);
    struct rtattr **tca = arg;
    struct net_device *dev;
    struct Qdisc *q = NULL;
    struct Qdisc_class_ops *cops;
    unsigned long cl = 0;
    unsigned long new_cl;
    u32 pid = tcm->tcm_parent;
    u32 clid = tcm->tcm_handle;
    u32 qid = TC_H_MAJ(clid);
    int err;

    if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
        return -ENODEV;

    /*
       parent == TC_H_UNSPEC - unspecified parent.
       parent == TC_H_ROOT   - class is root, which has no parent.
       parent == X:0         - parent is root class.
       parent == X:Y         - parent is a node in hierarchy.
       parent == 0:Y         - parent is X:Y, where X:0 is qdisc.

       handle == 0:0         - generate handle from kernel pool.
       handle == 0:Y         - class is X:Y, where X:0 is qdisc.
       handle == X:Y         - clear.
       handle == X:0         - root class.
     */

    /* Step 1. Determine qdisc handle X:0 */

    if (pid != TC_H_ROOT) {
        u32 qid1 = TC_H_MAJ(pid);

        if (qid && qid1) {
            /* If both majors are known, they must be identical. */
            if (qid != qid1)
                return -EINVAL;
        } else if (qid1) {
            qid = qid1;
        } else if (qid == 0)
            qid = dev->qdisc_sleeping->handle;

        /* Now qid is genuine qdisc handle consistent
           both with parent and child.

           TC_H_MAJ(pid) still may be unspecified, complete it now.
         */
        if (pid)
            pid = TC_H_MAKE(qid, pid);
    } else {
        if (qid == 0)
            qid = dev->qdisc_sleeping->handle;
    }

    /* OK. Locate qdisc */
    if ((q = qdisc_lookup(dev, qid)) == NULL)
        return -ENOENT;

    /* And check that it supports classes */
    cops = q->ops->cl_ops;
    if (cops == NULL)
        return -EINVAL;

    /* Now try to get class */
    if (clid == 0) {
        if (pid == TC_H_ROOT)
            clid = qid;
    } else
        clid = TC_H_MAKE(qid, clid);

    if (clid)
        cl = cops->get(q, clid);

    if (cl == 0) {
        err = -ENOENT;
        /* Only creation may proceed without an existing class. */
        if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags&NLM_F_CREATE))
            goto out;
    } else {
        switch (n->nlmsg_type) {
        case RTM_NEWTCLASS:
            err = -EEXIST;
            if (n->nlmsg_flags&NLM_F_EXCL)
                goto out;
            break;
        case RTM_DELTCLASS:
            err = cops->delete(q, cl);
            if (err == 0)
                tclass_notify(skb, n, q, cl, RTM_DELTCLASS);
            goto out;
        case RTM_GETTCLASS:
            err = tclass_notify(skb, n, q, cl, RTM_NEWTCLASS);
            goto out;
        default:
            err = -EINVAL;
            goto out;
        }
    }

    /* Create or change: the qdisc's change hook does the real work. */
    new_cl = cl;
    err = cops->change(q, clid, pid, tca, &new_cl);
    if (err == 0)
        tclass_notify(skb, n, q, new_cl, RTM_NEWTCLASS);

out:
    if (cl)
        cops->put(q, cl);

    return err;
}
1049
1050
/* Serialize one traffic class into a netlink message on @skb.
 * Returns skb->len on success, -1 on overflow (partial message trimmed).
 * NOTE(review): tcm_parent is filled with q->handle here, not the class's
 * own parent — the qdisc's cl_ops->dump is expected to overwrite tcm
 * fields as needed; confirm against the class dump implementations.
 */
static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
                          unsigned long cl,
                          u32 pid, u32 seq, u16 flags, int event)
{
    struct tcmsg *tcm;
    struct nlmsghdr *nlh;
    unsigned char *b = skb_tail_pointer(skb);   /* rollback point */
    struct gnet_dump d;
    struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;

    nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
    tcm = NLMSG_DATA(nlh);
    tcm->tcm_family = AF_UNSPEC;
    tcm->tcm_ifindex = q->dev->ifindex;
    tcm->tcm_parent = q->handle;
    tcm->tcm_handle = q->handle;
    tcm->tcm_info = 0;
    RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id);
    if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
        goto rtattr_failure;

    if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS,
                                     TCA_XSTATS, q->stats_lock, &d) < 0)
        goto rtattr_failure;

    if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
        goto rtattr_failure;

    if (gnet_stats_finish_copy(&d) < 0)
        goto rtattr_failure;

    nlh->nlmsg_len = skb_tail_pointer(skb) - b;
    return skb->len;

nlmsg_failure:
rtattr_failure:
    /* Remove the partially built message. */
    nlmsg_trim(skb, b);
    return -1;
}
1090
1091static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
1092 struct Qdisc *q, unsigned long cl, int event)
1093{
1094 struct sk_buff *skb;
1095 u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
1096
1097 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1098 if (!skb)
1099 return -ENOBUFS;
1100
1101 if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) {
1102 kfree_skb(skb);
1103 return -EINVAL;
1104 }
1105
ac6d439d 1106 return rtnetlink_send(skb, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
1da177e4
LT
1107}
1108
1109struct qdisc_dump_args
1110{
1111 struct qdisc_walker w;
1112 struct sk_buff *skb;
1113 struct netlink_callback *cb;
1114};
1115
1116static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
1117{
1118 struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
1119
1120 return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid,
1121 a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
1122}
1123
/* Netlink dump handler for RTM_GETTCLASS: walk every classful qdisc on
 * the requested device and dump its classes. Resumable: cb->args[0] is
 * the qdisc index, cb->args[1..] the walker's per-qdisc skip state.
 */
static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
{
    int t;
    int s_t;
    struct net_device *dev;
    struct Qdisc *q;
    struct tcmsg *tcm = (struct tcmsg*)NLMSG_DATA(cb->nlh);
    struct qdisc_dump_args arg;

    if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
        return 0;
    /* dev_get_by_index takes a reference; dropped before returning. */
    if ((dev = dev_get_by_index(tcm->tcm_ifindex)) == NULL)
        return 0;

    s_t = cb->args[0];
    t = 0;

    read_lock(&qdisc_tree_lock);
    list_for_each_entry(q, &dev->qdisc_list, list) {
        /* Skip already-dumped, classless, or non-matching qdiscs. */
        if (t < s_t || !q->ops->cl_ops ||
            (tcm->tcm_parent &&
             TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
            t++;
            continue;
        }
        /* First new qdisc: reset the per-qdisc walker cookies. */
        if (t > s_t)
            memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
        arg.w.fn = qdisc_class_dump;
        arg.skb = skb;
        arg.cb = cb;
        arg.w.stop = 0;
        arg.w.skip = cb->args[1];
        arg.w.count = 0;
        q->ops->cl_ops->walk(q, &arg.w);
        cb->args[1] = arg.w.count;
        if (arg.w.stop)
            break;
        t++;
    }
    read_unlock(&qdisc_tree_lock);

    cb->args[0] = t;

    dev_put(dev);
    return skb->len;
}
1170
1171/* Main classifier routine: scans classifier chain attached
1172 to this qdisc, (optionally) tests for protocol and asks
1173 specific classifiers.
1174 */
1175int tc_classify(struct sk_buff *skb, struct tcf_proto *tp,
1176 struct tcf_result *res)
1177{
1178 int err = 0;
66c6f529 1179 __be16 protocol = skb->protocol;
1da177e4
LT
1180#ifdef CONFIG_NET_CLS_ACT
1181 struct tcf_proto *otp = tp;
1182reclassify:
1183#endif
1184 protocol = skb->protocol;
1185
1186 for ( ; tp; tp = tp->next) {
1187 if ((tp->protocol == protocol ||
b6d9bcb0 1188 tp->protocol == htons(ETH_P_ALL)) &&
1da177e4
LT
1189 (err = tp->classify(skb, tp, res)) >= 0) {
1190#ifdef CONFIG_NET_CLS_ACT
1191 if ( TC_ACT_RECLASSIFY == err) {
1192 __u32 verd = (__u32) G_TC_VERD(skb->tc_verd);
1193 tp = otp;
1194
1195 if (MAX_REC_LOOP < verd++) {
1196 printk("rule prio %d protocol %02x reclassify is buggy packet dropped\n",
1197 tp->prio&0xffff, ntohs(tp->protocol));
1198 return TC_ACT_SHOT;
1199 }
1200 skb->tc_verd = SET_TC_VERD(skb->tc_verd,verd);
1201 goto reclassify;
1202 } else {
10297b99 1203 if (skb->tc_verd)
1da177e4
LT
1204 skb->tc_verd = SET_TC_VERD(skb->tc_verd,0);
1205 return err;
1206 }
1207#else
1208
1209 return err;
1210#endif
1211 }
1212
1213 }
1214 return -1;
1215}
1216
1da177e4
LT
1217#ifdef CONFIG_PROC_FS
/* /proc/net/psched: report the packet scheduler clock parameters
 * (tick-to-usec conversion factors and hrtimer resolution) for tc(8).
 */
static int psched_show(struct seq_file *seq, void *v)
{
    seq_printf(seq, "%08x %08x %08x %08x\n",
               (u32)NSEC_PER_USEC, (u32)PSCHED_US2NS(1),
               1000000,
               (u32)NSEC_PER_SEC/(u32)ktime_to_ns(KTIME_MONOTONIC_RES));

    return 0;
}
1227
/* open() hook for /proc/net/psched: single-shot seq_file. */
static int psched_open(struct inode *inode, struct file *file)
{
    return single_open(file, psched_show, PDE(inode)->data);
}
1232
/* File operations for /proc/net/psched (read-only seq_file). */
static const struct file_operations psched_fops = {
    .owner   = THIS_MODULE,
    .open    = psched_open,
    .read    = seq_read,
    .llseek  = seq_lseek,
    .release = single_release,
};
1241
1da177e4
LT
/* Subsystem init: register the built-in fifo qdiscs, create the
 * /proc/net/psched clock-info file, and hook the qdisc/class rtnetlink
 * message handlers.
 */
static int __init pktsched_init(void)
{
    register_qdisc(&pfifo_qdisc_ops);
    register_qdisc(&bfifo_qdisc_ops);
    proc_net_fops_create("psched", 0, &psched_fops);

    rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL);
    rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL);
    rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc);
    rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL);
    rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL);
    rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass);

    return 0;
}

subsys_initcall(pktsched_init);
1259
1260EXPORT_SYMBOL(qdisc_get_rtab);
1261EXPORT_SYMBOL(qdisc_put_rtab);
1262EXPORT_SYMBOL(register_qdisc);
1263EXPORT_SYMBOL(unregister_qdisc);
1264EXPORT_SYMBOL(tc_classify);