]> git.proxmox.com Git - mirror_ubuntu-artful-kernel.git/blame - net/sched/sch_api.c
sky2: avoid duplicate link up on Optima chip
[mirror_ubuntu-artful-kernel.git] / net / sched / sch_api.c
CommitLineData
1da177e4
LT
1/*
2 * net/sched/sch_api.c Packet scheduler API.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10 *
11 * Fixes:
12 *
13 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
14 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
15 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
16 */
17
1da177e4
LT
18#include <linux/module.h>
19#include <linux/types.h>
20#include <linux/kernel.h>
1da177e4 21#include <linux/string.h>
1da177e4 22#include <linux/errno.h>
1da177e4 23#include <linux/skbuff.h>
1da177e4
LT
24#include <linux/init.h>
25#include <linux/proc_fs.h>
26#include <linux/seq_file.h>
27#include <linux/kmod.h>
28#include <linux/list.h>
4179477f 29#include <linux/hrtimer.h>
25bfcd5a 30#include <linux/lockdep.h>
1da177e4 31
457c4cbc 32#include <net/net_namespace.h>
b854272b 33#include <net/sock.h>
dc5fc579 34#include <net/netlink.h>
1da177e4
LT
35#include <net/pkt_sched.h>
36
7316ae88
TG
37static int qdisc_notify(struct net *net, struct sk_buff *oskb,
38 struct nlmsghdr *n, u32 clid,
1da177e4 39 struct Qdisc *old, struct Qdisc *new);
7316ae88
TG
40static int tclass_notify(struct net *net, struct sk_buff *oskb,
41 struct nlmsghdr *n, struct Qdisc *q,
42 unsigned long cl, int event);
1da177e4
LT
43
44/*
45
46 Short review.
47 -------------
48
49 This file consists of two interrelated parts:
50
51 1. queueing disciplines manager frontend.
52 2. traffic classes manager frontend.
53
54 Generally, queueing discipline ("qdisc") is a black box,
55 which is able to enqueue packets and to dequeue them (when
56 device is ready to send something) in order and at times
57 determined by algorithm hidden in it.
58
59 qdisc's are divided to two categories:
60 - "queues", which have no internal structure visible from outside.
61 - "schedulers", which split all the packets to "traffic classes",
62 using "packet classifiers" (look at cls_api.c)
63
64 In turn, classes may have child qdiscs (as rule, queues)
65 attached to them etc. etc. etc.
66
67 The goal of the routines in this file is to translate
68 information supplied by user in the form of handles
69 to more intelligible for kernel form, to make some sanity
70 checks and part of work, which is common to all qdiscs
71 and to provide rtnetlink notifications.
72
73 All real intelligent work is done inside qdisc modules.
74
75
76
77 Every discipline has two major routines: enqueue and dequeue.
78
79 ---dequeue
80
81 dequeue usually returns a skb to send. It is allowed to return NULL,
82 but it does not mean that queue is empty, it just means that
83 discipline does not want to send anything this time.
84 Queue is really empty if q->q.qlen == 0.
85 For complicated disciplines with multiple queues q->q is not
86 real packet queue, but however q->q.qlen must be valid.
87
88 ---enqueue
89
90 enqueue returns 0, if packet was enqueued successfully.
91 If packet (this one or another one) was dropped, it returns
92 not zero error code.
93 NET_XMIT_DROP - this packet dropped
94 Expected action: do not backoff, but wait until queue will clear.
95 NET_XMIT_CN - probably this packet enqueued, but another one dropped.
96 Expected action: backoff or ignore
97 NET_XMIT_POLICED - dropped by police.
98 Expected action: backoff or error to real-time apps.
99
100 Auxiliary routines:
101
99c0db26
JP
102 ---peek
103
104 like dequeue but without removing a packet from the queue
105
1da177e4
LT
106 ---reset
107
108 returns qdisc to initial state: purge all buffers, clear all
109 timers, counters (except for statistics) etc.
110
111 ---init
112
113 initializes newly created qdisc.
114
115 ---destroy
116
117 destroys resources allocated by init and during lifetime of qdisc.
118
119 ---change
120
121 changes qdisc parameters.
122 */
123
124/* Protects list of registered TC modules. It is pure SMP lock. */
125static DEFINE_RWLOCK(qdisc_mod_lock);
126
127
128/************************************************
129 * Queueing disciplines manipulation. *
130 ************************************************/
131
132
133/* The list of all installed queueing disciplines. */
134
135static struct Qdisc_ops *qdisc_base;
136
/* Register/unregister queueing discipline */
138
139int register_qdisc(struct Qdisc_ops *qops)
140{
141 struct Qdisc_ops *q, **qp;
142 int rc = -EEXIST;
143
144 write_lock(&qdisc_mod_lock);
145 for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
146 if (!strcmp(qops->id, q->id))
147 goto out;
148
149 if (qops->enqueue == NULL)
150 qops->enqueue = noop_qdisc_ops.enqueue;
99c0db26
JP
151 if (qops->peek == NULL) {
152 if (qops->dequeue == NULL) {
153 qops->peek = noop_qdisc_ops.peek;
154 } else {
155 rc = -EINVAL;
156 goto out;
157 }
158 }
1da177e4
LT
159 if (qops->dequeue == NULL)
160 qops->dequeue = noop_qdisc_ops.dequeue;
161
162 qops->next = NULL;
163 *qp = qops;
164 rc = 0;
165out:
166 write_unlock(&qdisc_mod_lock);
167 return rc;
168}
62e3ba1b 169EXPORT_SYMBOL(register_qdisc);
1da177e4
LT
170
171int unregister_qdisc(struct Qdisc_ops *qops)
172{
173 struct Qdisc_ops *q, **qp;
174 int err = -ENOENT;
175
176 write_lock(&qdisc_mod_lock);
177 for (qp = &qdisc_base; (q=*qp)!=NULL; qp = &q->next)
178 if (q == qops)
179 break;
180 if (q) {
181 *qp = q->next;
182 q->next = NULL;
183 err = 0;
184 }
185 write_unlock(&qdisc_mod_lock);
186 return err;
187}
62e3ba1b 188EXPORT_SYMBOL(unregister_qdisc);
1da177e4
LT
189
190/* We know handle. Find qdisc among all qdisc's attached to device
191 (root qdisc, all its children, children of children etc.)
192 */
193
6113b748 194static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
8123b421
DM
195{
196 struct Qdisc *q;
197
198 if (!(root->flags & TCQ_F_BUILTIN) &&
199 root->handle == handle)
200 return root;
201
202 list_for_each_entry(q, &root->list, list) {
203 if (q->handle == handle)
204 return q;
205 }
206 return NULL;
207}
208
f6e0b239
JP
209static void qdisc_list_add(struct Qdisc *q)
210{
f6486d40 211 if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
af356afa 212 list_add_tail(&q->list, &qdisc_dev(q)->qdisc->list);
f6e0b239
JP
213}
214
215void qdisc_list_del(struct Qdisc *q)
216{
f6486d40 217 if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
f6e0b239 218 list_del(&q->list);
f6e0b239
JP
219}
220EXPORT_SYMBOL(qdisc_list_del);
221
ead81cc5 222struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
1da177e4 223{
f6e0b239
JP
224 struct Qdisc *q;
225
af356afa
PM
226 q = qdisc_match_from_root(dev->qdisc, handle);
227 if (q)
228 goto out;
f6e0b239
JP
229
230 q = qdisc_match_from_root(dev->rx_queue.qdisc_sleeping, handle);
f6486d40 231out:
f6e0b239 232 return q;
1da177e4
LT
233}
234
235static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
236{
237 unsigned long cl;
238 struct Qdisc *leaf;
20fea08b 239 const struct Qdisc_class_ops *cops = p->ops->cl_ops;
1da177e4
LT
240
241 if (cops == NULL)
242 return NULL;
243 cl = cops->get(p, classid);
244
245 if (cl == 0)
246 return NULL;
247 leaf = cops->leaf(p, cl);
248 cops->put(p, cl);
249 return leaf;
250}
251
252/* Find queueing discipline by name */
253
1e90474c 254static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
1da177e4
LT
255{
256 struct Qdisc_ops *q = NULL;
257
258 if (kind) {
259 read_lock(&qdisc_mod_lock);
260 for (q = qdisc_base; q; q = q->next) {
1e90474c 261 if (nla_strcmp(kind, q->id) == 0) {
1da177e4
LT
262 if (!try_module_get(q->owner))
263 q = NULL;
264 break;
265 }
266 }
267 read_unlock(&qdisc_mod_lock);
268 }
269 return q;
270}
271
272static struct qdisc_rate_table *qdisc_rtab_list;
273
1e90474c 274struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab)
1da177e4
LT
275{
276 struct qdisc_rate_table *rtab;
277
278 for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
279 if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {
280 rtab->refcnt++;
281 return rtab;
282 }
283 }
284
5feb5e1a
PM
285 if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
286 nla_len(tab) != TC_RTAB_SIZE)
1da177e4
LT
287 return NULL;
288
289 rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
290 if (rtab) {
291 rtab->rate = *r;
292 rtab->refcnt = 1;
1e90474c 293 memcpy(rtab->data, nla_data(tab), 1024);
1da177e4
LT
294 rtab->next = qdisc_rtab_list;
295 qdisc_rtab_list = rtab;
296 }
297 return rtab;
298}
62e3ba1b 299EXPORT_SYMBOL(qdisc_get_rtab);
1da177e4
LT
300
301void qdisc_put_rtab(struct qdisc_rate_table *tab)
302{
303 struct qdisc_rate_table *rtab, **rtabp;
304
305 if (!tab || --tab->refcnt)
306 return;
307
308 for (rtabp = &qdisc_rtab_list; (rtab=*rtabp) != NULL; rtabp = &rtab->next) {
309 if (rtab == tab) {
310 *rtabp = rtab->next;
311 kfree(rtab);
312 return;
313 }
314 }
315}
62e3ba1b 316EXPORT_SYMBOL(qdisc_put_rtab);
1da177e4 317
175f9c1b
JK
318static LIST_HEAD(qdisc_stab_list);
319static DEFINE_SPINLOCK(qdisc_stab_lock);
320
321static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
322 [TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) },
323 [TCA_STAB_DATA] = { .type = NLA_BINARY },
324};
325
326static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt)
327{
328 struct nlattr *tb[TCA_STAB_MAX + 1];
329 struct qdisc_size_table *stab;
330 struct tc_sizespec *s;
331 unsigned int tsize = 0;
332 u16 *tab = NULL;
333 int err;
334
335 err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy);
336 if (err < 0)
337 return ERR_PTR(err);
338 if (!tb[TCA_STAB_BASE])
339 return ERR_PTR(-EINVAL);
340
341 s = nla_data(tb[TCA_STAB_BASE]);
342
343 if (s->tsize > 0) {
344 if (!tb[TCA_STAB_DATA])
345 return ERR_PTR(-EINVAL);
346 tab = nla_data(tb[TCA_STAB_DATA]);
347 tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
348 }
349
350 if (!s || tsize != s->tsize || (!tab && tsize > 0))
351 return ERR_PTR(-EINVAL);
352
f3b9605d 353 spin_lock(&qdisc_stab_lock);
175f9c1b
JK
354
355 list_for_each_entry(stab, &qdisc_stab_list, list) {
356 if (memcmp(&stab->szopts, s, sizeof(*s)))
357 continue;
358 if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
359 continue;
360 stab->refcnt++;
f3b9605d 361 spin_unlock(&qdisc_stab_lock);
175f9c1b
JK
362 return stab;
363 }
364
f3b9605d 365 spin_unlock(&qdisc_stab_lock);
175f9c1b
JK
366
367 stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
368 if (!stab)
369 return ERR_PTR(-ENOMEM);
370
371 stab->refcnt = 1;
372 stab->szopts = *s;
373 if (tsize > 0)
374 memcpy(stab->data, tab, tsize * sizeof(u16));
375
f3b9605d 376 spin_lock(&qdisc_stab_lock);
175f9c1b 377 list_add_tail(&stab->list, &qdisc_stab_list);
f3b9605d 378 spin_unlock(&qdisc_stab_lock);
175f9c1b
JK
379
380 return stab;
381}
382
/* Drop one reference on a shared size table; unlink and free it when
 * the last reference is gone.  NULL is a no-op.  The refcount is only
 * manipulated under qdisc_stab_lock, matching qdisc_get_stab().
 */
void qdisc_put_stab(struct qdisc_size_table *tab)
{
	if (!tab)
		return;

	spin_lock(&qdisc_stab_lock);

	if (--tab->refcnt == 0) {
		list_del(&tab->list);
		kfree(tab);
	}

	spin_unlock(&qdisc_stab_lock);
}
EXPORT_SYMBOL(qdisc_put_stab);
398
/* Dump a size table as a nested TCA_STAB attribute.  Only the sizespec
 * header is emitted (not the data array).  Returns skb->len on success
 * or -1 if the skb ran out of tailroom.  Note: NLA_PUT is a macro that
 * jumps to nla_put_failure on overflow.
 */
static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
{
	struct nlattr *nest;

	nest = nla_nest_start(skb, TCA_STAB);
	if (nest == NULL)
		goto nla_put_failure;
	NLA_PUT(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts);
	nla_nest_end(skb, nest);

	return skb->len;

nla_put_failure:
	return -1;
}
414
/* Translate skb->len into a scheduling length via the size table:
 * add per-packet overhead, pick the table slot, and scale by
 * 2^size_log.  Slots past the table extrapolate linearly from the last
 * entry.  The result (never below 1) is stored in the skb cb.
 */
void qdisc_calculate_pkt_len(struct sk_buff *skb, struct qdisc_size_table *stab)
{
	int pkt_len, slot;

	pkt_len = skb->len + stab->szopts.overhead;
	if (unlikely(!stab->szopts.tsize))
		goto out;

	/* cell_align may be negative; clamp the slot at zero. */
	slot = pkt_len + stab->szopts.cell_align;
	if (unlikely(slot < 0))
		slot = 0;

	slot >>= stab->szopts.cell_log;
	if (likely(slot < stab->szopts.tsize))
		pkt_len = stab->data[slot];
	else
		/* Out-of-range: extrapolate from the final table entry. */
		pkt_len = stab->data[stab->szopts.tsize - 1] *
				(slot / stab->szopts.tsize) +
				stab->data[slot % stab->szopts.tsize];

	pkt_len <<= stab->szopts.size_log;
out:
	if (unlikely(pkt_len < 1))
		pkt_len = 1;
	qdisc_skb_cb(skb)->pkt_len = pkt_len;
}
EXPORT_SYMBOL(qdisc_calculate_pkt_len);
442
b00355db
JP
443void qdisc_warn_nonwc(char *txt, struct Qdisc *qdisc)
444{
445 if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
446 printk(KERN_WARNING
447 "%s: %s qdisc %X: is non-work-conserving?\n",
448 txt, qdisc->ops->id, qdisc->handle >> 16);
449 qdisc->flags |= TCQ_F_WARN_NONWC;
450 }
451}
452EXPORT_SYMBOL(qdisc_warn_nonwc);
453
4179477f
PM
454static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
455{
456 struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
2fbd3da3 457 timer);
4179477f
PM
458
459 wd->qdisc->flags &= ~TCQ_F_THROTTLED;
8608db03 460 __netif_schedule(qdisc_root(wd->qdisc));
1936502d 461
4179477f
PM
462 return HRTIMER_NORESTART;
463}
464
465void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
466{
2fbd3da3
DM
467 hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
468 wd->timer.function = qdisc_watchdog;
4179477f
PM
469 wd->qdisc = qdisc;
470}
471EXPORT_SYMBOL(qdisc_watchdog_init);
472
473void qdisc_watchdog_schedule(struct qdisc_watchdog *wd, psched_time_t expires)
474{
475 ktime_t time;
476
2540e051
JP
477 if (test_bit(__QDISC_STATE_DEACTIVATED,
478 &qdisc_root_sleeping(wd->qdisc)->state))
479 return;
480
4179477f
PM
481 wd->qdisc->flags |= TCQ_F_THROTTLED;
482 time = ktime_set(0, 0);
ca44d6e6 483 time = ktime_add_ns(time, PSCHED_TICKS2NS(expires));
2fbd3da3 484 hrtimer_start(&wd->timer, time, HRTIMER_MODE_ABS);
4179477f
PM
485}
486EXPORT_SYMBOL(qdisc_watchdog_schedule);
487
/* Cancel a pending watchdog (waiting for a running callback to finish)
 * and clear the throttled flag so dequeue is no longer blocked.
 */
void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
{
	hrtimer_cancel(&wd->timer);
	wd->qdisc->flags &= ~TCQ_F_THROTTLED;
}
EXPORT_SYMBOL(qdisc_watchdog_cancel);
1da177e4 494
a94f779f 495static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
6fe1c7a5
PM
496{
497 unsigned int size = n * sizeof(struct hlist_head), i;
498 struct hlist_head *h;
499
500 if (size <= PAGE_SIZE)
501 h = kmalloc(size, GFP_KERNEL);
502 else
503 h = (struct hlist_head *)
504 __get_free_pages(GFP_KERNEL, get_order(size));
505
506 if (h != NULL) {
507 for (i = 0; i < n; i++)
508 INIT_HLIST_HEAD(&h[i]);
509 }
510 return h;
511}
512
513static void qdisc_class_hash_free(struct hlist_head *h, unsigned int n)
514{
515 unsigned int size = n * sizeof(struct hlist_head);
516
517 if (size <= PAGE_SIZE)
518 kfree(h);
519 else
520 free_pages((unsigned long)h, get_order(size));
521}
522
/* Double the class hash when the load factor exceeds 0.75.  The new
 * table is allocated outside the qdisc tree lock; only the rehash and
 * pointer swap happen under sch_tree_lock(), and the old table is
 * freed after unlocking.  Allocation failure simply leaves the old
 * (still valid) table in place.
 */
void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
{
	struct Qdisc_class_common *cl;
	struct hlist_node *n, *next;
	struct hlist_head *nhash, *ohash;
	unsigned int nsize, nmask, osize;
	unsigned int i, h;

	/* Rehash when load factor exceeds 0.75 */
	if (clhash->hashelems * 4 <= clhash->hashsize * 3)
		return;
	nsize = clhash->hashsize * 2;
	nmask = nsize - 1;
	nhash = qdisc_class_hash_alloc(nsize);
	if (nhash == NULL)
		return;

	ohash = clhash->hash;
	osize = clhash->hashsize;

	sch_tree_lock(sch);
	/* Move every class into its bucket under the new mask. */
	for (i = 0; i < osize; i++) {
		hlist_for_each_entry_safe(cl, n, next, &ohash[i], hnode) {
			h = qdisc_class_hash(cl->classid, nmask);
			hlist_add_head(&cl->hnode, &nhash[h]);
		}
	}
	clhash->hash = nhash;
	clhash->hashsize = nsize;
	clhash->hashmask = nmask;
	sch_tree_unlock(sch);

	qdisc_class_hash_free(ohash, osize);
}
EXPORT_SYMBOL(qdisc_class_hash_grow);
558
559int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
560{
561 unsigned int size = 4;
562
563 clhash->hash = qdisc_class_hash_alloc(size);
564 if (clhash->hash == NULL)
565 return -ENOMEM;
566 clhash->hashsize = size;
567 clhash->hashmask = size - 1;
568 clhash->hashelems = 0;
569 return 0;
570}
571EXPORT_SYMBOL(qdisc_class_hash_init);
572
/* Free the bucket array of a class hash.  Classes themselves are the
 * owner qdisc's responsibility and must already be gone.
 */
void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
{
	qdisc_class_hash_free(clhash->hash, clhash->hashsize);
}
EXPORT_SYMBOL(qdisc_class_hash_destroy);
578
579void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
580 struct Qdisc_class_common *cl)
581{
582 unsigned int h;
583
584 INIT_HLIST_NODE(&cl->hnode);
585 h = qdisc_class_hash(cl->classid, clhash->hashmask);
586 hlist_add_head(&cl->hnode, &clhash->hash[h]);
587 clhash->hashelems++;
588}
589EXPORT_SYMBOL(qdisc_class_hash_insert);
590
/* Unhash a class and decrement the element count.  The caller must
 * hold whatever lock protects the hash (normally the qdisc tree lock).
 */
void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
			     struct Qdisc_class_common *cl)
{
	hlist_del(&cl->hnode);
	clhash->hashelems--;
}
EXPORT_SYMBOL(qdisc_class_hash_remove);
598
1da177e4
LT
599/* Allocate an unique handle from space managed by kernel */
600
/* Allocate an unused handle from the kernel-managed 0x8000xxxx space.
 * A static rolling counter advances by one major number per call,
 * wrapping back to 0x80000000 before reaching TC_H_ROOT; up to 0x10000
 * candidates are probed against qdisc_lookup().  Returns 0 when the
 * whole space is exhausted.  Relies on the RTNL for serialisation of
 * the static counter.
 */
static u32 qdisc_alloc_handle(struct net_device *dev)
{
	int i = 0x10000;
	static u32 autohandle = TC_H_MAKE(0x80000000U, 0);

	do {
		autohandle += TC_H_MAKE(0x10000U, 0);
		if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
			autohandle = TC_H_MAKE(0x80000000U, 0);
	} while (qdisc_lookup(dev, autohandle) && --i > 0);

	return i > 0 ? autohandle : 0;
}
614
43effa1e
PM
/* Propagate a queue-length decrease of n packets up the qdisc tree so
 * every ancestor's qlen stays consistent, giving each classful parent
 * a qlen_notify() chance to deactivate now-empty classes.  Stops at an
 * ingress parent (no accounting above it).  NOTE(review): cops is
 * dereferenced without a NULL check — appears to rely on every parent
 * of a grafted qdisc being classful; confirm against callers.
 */
void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
{
	const struct Qdisc_class_ops *cops;
	unsigned long cl;
	u32 parentid;

	if (n == 0)
		return;
	while ((parentid = sch->parent)) {
		if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
			return;

		sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
		if (sch == NULL) {
			/* Only the root may legitimately have no parent qdisc. */
			WARN_ON(parentid != TC_H_ROOT);
			return;
		}
		cops = sch->ops->cl_ops;
		if (cops->qlen_notify) {
			cl = cops->get(sch, parentid);
			cops->qlen_notify(sch, cl);
			cops->put(sch, cl);
		}
		sch->q.qlen -= n;
	}
}
EXPORT_SYMBOL(qdisc_tree_decrease_qlen);
1da177e4 642
7316ae88
TG
643static void notify_and_destroy(struct net *net, struct sk_buff *skb,
644 struct nlmsghdr *n, u32 clid,
99194cff
DM
645 struct Qdisc *old, struct Qdisc *new)
646{
647 if (new || old)
7316ae88 648 qdisc_notify(net, skb, n, clid, old, new);
1da177e4 649
4d8863a2 650 if (old)
99194cff 651 qdisc_destroy(old);
99194cff
DM
652}
653
654/* Graft qdisc "new" to class "classid" of qdisc "parent" or
655 * to device "dev".
656 *
657 * When appropriate send a netlink notification using 'skb'
658 * and "n".
659 *
660 * On success, destroy old qdisc.
1da177e4
LT
661 */
662
/* Graft qdisc "new" to class "classid" of qdisc "parent" or, when
 * parent is NULL, directly to device "dev" (root or ingress).  Sends a
 * netlink notification via skb/n and destroys the displaced qdisc on
 * success.  The device-root path deactivates the device around the
 * swap, grafting one qdisc per tx queue (sharing refcounts) or a
 * single ingress qdisc.
 */
static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
		       struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
		       struct Qdisc *new, struct Qdisc *old)
{
	struct Qdisc *q = old;
	struct net *net = dev_net(dev);
	int err = 0;

	if (parent == NULL) {
		unsigned int i, num_q, ingress;

		ingress = 0;
		num_q = dev->num_tx_queues;
		/* Ingress is a single queue regardless of tx queue count. */
		if ((q && q->flags & TCQ_F_INGRESS) ||
		    (new && new->flags & TCQ_F_INGRESS)) {
			num_q = 1;
			ingress = 1;
		}

		if (dev->flags & IFF_UP)
			dev_deactivate(dev);

		/* Multi-queue aware qdiscs attach themselves per queue. */
		if (new && new->ops->attach) {
			new->ops->attach(new);
			num_q = 0;
		}

		for (i = 0; i < num_q; i++) {
			struct netdev_queue *dev_queue = &dev->rx_queue;

			if (!ingress)
				dev_queue = netdev_get_tx_queue(dev, i);

			old = dev_graft_qdisc(dev_queue, new);
			/* One reference per additional tx queue sharing new. */
			if (new && i > 0)
				atomic_inc(&new->refcnt);

			if (!ingress)
				qdisc_destroy(old);
		}

		if (!ingress) {
			notify_and_destroy(net, skb, n, classid,
					   dev->qdisc, new);
			if (new && !new->ops->attach)
				atomic_inc(&new->refcnt);
			dev->qdisc = new ? : &noop_qdisc;
		} else {
			notify_and_destroy(net, skb, n, classid, old, new);
		}

		if (dev->flags & IFF_UP)
			dev_activate(dev);
	} else {
		const struct Qdisc_class_ops *cops = parent->ops->cl_ops;

		/* Grafting into a class requires classful parent support. */
		err = -EOPNOTSUPP;
		if (cops && cops->graft) {
			unsigned long cl = cops->get(parent, classid);
			if (cl) {
				err = cops->graft(parent, cl, new, &old);
				cops->put(parent, cl);
			} else
				err = -ENOENT;
		}
		if (!err)
			notify_and_destroy(net, skb, n, classid, old, new);
	}
	return err;
}
733
25bfcd5a
JP
734/* lockdep annotation is needed for ingress; egress gets it only for name */
735static struct lock_class_key qdisc_tx_lock;
736static struct lock_class_key qdisc_rx_lock;
737
1da177e4
LT
738/*
739 Allocate and initialize new qdisc.
740
741 Parameters are passed via opt.
742 */
743
/*
 * Allocate and initialize a new qdisc.
 *
 * Parameters are passed via opt (tca).  Looks up the ops by TCA_KIND
 * (autoloading "sch_<name>" modules, in which case -EAGAIN tells the
 * caller to replay the request), allocates the qdisc, assigns or
 * auto-allocates a handle, runs ops->init, and attaches the optional
 * size table and rate estimator.  On failure *errp is set and NULL is
 * returned; the error labels unwind in reverse order of acquisition.
 */
static struct Qdisc *
qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
	     struct Qdisc *p, u32 parent, u32 handle,
	     struct nlattr **tca, int *errp)
{
	int err;
	struct nlattr *kind = tca[TCA_KIND];
	struct Qdisc *sch;
	struct Qdisc_ops *ops;
	struct qdisc_size_table *stab;

	ops = qdisc_lookup_ops(kind);
#ifdef CONFIG_MODULES
	if (ops == NULL && kind != NULL) {
		char name[IFNAMSIZ];
		if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
			/* We dropped the RTNL semaphore in order to
			 * perform the module load. So, even if we
			 * succeeded in loading the module we have to
			 * tell the caller to replay the request. We
			 * indicate this using -EAGAIN.
			 * We replay the request because the device may
			 * go away in the mean time.
			 */
			rtnl_unlock();
			request_module("sch_%s", name);
			rtnl_lock();
			ops = qdisc_lookup_ops(kind);
			if (ops != NULL) {
				/* We will try again qdisc_lookup_ops,
				 * so don't keep a reference.
				 */
				module_put(ops->owner);
				err = -EAGAIN;
				goto err_out;
			}
		}
	}
#endif

	err = -ENOENT;
	if (ops == NULL)
		goto err_out;

	sch = qdisc_alloc(dev_queue, ops);
	if (IS_ERR(sch)) {
		err = PTR_ERR(sch);
		goto err_out2;
	}

	sch->parent = parent;

	if (handle == TC_H_INGRESS) {
		sch->flags |= TCQ_F_INGRESS;
		handle = TC_H_MAKE(TC_H_INGRESS, 0);
		lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock);
	} else {
		if (handle == 0) {
			/* Auto-allocate from the kernel handle space. */
			handle = qdisc_alloc_handle(dev);
			err = -ENOMEM;
			if (handle == 0)
				goto err_out3;
		}
		lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
	}

	sch->handle = handle;

	if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) {
		if (tca[TCA_STAB]) {
			stab = qdisc_get_stab(tca[TCA_STAB]);
			if (IS_ERR(stab)) {
				err = PTR_ERR(stab);
				goto err_out4;
			}
			sch->stab = stab;
		}
		if (tca[TCA_RATE]) {
			spinlock_t *root_lock;

			/* MQ roots keep no stats of their own. */
			err = -EOPNOTSUPP;
			if (sch->flags & TCQ_F_MQROOT)
				goto err_out4;

			if ((sch->parent != TC_H_ROOT) &&
			    !(sch->flags & TCQ_F_INGRESS) &&
			    (!p || !(p->flags & TCQ_F_MQROOT)))
				root_lock = qdisc_root_sleeping_lock(sch);
			else
				root_lock = qdisc_lock(sch);

			err = gen_new_estimator(&sch->bstats, &sch->rate_est,
						root_lock, tca[TCA_RATE]);
			if (err)
				goto err_out4;
		}

		qdisc_list_add(sch);

		return sch;
	}
err_out3:
	dev_put(dev);
	kfree((char *) sch - sch->padded);
err_out2:
	module_put(ops->owner);
err_out:
	*errp = err;
	return NULL;

err_out4:
	/*
	 * Any broken qdiscs that would require a ops->reset() here?
	 * The qdisc was never in action so it shouldn't be necessary.
	 */
	qdisc_put_stab(sch->stab);
	if (ops->destroy)
		ops->destroy(sch);
	goto err_out3;
}
864
/* Apply netlink attribute changes to an existing qdisc: forward
 * TCA_OPTIONS to ops->change, swap in a new size table, and replace
 * the rate estimator.  Estimator errors are deliberately ignored
 * because earlier changes cannot be undone at that point.
 */
static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
{
	struct qdisc_size_table *stab = NULL;
	int err = 0;

	if (tca[TCA_OPTIONS]) {
		if (sch->ops->change == NULL)
			return -EINVAL;
		err = sch->ops->change(sch, tca[TCA_OPTIONS]);
		if (err)
			return err;
	}

	if (tca[TCA_STAB]) {
		stab = qdisc_get_stab(tca[TCA_STAB]);
		if (IS_ERR(stab))
			return PTR_ERR(stab);
	}

	/* Release the old table (or clear it when TCA_STAB is absent). */
	qdisc_put_stab(sch->stab);
	sch->stab = stab;

	if (tca[TCA_RATE]) {
		/* NB: ignores errors from replace_estimator
		   because change can't be undone. */
		if (sch->flags & TCQ_F_MQROOT)
			goto out;
		gen_replace_estimator(&sch->bstats, &sch->rate_est,
				      qdisc_root_sleeping_lock(sch),
				      tca[TCA_RATE]);
	}
out:
	return 0;
}
899
/* Walker state for loop detection when grafting: p is the prospective
 * child being checked for, depth bounds the recursion.
 */
struct check_loop_arg
{
	struct qdisc_walker w;
	struct Qdisc *p;
	int depth;
};

static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);

/* Return -ELOOP if grafting p under q would create a cycle (or exceed
 * the maximum nesting depth); 0 otherwise.  Classless qdiscs cannot
 * contain p, so they trivially pass.
 */
static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
{
	struct check_loop_arg arg;

	if (q->ops->cl_ops == NULL)
		return 0;

	arg.w.stop = arg.w.skip = arg.w.count = 0;
	arg.w.fn = check_loop_fn;
	arg.depth = depth;
	arg.p = p;
	q->ops->cl_ops->walk(q, &arg.w);
	/* The walker sets w.stop when check_loop_fn reported a loop. */
	return arg.w.stop ? -ELOOP : 0;
}
923
/* Per-class callback for check_loop(): recurse into each class's leaf
 * qdisc, failing with -ELOOP on finding p or exceeding depth 7.
 */
static int
check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
{
	struct Qdisc *leaf;
	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
	struct check_loop_arg *arg = (struct check_loop_arg *)w;

	leaf = cops->leaf(q, cl);
	if (leaf) {
		if (leaf == arg->p || arg->depth > 7)
			return -ELOOP;
		return check_loop(leaf, arg->p, arg->depth + 1);
	}
	return 0;
}
939
940/*
941 * Delete/get qdisc.
942 */
943
/* RTM_DELQDISC / RTM_GETQDISC handler.  Resolves the target qdisc from
 * tcm_parent (root, ingress, or a parent's leaf) or from tcm_handle,
 * cross-checks any supplied handle and TCA_KIND, then either ungrafts
 * and destroys it (delete) or just sends a notification (get).
 */
static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = NLMSG_DATA(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid = tcm->tcm_parent;
	struct Qdisc *q = NULL;
	struct Qdisc *p = NULL;
	int err;

	if ((dev = __dev_get_by_index(net, tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;

	if (clid) {
		if (clid != TC_H_ROOT) {
			if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
				if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
					return -ENOENT;
				q = qdisc_leaf(p, clid);
			} else { /* ingress */
				q = dev->rx_queue.qdisc_sleeping;
			}
		} else {
			q = dev->qdisc;
		}
		if (!q)
			return -ENOENT;

		/* A supplied handle must agree with the resolved qdisc. */
		if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
			return -EINVAL;
	} else {
		if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
			return -ENOENT;
	}

	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
		return -EINVAL;

	if (n->nlmsg_type == RTM_DELQDISC) {
		if (!clid)
			return -EINVAL;
		/* Handle 0 means a built-in/default qdisc; not deletable. */
		if (q->handle == 0)
			return -ENOENT;
		if ((err = qdisc_graft(dev, p, skb, n, clid, NULL, q)) != 0)
			return err;
	} else {
		qdisc_notify(net, skb, n, clid, NULL, q);
	}
	return 0;
}
999
1000/*
1001 Create/change qdisc.
1002 */
1003
/* RTM_NEWQDISC handler: create, replace, or change a qdisc.  Resolves
 * the attachment point from tcm_parent, then decides between changing
 * the existing qdisc in place and creating/grafting a new one based on
 * the NLM_F_CREATE/REPLACE/EXCL flags and TCA_KIND.  A module autoload
 * inside qdisc_create() surfaces as -EAGAIN and replays the request.
 */
static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm;
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid;
	struct Qdisc *q, *p;
	int err;

replay:
	/* Reinit, just in case something touches this. */
	tcm = NLMSG_DATA(n);
	clid = tcm->tcm_parent;
	q = p = NULL;

	if ((dev = __dev_get_by_index(net, tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;

	if (clid) {
		if (clid != TC_H_ROOT) {
			if (clid != TC_H_INGRESS) {
				if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
					return -ENOENT;
				q = qdisc_leaf(p, clid);
			} else { /*ingress */
				q = dev->rx_queue.qdisc_sleeping;
			}
		} else {
			q = dev->qdisc;
		}

		/* It may be default qdisc, ignore it */
		if (q && q->handle == 0)
			q = NULL;

		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
			if (tcm->tcm_handle) {
				if (q && !(n->nlmsg_flags&NLM_F_REPLACE))
					return -EEXIST;
				if (TC_H_MIN(tcm->tcm_handle))
					return -EINVAL;
				if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
					goto create_n_graft;
				if (n->nlmsg_flags&NLM_F_EXCL)
					return -EEXIST;
				if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
					return -EINVAL;
				/* Moving an existing qdisc must not create a cycle. */
				if (q == p ||
				    (p && check_loop(q, p, 0)))
					return -ELOOP;
				atomic_inc(&q->refcnt);
				goto graft;
			} else {
				if (q == NULL)
					goto create_n_graft;

				/* This magic test requires explanation.
				 *
				 * We know, that some child q is already
				 * attached to this parent and have choice:
				 * either to change it or to create/graft new one.
				 *
				 * 1. We are allowed to create/graft only
				 * if CREATE and REPLACE flags are set.
				 *
				 * 2. If EXCL is set, requestor wanted to say,
				 * that qdisc tcm_handle is not expected
				 * to exist, so that we choose create/graft too.
				 *
				 * 3. The last case is when no flags are set.
				 * Alas, it is sort of hole in API, we
				 * cannot decide what to do unambiguously.
				 * For now we select create/graft, if
				 * user gave KIND, which does not match existing.
				 */
				if ((n->nlmsg_flags&NLM_F_CREATE) &&
				    (n->nlmsg_flags&NLM_F_REPLACE) &&
				    ((n->nlmsg_flags&NLM_F_EXCL) ||
				     (tca[TCA_KIND] &&
				      nla_strcmp(tca[TCA_KIND], q->ops->id))))
					goto create_n_graft;
			}
		}
	} else {
		if (!tcm->tcm_handle)
			return -EINVAL;
		q = qdisc_lookup(dev, tcm->tcm_handle);
	}

	/* Change qdisc parameters */
	if (q == NULL)
		return -ENOENT;
	if (n->nlmsg_flags&NLM_F_EXCL)
		return -EEXIST;
	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
		return -EINVAL;
	err = qdisc_change(q, tca);
	if (err == 0)
		qdisc_notify(net, skb, n, clid, NULL, q);
	return err;

create_n_graft:
	if (!(n->nlmsg_flags&NLM_F_CREATE))
		return -ENOENT;
	if (clid == TC_H_INGRESS)
		q = qdisc_create(dev, &dev->rx_queue, p,
				 tcm->tcm_parent, tcm->tcm_parent,
				 tca, &err);
	else {
		struct netdev_queue *dev_queue;

		/* Let a classful parent pick the queue where possible. */
		if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
			dev_queue = p->ops->cl_ops->select_queue(p, tcm);
		else if (p)
			dev_queue = p->dev_queue;
		else
			dev_queue = netdev_get_tx_queue(dev, 0);

		q = qdisc_create(dev, dev_queue, p,
				 tcm->tcm_parent, tcm->tcm_handle,
				 tca, &err);
	}
	if (q == NULL) {
		if (err == -EAGAIN)
			goto replay;
		return err;
	}

graft:
	err = qdisc_graft(dev, p, skb, n, clid, q, NULL);
	if (err) {
		if (q)
			qdisc_destroy(q);
		return err;
	}

	return 0;
}
1147
/*
 * Fill one RTM_NEWQDISC/RTM_DELQDISC netlink message describing qdisc @q.
 *
 * @skb:   message buffer under construction
 * @q:     qdisc being dumped
 * @clid:  classid reported as tcm_parent
 * @pid:   destination netlink pid
 * @seq:   netlink sequence number to echo
 * @flags: extra nlmsg flags (e.g. NLM_F_MULTI, NLM_F_REPLACE)
 * @event: message type (RTM_NEWQDISC or RTM_DELQDISC)
 *
 * Returns skb->len on success.  On any failure (header or attribute did
 * not fit, or a dump callback failed) the partially written data is
 * trimmed back off the skb and -1 is returned, so @skb stays usable.
 */
static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
			 u32 pid, u32 seq, u16 flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	/* Remember the tail so we can trim everything we added on error. */
	unsigned char *b = skb_tail_pointer(skb);
	struct gnet_dump d;

	/* NLMSG_NEW jumps to nlmsg_failure if the header does not fit. */
	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
	tcm = NLMSG_DATA(nlh);
	tcm->tcm_family = AF_UNSPEC;
	/* Explicitly clear the pad fields: don't leak kernel stack bytes. */
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = clid;
	tcm->tcm_handle = q->handle;
	/* tcm_info carries the current refcount for qdisc dumps. */
	tcm->tcm_info = atomic_read(&q->refcnt);
	NLA_PUT_STRING(skb, TCA_KIND, q->ops->id);
	/* Qdisc-specific attributes, if the qdisc implements ->dump(). */
	if (q->ops->dump && q->ops->dump(q, skb) < 0)
		goto nla_put_failure;
	/* Refresh the cached queue length before copying the stats out. */
	q->qstats.qlen = q->q.qlen;

	/* Size table attribute, if one is attached. */
	if (q->stab && qdisc_dump_stab(skb, q->stab) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 qdisc_root_sleeping_lock(q), &d) < 0)
		goto nla_put_failure;

	if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
		goto nla_put_failure;

	if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
	    gnet_stats_copy_rate_est(&d, &q->bstats, &q->rate_est) < 0 ||
	    gnet_stats_copy_queue(&d, &q->qstats) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	/* Patch the final message length now that all attributes are in. */
	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

nlmsg_failure:
nla_put_failure:
	/* Undo everything this call appended to the skb. */
	nlmsg_trim(skb, b);
	return -1;
}
1196
7316ae88
TG
1197static int qdisc_notify(struct net *net, struct sk_buff *oskb,
1198 struct nlmsghdr *n, u32 clid,
1199 struct Qdisc *old, struct Qdisc *new)
1da177e4
LT
1200{
1201 struct sk_buff *skb;
1202 u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
1203
1204 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1205 if (!skb)
1206 return -ENOBUFS;
1207
1208 if (old && old->handle) {
1209 if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0)
1210 goto err_out;
1211 }
1212 if (new) {
1213 if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
1214 goto err_out;
1215 }
1216
1217 if (skb->len)
7316ae88 1218 return rtnetlink_send(skb, net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
1da177e4
LT
1219
1220err_out:
1221 kfree_skb(skb);
1222 return -EINVAL;
1223}
1224
30723673
DM
1225static bool tc_qdisc_dump_ignore(struct Qdisc *q)
1226{
1227 return (q->flags & TCQ_F_BUILTIN) ? true : false;
1228}
1229
/*
 * Dump @root and every qdisc linked on its ->list into @skb, honouring
 * the resume cursor for multi-part netlink dumps.
 *
 * @q_idx_p: in/out index of the qdisc currently being visited
 * @s_q_idx: index to resume from; entries below it are counted but
 *           not dumped again
 *
 * Returns 0 when iteration finished (or @root is NULL), -1 when the
 * skb filled up and the dump must be continued in a later call.
 */
static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
			      struct netlink_callback *cb,
			      int *q_idx_p, int s_q_idx)
{
	int ret = 0, q_idx = *q_idx_p;
	struct Qdisc *q;

	if (!root)
		return 0;

	/* The root itself is entry 0; it is not on its own ->list. */
	q = root;
	if (q_idx < s_q_idx) {
		q_idx++;
	} else {
		if (!tc_qdisc_dump_ignore(q) &&
		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
				  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
			goto done;
		q_idx++;
	}
	/* Then every qdisc hanging off the root's list. */
	list_for_each_entry(q, &root->list, list) {
		if (q_idx < s_q_idx) {
			q_idx++;
			continue;
		}
		if (!tc_qdisc_dump_ignore(q) &&
		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
				  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
			goto done;
		q_idx++;
	}

out:
	/* Hand the cursor back so the caller can store it in cb->args. */
	*q_idx_p = q_idx;
	return ret;
done:
	ret = -1;
	goto out;
}
1269
1da177e4
LT
/*
 * RTM_GETQDISC dump handler: walk every net device in the namespace
 * under rcu_read_lock() and dump its tx root qdisc tree and its ingress
 * (rx_queue) qdisc tree.
 *
 * Resume state for multi-part dumps lives in cb->args[0] (device index)
 * and cb->args[1] (qdisc index within that device).
 */
static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct net *net = sock_net(skb->sk);
	int idx, q_idx;
	int s_idx, s_q_idx;
	struct net_device *dev;

	/* Where the previous dump call stopped. */
	s_idx = cb->args[0];
	s_q_idx = q_idx = cb->args[1];

	rcu_read_lock();
	idx = 0;
	for_each_netdev_rcu(net, dev) {
		struct netdev_queue *dev_queue;

		if (idx < s_idx)
			goto cont;
		/* Only the device we stopped on resumes mid-list;
		 * later devices are dumped from their first qdisc. */
		if (idx > s_idx)
			s_q_idx = 0;
		q_idx = 0;

		if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx) < 0)
			goto done;

		/* Ingress side hangs off the device's rx_queue. */
		dev_queue = &dev->rx_queue;
		if (tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb, &q_idx, s_q_idx) < 0)
			goto done;

cont:
		idx++;
	}

done:
	rcu_read_unlock();

	/* Save the cursor for the next invocation. */
	cb->args[0] = idx;
	cb->args[1] = q_idx;

	return skb->len;
}
1310
1311
1312
1313/************************************************
1314 * Traffic classes manipulation. *
1315 ************************************************/
1316
1317
1318
/*
 * RTM_{NEW,DEL,GET}TCLASS handler: resolve the target qdisc and class
 * from the (parent, handle) pair in the request, then dispatch to the
 * qdisc's class operations (get/delete/change/put).
 *
 * Returns 0 on success or a negative errno.
 */
static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = NLMSG_DATA(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	struct Qdisc *q = NULL;
	const struct Qdisc_class_ops *cops;
	unsigned long cl = 0;
	unsigned long new_cl;
	u32 pid = tcm->tcm_parent;
	u32 clid = tcm->tcm_handle;
	u32 qid = TC_H_MAJ(clid);
	int err;

	if ((dev = __dev_get_by_index(net, tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;

	/*
	   parent == TC_H_UNSPEC - unspecified parent.
	   parent == TC_H_ROOT   - class is root, which has no parent.
	   parent == X:0	 - parent is root class.
	   parent == X:Y	 - parent is a node in hierarchy.
	   parent == 0:Y	 - parent is X:Y, where X:0 is qdisc.

	   handle == 0:0	 - generate handle from kernel pool.
	   handle == 0:Y	 - class is X:Y, where X:0 is qdisc.
	   handle == X:Y	 - clear.
	   handle == X:0	 - root class.
	 */

	/* Step 1. Determine qdisc handle X:0 */

	if (pid != TC_H_ROOT) {
		u32 qid1 = TC_H_MAJ(pid);

		if (qid && qid1) {
			/* If both majors are known, they must be identical. */
			if (qid != qid1)
				return -EINVAL;
		} else if (qid1) {
			qid = qid1;
		} else if (qid == 0)
			qid = dev->qdisc->handle;

		/* Now qid is genuine qdisc handle consistent
		   both with parent and child.

		   TC_H_MAJ(pid) still may be unspecified, complete it now.
		 */
		if (pid)
			pid = TC_H_MAKE(qid, pid);
	} else {
		if (qid == 0)
			qid = dev->qdisc->handle;
	}

	/* OK. Locate qdisc */
	if ((q = qdisc_lookup(dev, qid)) == NULL)
		return -ENOENT;

	/* And check that it supports classes */
	cops = q->ops->cl_ops;
	if (cops == NULL)
		return -EINVAL;

	/* Now try to get class */
	if (clid == 0) {
		if (pid == TC_H_ROOT)
			clid = qid;
	} else
		clid = TC_H_MAKE(qid, clid);

	/* ->get() takes a reference that ->put() drops in 'out' below. */
	if (clid)
		cl = cops->get(q, clid);

	if (cl == 0) {
		/* Unknown class: only RTM_NEWTCLASS+NLM_F_CREATE may
		 * fall through and create it via ->change(). */
		err = -ENOENT;
		if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags&NLM_F_CREATE))
			goto out;
	} else {
		switch (n->nlmsg_type) {
		case RTM_NEWTCLASS:
			err = -EEXIST;
			if (n->nlmsg_flags&NLM_F_EXCL)
				goto out;
			break;
		case RTM_DELTCLASS:
			err = -EOPNOTSUPP;
			if (cops->delete)
				err = cops->delete(q, cl);
			if (err == 0)
				tclass_notify(net, skb, n, q, cl, RTM_DELTCLASS);
			goto out;
		case RTM_GETTCLASS:
			err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
			goto out;
		default:
			err = -EINVAL;
			goto out;
		}
	}

	/* Create or modify the class through the qdisc's ->change(). */
	new_cl = cl;
	err = -EOPNOTSUPP;
	if (cops->change)
		err = cops->change(q, clid, pid, tca, &new_cl);
	if (err == 0)
		tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);

out:
	/* Release the reference taken by cops->get() above. */
	if (cl)
		cops->put(q, cl);

	return err;
}
1439
1440
/*
 * Fill one RTM_NEWTCLASS/RTM_DELTCLASS netlink message describing
 * class @cl of qdisc @q.
 *
 * Returns skb->len on success; on failure everything appended by this
 * call is trimmed off the skb again and -1 is returned.
 */
static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
			  unsigned long cl,
			  u32 pid, u32 seq, u16 flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	/* Tail position before we add anything, for error unwinding. */
	unsigned char *b = skb_tail_pointer(skb);
	struct gnet_dump d;
	const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;

	/* NLMSG_NEW jumps to nlmsg_failure if the header does not fit. */
	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
	tcm = NLMSG_DATA(nlh);
	tcm->tcm_family = AF_UNSPEC;
	/* Clear pad fields so no uninitialized bytes reach userspace. */
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = q->handle;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = 0;
	NLA_PUT_STRING(skb, TCA_KIND, q->ops->id);
	/* Class-specific attributes; ->dump may also rewrite tcm fields. */
	if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 qdisc_root_sleeping_lock(q), &d) < 0)
		goto nla_put_failure;

	if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	/* Fix up the final message length. */
	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

nlmsg_failure:
nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}
1482
7316ae88
TG
1483static int tclass_notify(struct net *net, struct sk_buff *oskb,
1484 struct nlmsghdr *n, struct Qdisc *q,
1485 unsigned long cl, int event)
1da177e4
LT
1486{
1487 struct sk_buff *skb;
1488 u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
1489
1490 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1491 if (!skb)
1492 return -ENOBUFS;
1493
1494 if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) {
1495 kfree_skb(skb);
1496 return -EINVAL;
1497 }
1498
7316ae88 1499 return rtnetlink_send(skb, net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
1da177e4
LT
1500}
1501
/*
 * Context passed to the class walker while dumping classes: embeds the
 * generic qdisc_walker (must stay first; qdisc_class_dump casts the
 * walker pointer back) plus the netlink dump state.
 */
struct qdisc_dump_args
{
	struct qdisc_walker w;		/* generic walk state, fn/skip/count/stop */
	struct sk_buff *skb;		/* dump message under construction */
	struct netlink_callback *cb;	/* netlink dump callback (pid, seq) */
};
1508
1509static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
1510{
1511 struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
1512
1513 return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid,
1514 a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
1515}
1516
30723673
DM
/*
 * Dump the classes of a single qdisc @q, honouring the multi-part dump
 * cursor: *t_p counts qdiscs visited so far, s_t is the qdisc index to
 * resume from (cb->args[1] holds the per-qdisc class skip count).
 *
 * Returns 0 to continue with the next qdisc, -1 when the skb filled up
 * and the walk was stopped.
 */
static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
				struct tcmsg *tcm, struct netlink_callback *cb,
				int *t_p, int s_t)
{
	struct qdisc_dump_args arg;

	/* Skip builtin qdiscs, qdiscs already dumped in a previous call,
	 * classless qdiscs, and qdiscs not matching a parent filter. */
	if (tc_qdisc_dump_ignore(q) ||
	    *t_p < s_t || !q->ops->cl_ops ||
	    (tcm->tcm_parent &&
	     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
		(*t_p)++;
		return 0;
	}
	/* A fresh qdisc (past the resume point) starts with a clean
	 * per-class cursor in cb->args[1..]. */
	if (*t_p > s_t)
		memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
	arg.w.fn = qdisc_class_dump;
	arg.skb = skb;
	arg.cb = cb;
	arg.w.stop = 0;
	arg.w.skip = cb->args[1];	/* classes already dumped last time */
	arg.w.count = 0;
	q->ops->cl_ops->walk(q, &arg.w);
	/* Remember how far we got for the next dump invocation. */
	cb->args[1] = arg.w.count;
	if (arg.w.stop)
		return -1;
	(*t_p)++;
	return 0;
}
1545
1546static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
1547 struct tcmsg *tcm, struct netlink_callback *cb,
1548 int *t_p, int s_t)
1549{
1550 struct Qdisc *q;
1551
1552 if (!root)
1553 return 0;
1554
1555 if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
1556 return -1;
1557
1558 list_for_each_entry(q, &root->list, list) {
1559 if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
1560 return -1;
1561 }
1562
1563 return 0;
1564}
1565
1da177e4
LT
/*
 * RTM_GETTCLASS dump handler: dump the classes of the tx root qdisc
 * tree and of the ingress (rx_queue) qdisc tree of one device.
 * cb->args[0] carries the resume cursor across calls.
 */
static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct tcmsg *tcm = (struct tcmsg*)NLMSG_DATA(cb->nlh);
	struct net *net = sock_net(skb->sk);
	struct netdev_queue *dev_queue;
	struct net_device *dev;
	int t, s_t;

	/* Request must be at least large enough to carry a tcmsg. */
	if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
		return 0;
	/* dev_get_by_index() takes a reference; dropped via dev_put(). */
	if ((dev = dev_get_by_index(net, tcm->tcm_ifindex)) == NULL)
		return 0;

	s_t = cb->args[0];
	t = 0;

	if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
		goto done;

	dev_queue = &dev->rx_queue;
	if (tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb, &t, s_t) < 0)
		goto done;

done:
	cb->args[0] = t;

	dev_put(dev);
	return skb->len;
}
1595
1596/* Main classifier routine: scans classifier chain attached
1597 to this qdisc, (optionally) tests for protocol and asks
1598 specific classifiers.
1599 */
73ca4918
PM
/*
 * Walk the classifier chain @tp, trying each filter whose protocol
 * matches the skb (or is ETH_P_ALL), and return the first non-negative
 * classification result.  Returns -1 if no classifier matched.
 */
int tc_classify_compat(struct sk_buff *skb, struct tcf_proto *tp,
		       struct tcf_result *res)
{
	__be16 protocol = skb->protocol;
	int err = 0;

	for (; tp; tp = tp->next) {
		/* Note: embedded assignment — a filter only "matches"
		 * when its ->classify() returns >= 0. */
		if ((tp->protocol == protocol ||
		     tp->protocol == htons(ETH_P_ALL)) &&
		    (err = tp->classify(skb, tp, res)) >= 0) {
#ifdef CONFIG_NET_CLS_ACT
			/* Clear the verdict bits unless the filter asked
			 * for reclassification. */
			if (err != TC_ACT_RECLASSIFY && skb->tc_verd)
				skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0);
#endif
			return err;
		}
	}
	return -1;
}
EXPORT_SYMBOL(tc_classify_compat);
1620
1da177e4 1621int tc_classify(struct sk_buff *skb, struct tcf_proto *tp,
73ca4918 1622 struct tcf_result *res)
1da177e4
LT
1623{
1624 int err = 0;
73ca4918 1625 __be16 protocol;
1da177e4
LT
1626#ifdef CONFIG_NET_CLS_ACT
1627 struct tcf_proto *otp = tp;
1628reclassify:
1629#endif
1630 protocol = skb->protocol;
1631
73ca4918 1632 err = tc_classify_compat(skb, tp, res);
1da177e4 1633#ifdef CONFIG_NET_CLS_ACT
73ca4918
PM
1634 if (err == TC_ACT_RECLASSIFY) {
1635 u32 verd = G_TC_VERD(skb->tc_verd);
1636 tp = otp;
1637
1638 if (verd++ >= MAX_REC_LOOP) {
1639 printk("rule prio %u protocol %02x reclassify loop, "
1640 "packet dropped\n",
1641 tp->prio&0xffff, ntohs(tp->protocol));
1642 return TC_ACT_SHOT;
1da177e4 1643 }
73ca4918
PM
1644 skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd);
1645 goto reclassify;
1da177e4 1646 }
73ca4918
PM
1647#endif
1648 return err;
1da177e4 1649}
73ca4918 1650EXPORT_SYMBOL(tc_classify);
1da177e4 1651
a48b5a61
PM
1652void tcf_destroy(struct tcf_proto *tp)
1653{
1654 tp->ops->destroy(tp);
1655 module_put(tp->ops->owner);
1656 kfree(tp);
1657}
1658
ff31ab56 1659void tcf_destroy_chain(struct tcf_proto **fl)
a48b5a61
PM
1660{
1661 struct tcf_proto *tp;
1662
ff31ab56
PM
1663 while ((tp = *fl) != NULL) {
1664 *fl = tp->next;
a48b5a61
PM
1665 tcf_destroy(tp);
1666 }
1667}
1668EXPORT_SYMBOL(tcf_destroy_chain);
1669
1da177e4
LT
#ifdef CONFIG_PROC_FS
/* Show the /proc/net/psched clock parameters: tick-to-usec factors and
 * the hrtimer resolution, all as 8-digit hex fields (legacy format). */
static int psched_show(struct seq_file *seq, void *v)
{
	struct timespec ts;

	hrtimer_get_res(CLOCK_MONOTONIC, &ts);
	seq_printf(seq, "%08x %08x %08x %08x\n",
		   (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
		   1000000,
		   (u32)NSEC_PER_SEC/(u32)ktime_to_ns(timespec_to_ktime(ts)));

	return 0;
}

static int psched_open(struct inode *inode, struct file *file)
{
	return single_open(file, psched_show, PDE(inode)->data);
}

/* seq_file single-shot interface for /proc/net/psched. */
static const struct file_operations psched_fops = {
	.owner = THIS_MODULE,
	.open = psched_open,
	.read  = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};

/* Per-namespace setup: create /proc/net/psched in this netns. */
static int __net_init psched_net_init(struct net *net)
{
	struct proc_dir_entry *e;

	e = proc_net_fops_create(net, "psched", 0, &psched_fops);
	if (e == NULL)
		return -ENOMEM;

	return 0;
}

/* Per-namespace teardown: remove /proc/net/psched again. */
static void __net_exit psched_net_exit(struct net *net)
{
	proc_net_remove(net, "psched");

	return;
}
#else
/* No procfs: the pernet hooks become no-ops. */
static int __net_init psched_net_init(struct net *net)
{
	return 0;
}

static void __net_exit psched_net_exit(struct net *net)
{
}
#endif
1724
7316ae88
TG
/* Register the /proc/net/psched lifetime with network namespaces. */
static struct pernet_operations psched_net_ops = {
	.init = psched_net_init,
	.exit = psched_net_exit,
};
1729
1da177e4
LT
/*
 * Packet scheduler subsystem init: set up the per-netns proc entry,
 * register the built-in fifo/mq qdiscs, then hook the TC qdisc and
 * class message types into rtnetlink.
 */
static int __init pktsched_init(void)
{
	int err;

	err = register_pernet_subsys(&psched_net_ops);
	if (err) {
		printk(KERN_ERR "pktsched_init: "
		       "cannot initialize per netns operations\n");
		return err;
	}

	/* Built-in qdiscs available without module loading. */
	register_qdisc(&pfifo_qdisc_ops);
	register_qdisc(&bfifo_qdisc_ops);
	register_qdisc(&pfifo_head_drop_qdisc_ops);
	register_qdisc(&mq_qdisc_ops);

	/* rtnetlink entry points for qdisc and class manipulation/dump. */
	rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL);
	rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL);
	rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc);
	rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL);
	rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL);
	rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass);

	return 0;
}
1755
1756subsys_initcall(pktsched_init);