1da177e4
LT
1/*
2 * net/sched/sch_cbq.c Class-Based Queueing discipline.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10 *
11 */
12
1da177e4 13#include <linux/module.h>
5a0e3ad6 14#include <linux/slab.h>
1da177e4
LT
15#include <linux/types.h>
16#include <linux/kernel.h>
1da177e4 17#include <linux/string.h>
1da177e4 18#include <linux/errno.h>
1da177e4 19#include <linux/skbuff.h>
0ba48053 20#include <net/netlink.h>
1da177e4 21#include <net/pkt_sched.h>
cf1facda 22#include <net/pkt_cls.h>
1da177e4
LT
23
24
25/* Class-Based Queueing (CBQ) algorithm.
26 =======================================
27
28 Sources: [1] Sally Floyd and Van Jacobson, "Link-sharing and Resource
10297b99 29 Management Models for Packet Networks",
1da177e4
LT
30 IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995
31
10297b99 32 [2] Sally Floyd, "Notes on CBQ and Guaranteed Service", 1995
1da177e4 33
10297b99 34 [3] Sally Floyd, "Notes on Class-Based Queueing: Setting
1da177e4
LT
35 Parameters", 1996
36
37 [4] Sally Floyd and Michael Speer, "Experimental Results
38 for Class-Based Queueing", 1998, not published.
39
40 -----------------------------------------------------------------------
41
42 Algorithm skeleton was taken from NS simulator cbq.cc.
43 If someone wants to check this code against the LBL version,
44 he should take into account that ONLY the skeleton was borrowed,
45 the implementation is different. Particularly:
46
47 --- The WRR algorithm is different. Our version looks more
10297b99
YH
48 reasonable (I hope) and works when quanta are allowed to be
49 less than MTU, which is always the case when real time classes
50 have small rates. Note that the statement of [3] is
51 incomplete: delay may actually be estimated even if class
52 per-round allotment is less than MTU. Namely, if per-round
53 allotment is W*r_i, and r_1+...+r_k = r < 1
1da177e4
LT
54
55 delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B
56
57 In the worst case we have IntServ estimate with D = W*r+k*MTU
58 and C = MTU*r. The proof (if correct at all) is trivial.
59
60
61 --- It seems that cbq-2.0 is not very accurate. At least, I cannot
62 interpret some places, which look like wrong translations
63 from NS. Anyone is advised to find these differences
64 and explain to me why I am wrong 8).
65
66 --- Linux has no EOI event, so we cannot estimate true class
67 idle time. The workaround is to consider the next dequeue event
68 as a sign that the previous packet has finished. This is wrong because of
69 internal device queueing, but on a permanently loaded link it is true.
70 Moreover, combined with clock integrator, this scheme looks
71 very close to an ideal solution. */
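/* Added illustration of the delay bound above (the figures are assumptions,
 * not from the original comment): take MTU = 1500 bytes, B = 10 Mbit/s,
 * k = 4 real-time classes with r_i = 0.1 each (so r = 0.4), and W = 5000
 * bytes so that the per-round allotment W*r_i = 500 bytes.  Then
 *
 *	delay_i <= ([1500/500]*5000*0.4 + 5000*0.4 + 4*1500)/B
 *	         = (6000 + 2000 + 6000) bytes / 10 Mbit/s  ~ 11.2 ms
 *
 * i.e. a delay of a few milliseconds is still bounded even though every
 * per-round allotment is well below the MTU, which is the point made above.
 */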
72
73struct cbq_sched_data;
74
75
cc7ec456 76struct cbq_class {
d77fea2e 77 struct Qdisc_class_common common;
1da177e4
LT
78 struct cbq_class *next_alive; /* next class with backlog in this priority band */
79
80/* Parameters */
1da177e4
LT
81 unsigned char priority; /* class priority */
82 unsigned char priority2; /* priority to be used after overlimit */
83 unsigned char ewma_log; /* time constant for idle time calculation */
1da177e4
LT
84
85 u32 defmap;
86
87 /* Link-sharing scheduler parameters */
88 long maxidle; /* Class parameters: see below. */
89 long offtime;
90 long minidle;
91 u32 avpkt;
92 struct qdisc_rate_table *R_tab;
93
1da177e4
LT
94 /* General scheduler (WRR) parameters */
95 long allot;
96 long quantum; /* Allotment per WRR round */
97 long weight; /* Relative allotment: see below */
98
99 struct Qdisc *qdisc; /* Ptr to CBQ discipline */
100 struct cbq_class *split; /* Ptr to split node */
101 struct cbq_class *share; /* Ptr to LS parent in the class tree */
102 struct cbq_class *tparent; /* Ptr to tree parent in the class tree */
103 struct cbq_class *borrow; /* NULL if class is bandwidth limited;
104 parent otherwise */
105 struct cbq_class *sibling; /* Sibling chain */
106 struct cbq_class *children; /* Pointer to children chain */
107
108 struct Qdisc *q; /* Elementary queueing discipline */
109
110
111/* Variables */
112 unsigned char cpriority; /* Effective priority */
113 unsigned char delayed;
114 unsigned char level; /* level of the class in hierarchy:
115 0 for leaf classes, and maximal
116 level of children + 1 for nodes.
117 */
118
119 psched_time_t last; /* Last end of service */
120 psched_time_t undertime;
121 long avgidle;
122 long deficit; /* Saved deficit for WRR */
1a13cb63 123 psched_time_t penalized;
c1a8f1f1 124 struct gnet_stats_basic_packed bstats;
1da177e4 125 struct gnet_stats_queue qstats;
1c0d32fd 126 struct net_rate_estimator __rcu *rate_est;
1da177e4
LT
127 struct tc_cbq_xstats xstats;
128
25d8c0d5 129 struct tcf_proto __rcu *filter_list;
1da177e4
LT
130
131 int refcnt;
132 int filters;
133
cc7ec456 134 struct cbq_class *defaults[TC_PRIO_MAX + 1];
1da177e4
LT
135};
136
cc7ec456 137struct cbq_sched_data {
d77fea2e 138 struct Qdisc_class_hash clhash; /* Hash table of all classes */
cc7ec456
ED
139 int nclasses[TC_CBQ_MAXPRIO + 1];
140 unsigned int quanta[TC_CBQ_MAXPRIO + 1];
1da177e4
LT
141
142 struct cbq_class link;
143
cc7ec456
ED
144 unsigned int activemask;
145 struct cbq_class *active[TC_CBQ_MAXPRIO + 1]; /* List of all classes
1da177e4
LT
146 with backlog */
147
c3bc7cff 148#ifdef CONFIG_NET_CLS_ACT
1da177e4
LT
149 struct cbq_class *rx_class;
150#endif
151 struct cbq_class *tx_class;
152 struct cbq_class *tx_borrowed;
153 int tx_len;
154 psched_time_t now; /* Cached timestamp */
cc7ec456 155 unsigned int pmask;
1da177e4 156
2fbd3da3 157 struct hrtimer delay_timer;
88a99354 158 struct qdisc_watchdog watchdog; /* Watchdog timer,
1da177e4
LT
159 started when CBQ has
160 backlog, but cannot
161 transmit just now */
88a99354 162 psched_tdiff_t wd_expires;
1da177e4
LT
163 int toplevel;
164 u32 hgenerator;
165};
166
167
cc7ec456 168#define L2T(cl, len) qdisc_l2t((cl)->R_tab, len)
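/* Clarifying note (added): L2T ("length to time") converts a packet length
 * into its transmission time in psched ticks by indexing the class's
 * pre-computed rate table; e.g. L2T(cl, 1514) is roughly how long a
 * full-size Ethernet frame occupies the link at cl's configured rate.
 */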
1da177e4 169
cc7ec456 170static inline struct cbq_class *
1da177e4
LT
171cbq_class_lookup(struct cbq_sched_data *q, u32 classid)
172{
d77fea2e 173 struct Qdisc_class_common *clc;
1da177e4 174
d77fea2e
PM
175 clc = qdisc_class_find(&q->clhash, classid);
176 if (clc == NULL)
177 return NULL;
178 return container_of(clc, struct cbq_class, common);
1da177e4
LT
179}
180
c3bc7cff 181#ifdef CONFIG_NET_CLS_ACT
1da177e4
LT
182
183static struct cbq_class *
184cbq_reclassify(struct sk_buff *skb, struct cbq_class *this)
185{
cc7ec456 186 struct cbq_class *cl;
1da177e4 187
cc7ec456
ED
188 for (cl = this->tparent; cl; cl = cl->tparent) {
189 struct cbq_class *new = cl->defaults[TC_PRIO_BESTEFFORT];
1da177e4 190
cc7ec456
ED
191 if (new != NULL && new != this)
192 return new;
193 }
1da177e4
LT
194 return NULL;
195}
196
197#endif
198
199/* Classify packet. The procedure is pretty complicated, but
cc7ec456
ED
200 * it allows us to combine link sharing and priority scheduling
201 * transparently.
202 *
203 * Namely, you can put link sharing rules (f.e. route based) at root of CBQ,
204 * so that it resolves to split nodes. Then packets are classified
205 * by logical priority, or a more specific classifier may be attached
206 * to the split node.
1da177e4
LT
207 */
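/* Added example (values are illustrative only): if this qdisc's handle is
 * 1: (sch->handle == 0x00010000) and skb->priority was set to 0x00010003
 * ("1:3"), then TC_H_MAJ(prio ^ sch->handle) is zero, so Step 1 below
 * resolves the packet straight to class 1:3 without running any filters.
 */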
208
209static struct cbq_class *
210cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
211{
212 struct cbq_sched_data *q = qdisc_priv(sch);
213 struct cbq_class *head = &q->link;
214 struct cbq_class **defmap;
215 struct cbq_class *cl = NULL;
216 u32 prio = skb->priority;
25d8c0d5 217 struct tcf_proto *fl;
1da177e4
LT
218 struct tcf_result res;
219
220 /*
221 * Step 1. If skb->priority points to one of our classes, use it.
222 */
cc7ec456 223 if (TC_H_MAJ(prio ^ sch->handle) == 0 &&
1da177e4
LT
224 (cl = cbq_class_lookup(q, prio)) != NULL)
225 return cl;
226
c27f339a 227 *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
1da177e4
LT
228 for (;;) {
229 int result = 0;
230 defmap = head->defaults;
231
25d8c0d5 232 fl = rcu_dereference_bh(head->filter_list);
1da177e4
LT
233 /*
234 * Step 2+n. Apply classifier.
235 */
3b3ae880 236 result = tc_classify(skb, fl, &res, true);
25d8c0d5 237 if (!fl || result < 0)
1da177e4
LT
238 goto fallback;
239
cc7ec456
ED
240 cl = (void *)res.class;
241 if (!cl) {
1da177e4
LT
242 if (TC_H_MAJ(res.classid))
243 cl = cbq_class_lookup(q, res.classid);
cc7ec456 244 else if ((cl = defmap[res.classid & TC_PRIO_MAX]) == NULL)
1da177e4
LT
245 cl = defmap[TC_PRIO_BESTEFFORT];
246
bdfc87f7 247 if (cl == NULL)
1da177e4
LT
248 goto fallback;
249 }
bdfc87f7
ED
250 if (cl->level >= head->level)
251 goto fallback;
1da177e4
LT
252#ifdef CONFIG_NET_CLS_ACT
253 switch (result) {
254 case TC_ACT_QUEUED:
10297b99 255 case TC_ACT_STOLEN:
378a2f09 256 *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
1da177e4
LT
257 case TC_ACT_SHOT:
258 return NULL;
73ca4918
PM
259 case TC_ACT_RECLASSIFY:
260 return cbq_reclassify(skb, cl);
1da177e4 261 }
1da177e4
LT
262#endif
263 if (cl->level == 0)
264 return cl;
265
266 /*
267 * Step 3+n. If classifier selected a link sharing class,
268 * apply agency specific classifier.
269 * Repeat this procdure until we hit a leaf node.
270 */
271 head = cl;
272 }
273
274fallback:
275 cl = head;
276
277 /*
278 * Step 4. No success...
279 */
280 if (TC_H_MAJ(prio) == 0 &&
cc7ec456 281 !(cl = head->defaults[prio & TC_PRIO_MAX]) &&
1da177e4
LT
282 !(cl = head->defaults[TC_PRIO_BESTEFFORT]))
283 return head;
284
285 return cl;
286}
287
288/*
cc7ec456
ED
289 * A packet has just been enqueued on the empty class.
290 * cbq_activate_class adds it to the tail of active class list
291 * of its priority band.
1da177e4
LT
292 */
293
cc7ec456 294static inline void cbq_activate_class(struct cbq_class *cl)
1da177e4
LT
295{
296 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
297 int prio = cl->cpriority;
298 struct cbq_class *cl_tail;
299
300 cl_tail = q->active[prio];
301 q->active[prio] = cl;
302
303 if (cl_tail != NULL) {
304 cl->next_alive = cl_tail->next_alive;
305 cl_tail->next_alive = cl;
306 } else {
307 cl->next_alive = cl;
308 q->activemask |= (1<<prio);
309 }
310}
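/* Added note: q->active[prio] always points at the TAIL of a circular list
 * threaded through next_alive, so q->active[prio]->next_alive is the head.
 * cbq_activate_class() above therefore appends the class at the tail, and
 * the dequeue path rotates the ring simply by advancing q->active[prio].
 */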
311
312/*
cc7ec456
ED
313 * Unlink class from active chain.
314 * Note that this same procedure is done directly in cbq_dequeue*
315 * during round-robin procedure.
1da177e4
LT
316 */
317
318static void cbq_deactivate_class(struct cbq_class *this)
319{
320 struct cbq_sched_data *q = qdisc_priv(this->qdisc);
321 int prio = this->cpriority;
322 struct cbq_class *cl;
323 struct cbq_class *cl_prev = q->active[prio];
324
325 do {
326 cl = cl_prev->next_alive;
327 if (cl == this) {
328 cl_prev->next_alive = cl->next_alive;
329 cl->next_alive = NULL;
330
331 if (cl == q->active[prio]) {
332 q->active[prio] = cl_prev;
333 if (cl == q->active[prio]) {
334 q->active[prio] = NULL;
335 q->activemask &= ~(1<<prio);
336 return;
337 }
338 }
1da177e4
LT
339 return;
340 }
341 } while ((cl_prev = cl) != q->active[prio]);
342}
343
344static void
345cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
346{
347 int toplevel = q->toplevel;
348
cca605dd 349 if (toplevel > cl->level) {
7201c1dd 350 psched_time_t now = psched_get_time();
1da177e4
LT
351
352 do {
104e0878 353 if (cl->undertime < now) {
1da177e4
LT
354 q->toplevel = cl->level;
355 return;
356 }
cc7ec456 357 } while ((cl = cl->borrow) != NULL && toplevel > cl->level);
1da177e4
LT
358 }
359}
360
361static int
520ac30f
ED
362cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
363 struct sk_buff **to_free)
1da177e4
LT
364{
365 struct cbq_sched_data *q = qdisc_priv(sch);
ddeee3ce 366 int uninitialized_var(ret);
1da177e4
LT
367 struct cbq_class *cl = cbq_classify(skb, sch, &ret);
368
c3bc7cff 369#ifdef CONFIG_NET_CLS_ACT
1da177e4
LT
370 q->rx_class = cl;
371#endif
372 if (cl == NULL) {
c27f339a 373 if (ret & __NET_XMIT_BYPASS)
25331d6c 374 qdisc_qstats_drop(sch);
520ac30f 375 __qdisc_drop(skb, to_free);
1da177e4
LT
376 return ret;
377 }
378
520ac30f 379 ret = qdisc_enqueue(skb, cl->q, to_free);
5f86173b 380 if (ret == NET_XMIT_SUCCESS) {
1da177e4 381 sch->q.qlen++;
1da177e4
LT
382 cbq_mark_toplevel(q, cl);
383 if (!cl->next_alive)
384 cbq_activate_class(cl);
385 return ret;
386 }
387
378a2f09 388 if (net_xmit_drop_count(ret)) {
25331d6c 389 qdisc_qstats_drop(sch);
378a2f09
JP
390 cbq_mark_toplevel(q, cl);
391 cl->qstats.drops++;
392 }
1da177e4
LT
393 return ret;
394}
395
c3498d34
FW
396/* Overlimit action: penalize leaf class by adding offtime */
397static void cbq_overlimit(struct cbq_class *cl)
1da177e4
LT
398{
399 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
8edc0c31 400 psched_tdiff_t delay = cl->undertime - q->now;
1da177e4
LT
401
402 if (!cl->delayed) {
403 delay += cl->offtime;
404
10297b99 405 /*
cc7ec456
ED
406 * Class goes to sleep, so that it will have no
407 * chance to work off its avgidle. Let's forgive it 8)
408 *
409 * BTW cbq-2.0 has a bug in this
410 * place: apparently they forgot to shift it by cl->ewma_log.
1da177e4
LT
411 */
412 if (cl->avgidle < 0)
413 delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
414 if (cl->avgidle < cl->minidle)
415 cl->avgidle = cl->minidle;
416 if (delay <= 0)
417 delay = 1;
7c59e25f 418 cl->undertime = q->now + delay;
1da177e4
LT
419
420 cl->xstats.overactions++;
421 cl->delayed = 1;
422 }
423 if (q->wd_expires == 0 || q->wd_expires > delay)
424 q->wd_expires = delay;
425
426 /* Dirty work! We must schedule wakeups based on
cc7ec456
ED
427 * real available rate, rather than leaf rate,
428 * which may be tiny (even zero).
1da177e4
LT
429 */
430 if (q->toplevel == TC_CBQ_MAXLEVEL) {
431 struct cbq_class *b;
432 psched_tdiff_t base_delay = q->wd_expires;
433
434 for (b = cl->borrow; b; b = b->borrow) {
8edc0c31 435 delay = b->undertime - q->now;
1da177e4
LT
436 if (delay < base_delay) {
437 if (delay <= 0)
438 delay = 1;
439 base_delay = delay;
440 }
441 }
442
443 q->wd_expires = base_delay;
444 }
445}
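/* Added worked example for the penalty above (figures are assumptions):
 * with avgidle = -4000, ewma_log = 5 and an offtime equivalent to 10 ms,
 * the forgiven amount is (-avgidle) - ((-avgidle) >> ewma_log)
 * = 4000 - 125 = 3875 ticks, so the class sleeps for roughly
 * (undertime - now) + 10 ms - 3875 ticks before it may send again.
 */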
446
1a13cb63
PM
447static psched_tdiff_t cbq_undelay_prio(struct cbq_sched_data *q, int prio,
448 psched_time_t now)
1da177e4
LT
449{
450 struct cbq_class *cl;
451 struct cbq_class *cl_prev = q->active[prio];
1a13cb63 452 psched_time_t sched = now;
1da177e4
LT
453
454 if (cl_prev == NULL)
e9054a33 455 return 0;
1da177e4
LT
456
457 do {
458 cl = cl_prev->next_alive;
1a13cb63 459 if (now - cl->penalized > 0) {
1da177e4
LT
460 cl_prev->next_alive = cl->next_alive;
461 cl->next_alive = NULL;
462 cl->cpriority = cl->priority;
463 cl->delayed = 0;
464 cbq_activate_class(cl);
465
466 if (cl == q->active[prio]) {
467 q->active[prio] = cl_prev;
468 if (cl == q->active[prio]) {
469 q->active[prio] = NULL;
470 return 0;
471 }
472 }
473
474 cl = cl_prev->next_alive;
1a13cb63 475 } else if (sched - cl->penalized > 0)
1da177e4
LT
476 sched = cl->penalized;
477 } while ((cl_prev = cl) != q->active[prio]);
478
1a13cb63 479 return sched - now;
1da177e4
LT
480}
481
1a13cb63 482static enum hrtimer_restart cbq_undelay(struct hrtimer *timer)
1da177e4 483{
1a13cb63 484 struct cbq_sched_data *q = container_of(timer, struct cbq_sched_data,
2fbd3da3 485 delay_timer);
1a13cb63
PM
486 struct Qdisc *sch = q->watchdog.qdisc;
487 psched_time_t now;
488 psched_tdiff_t delay = 0;
cc7ec456 489 unsigned int pmask;
1da177e4 490
3bebcda2 491 now = psched_get_time();
1a13cb63 492
1da177e4
LT
493 pmask = q->pmask;
494 q->pmask = 0;
495
496 while (pmask) {
497 int prio = ffz(~pmask);
1a13cb63 498 psched_tdiff_t tmp;
1da177e4
LT
499
500 pmask &= ~(1<<prio);
501
1a13cb63 502 tmp = cbq_undelay_prio(q, prio, now);
1da177e4
LT
503 if (tmp > 0) {
504 q->pmask |= 1<<prio;
505 if (tmp < delay || delay == 0)
506 delay = tmp;
507 }
508 }
509
510 if (delay) {
1a13cb63
PM
511 ktime_t time;
512
8b0e1953 513 time = 0;
ca44d6e6 514 time = ktime_add_ns(time, PSCHED_TICKS2NS(now + delay));
4a8e320c 515 hrtimer_start(&q->delay_timer, time, HRTIMER_MODE_ABS_PINNED);
1da177e4
LT
516 }
517
8608db03 518 __netif_schedule(qdisc_root(sch));
1a13cb63 519 return HRTIMER_NORESTART;
1da177e4
LT
520}
521
10297b99 522/*
cc7ec456
ED
523 * This is a mission-critical procedure.
524 *
525 * We "regenerate" toplevel cutoff, if transmitting class
526 * has backlog and it is not regulated. It is not part of
527 * the original CBQ description, but looks more reasonable.
528 * Probably, it is wrong. This question needs further investigation.
529 */
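/* Added note: q->toplevel is the "top-level" link-sharing cutoff of the CBQ
 * scheme referenced at the top of this file -- a class may only borrow from
 * ancestors whose level does not exceed q->toplevel (see cbq_under_limit()),
 * and TC_CBQ_MAXLEVEL effectively means no cutoff is in force.
 */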
1da177e4 530
cc7ec456 531static inline void
1da177e4
LT
532cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl,
533 struct cbq_class *borrowed)
534{
535 if (cl && q->toplevel >= borrowed->level) {
536 if (cl->q->q.qlen > 1) {
537 do {
a084980d 538 if (borrowed->undertime == PSCHED_PASTPERFECT) {
1da177e4
LT
539 q->toplevel = borrowed->level;
540 return;
541 }
cc7ec456 542 } while ((borrowed = borrowed->borrow) != NULL);
1da177e4 543 }
10297b99 544#if 0
1da177e4
LT
545 /* It is not necessary now. Uncommenting it
546 will save CPU cycles, but decrease fairness.
547 */
548 q->toplevel = TC_CBQ_MAXLEVEL;
549#endif
550 }
551}
552
553static void
554cbq_update(struct cbq_sched_data *q)
555{
556 struct cbq_class *this = q->tx_class;
557 struct cbq_class *cl = this;
558 int len = q->tx_len;
73d0f37a 559 psched_time_t now;
1da177e4
LT
560
561 q->tx_class = NULL;
73d0f37a
VA
562 /* Time integrator. We calculate EOS time
563 * by adding expected packet transmission time.
564 */
565 now = q->now + L2T(&q->link, len);
1da177e4
LT
566
567 for ( ; cl; cl = cl->share) {
568 long avgidle = cl->avgidle;
569 long idle;
570
571 cl->bstats.packets++;
572 cl->bstats.bytes += len;
573
574 /*
cc7ec456
ED
575 * (now - last) is total time between packet right edges.
576 * (last_pktlen/rate) is "virtual" busy time, so that
577 *
578 * idle = (now - last) - last_pktlen/rate
1da177e4
LT
579 */
580
73d0f37a 581 idle = now - cl->last;
1da177e4
LT
582 if ((unsigned long)idle > 128*1024*1024) {
583 avgidle = cl->maxidle;
584 } else {
585 idle -= L2T(cl, len);
586
587 /* true_avgidle := (1-W)*true_avgidle + W*idle,
cc7ec456
ED
588 * where W=2^{-ewma_log}. But cl->avgidle is scaled:
589 * cl->avgidle == true_avgidle/W,
590 * hence:
1da177e4
LT
591 */
592 avgidle += idle - (avgidle>>cl->ewma_log);
593 }
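/* Added worked example for the scaled EWMA above (numbers are
 * illustrative): with ewma_log = 5 (W = 1/32), avgidle = 3200
 * (true_avgidle = 100) and a new idle sample of 64 ticks, the update
 * computes 3200 + 64 - (3200 >> 5) = 3164, which is exactly the
 * unscaled (1-W)*100 + W*64 = 98.875 multiplied back by 32.
 */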
594
595 if (avgidle <= 0) {
596 /* Overlimit or at-limit */
597
598 if (avgidle < cl->minidle)
599 avgidle = cl->minidle;
600
601 cl->avgidle = avgidle;
602
603 /* Calculate expected time, when this class
cc7ec456
ED
604 * will be allowed to send.
605 * It will occur, when:
606 * (1-W)*true_avgidle + W*delay = 0, i.e.
607 * idle = (1/W - 1)*(-true_avgidle)
608 * or
609 * idle = (1 - W)*(-cl->avgidle);
1da177e4
LT
610 */
611 idle = (-avgidle) - ((-avgidle) >> cl->ewma_log);
612
613 /*
cc7ec456
ED
614 * That is not all.
615 * To maintain the rate allocated to the class,
616 * we add to undertime the virtual clock time
617 * needed to complete the transmitted packet.
618 * (len/phys_bandwidth has already elapsed
619 * by the time cbq_update is called.)
1da177e4
LT
620 */
621
622 idle -= L2T(&q->link, len);
623 idle += L2T(cl, len);
624
73d0f37a 625 cl->undertime = now + idle;
1da177e4
LT
626 } else {
627 /* Underlimit */
628
a084980d 629 cl->undertime = PSCHED_PASTPERFECT;
1da177e4
LT
630 if (avgidle > cl->maxidle)
631 cl->avgidle = cl->maxidle;
632 else
633 cl->avgidle = avgidle;
634 }
73d0f37a
VA
635 if ((s64)(now - cl->last) > 0)
636 cl->last = now;
1da177e4
LT
637 }
638
639 cbq_update_toplevel(q, this, q->tx_borrowed);
640}
641
cc7ec456 642static inline struct cbq_class *
1da177e4
LT
643cbq_under_limit(struct cbq_class *cl)
644{
645 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
646 struct cbq_class *this_cl = cl;
647
648 if (cl->tparent == NULL)
649 return cl;
650
a084980d 651 if (cl->undertime == PSCHED_PASTPERFECT || q->now >= cl->undertime) {
1da177e4
LT
652 cl->delayed = 0;
653 return cl;
654 }
655
656 do {
657 /* This is a very suspicious place. Now the overlimit
cc7ec456
ED
658 * action is generated for non-bounded classes
659 * only if the link is completely congested.
660 * Though this agrees with the ancestor-only paradigm,
661 * it looks very stupid. Particularly,
662 * it means that this chunk of code will either
663 * never be called or result in strong amplification
664 * of burstiness. Dangerous and silly, but
665 * no other solution exists.
1da177e4 666 */
cc7ec456
ED
667 cl = cl->borrow;
668 if (!cl) {
1da177e4 669 this_cl->qstats.overlimits++;
c3498d34 670 cbq_overlimit(this_cl);
1da177e4
LT
671 return NULL;
672 }
673 if (cl->level > q->toplevel)
674 return NULL;
a084980d 675 } while (cl->undertime != PSCHED_PASTPERFECT && q->now < cl->undertime);
1da177e4
LT
676
677 cl->delayed = 0;
678 return cl;
679}
680
cc7ec456 681static inline struct sk_buff *
1da177e4
LT
682cbq_dequeue_prio(struct Qdisc *sch, int prio)
683{
684 struct cbq_sched_data *q = qdisc_priv(sch);
685 struct cbq_class *cl_tail, *cl_prev, *cl;
686 struct sk_buff *skb;
687 int deficit;
688
689 cl_tail = cl_prev = q->active[prio];
690 cl = cl_prev->next_alive;
691
692 do {
693 deficit = 0;
694
695 /* Start round */
696 do {
697 struct cbq_class *borrow = cl;
698
699 if (cl->q->q.qlen &&
700 (borrow = cbq_under_limit(cl)) == NULL)
701 goto skip_class;
702
703 if (cl->deficit <= 0) {
704 /* Class exhausted its allotment per
cc7ec456 705 * this round. Switch to the next one.
1da177e4
LT
706 */
707 deficit = 1;
708 cl->deficit += cl->quantum;
709 goto next_class;
710 }
711
712 skb = cl->q->dequeue(cl->q);
713
714 /* Class did not give us any skb :-(
cc7ec456
ED
715 * It could occur even if cl->q->q.qlen != 0
716 * f.e. if cl->q == "tbf"
1da177e4
LT
717 */
718 if (skb == NULL)
719 goto skip_class;
720
0abf77e5 721 cl->deficit -= qdisc_pkt_len(skb);
1da177e4
LT
722 q->tx_class = cl;
723 q->tx_borrowed = borrow;
724 if (borrow != cl) {
725#ifndef CBQ_XSTATS_BORROWS_BYTES
726 borrow->xstats.borrows++;
727 cl->xstats.borrows++;
728#else
0abf77e5
JK
729 borrow->xstats.borrows += qdisc_pkt_len(skb);
730 cl->xstats.borrows += qdisc_pkt_len(skb);
1da177e4
LT
731#endif
732 }
0abf77e5 733 q->tx_len = qdisc_pkt_len(skb);
1da177e4
LT
734
735 if (cl->deficit <= 0) {
736 q->active[prio] = cl;
737 cl = cl->next_alive;
738 cl->deficit += cl->quantum;
739 }
740 return skb;
741
742skip_class:
743 if (cl->q->q.qlen == 0 || prio != cl->cpriority) {
744 /* Class is empty or penalized.
cc7ec456 745 * Unlink it from active chain.
1da177e4
LT
746 */
747 cl_prev->next_alive = cl->next_alive;
748 cl->next_alive = NULL;
749
750 /* Did cl_tail point to it? */
751 if (cl == cl_tail) {
752 /* Repair it! */
753 cl_tail = cl_prev;
754
755 /* Was it the last class in this band? */
756 if (cl == cl_tail) {
757 /* Kill the band! */
758 q->active[prio] = NULL;
759 q->activemask &= ~(1<<prio);
760 if (cl->q->q.qlen)
761 cbq_activate_class(cl);
762 return NULL;
763 }
764
765 q->active[prio] = cl_tail;
766 }
767 if (cl->q->q.qlen)
768 cbq_activate_class(cl);
769
770 cl = cl_prev;
771 }
772
773next_class:
774 cl_prev = cl;
775 cl = cl->next_alive;
776 } while (cl_prev != cl_tail);
777 } while (deficit);
778
779 q->active[prio] = cl_prev;
780
781 return NULL;
782}
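/* Added note: the loop above is a weighted round robin with a per-class
 * deficit counter (cl->deficit), DRR-like in spirit: a class keeps sending
 * while its deficit is positive, deficits are replenished by cl->quantum as
 * the ring rotates, and a class is unlinked from the ring as soon as its
 * child qdisc runs dry or the class becomes penalized.
 */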
783
cc7ec456 784static inline struct sk_buff *
1da177e4
LT
785cbq_dequeue_1(struct Qdisc *sch)
786{
787 struct cbq_sched_data *q = qdisc_priv(sch);
788 struct sk_buff *skb;
cc7ec456 789 unsigned int activemask;
1da177e4 790
cc7ec456 791 activemask = q->activemask & 0xFF;
1da177e4
LT
792 while (activemask) {
793 int prio = ffz(~activemask);
794 activemask &= ~(1<<prio);
795 skb = cbq_dequeue_prio(sch, prio);
796 if (skb)
797 return skb;
798 }
799 return NULL;
800}
801
802static struct sk_buff *
803cbq_dequeue(struct Qdisc *sch)
804{
805 struct sk_buff *skb;
806 struct cbq_sched_data *q = qdisc_priv(sch);
807 psched_time_t now;
1da177e4 808
3bebcda2 809 now = psched_get_time();
73d0f37a
VA
810
811 if (q->tx_class)
1da177e4 812 cbq_update(q);
73d0f37a
VA
813
814 q->now = now;
1da177e4
LT
815
816 for (;;) {
817 q->wd_expires = 0;
818
819 skb = cbq_dequeue_1(sch);
820 if (skb) {
9190b3b3 821 qdisc_bstats_update(sch, skb);
1da177e4 822 sch->q.qlen--;
1da177e4
LT
823 return skb;
824 }
825
826 /* All the classes are overlimit.
cc7ec456
ED
827 *
828 * It is possible, if:
829 *
830 * 1. Scheduler is empty.
831 * 2. Toplevel cutoff inhibited borrowing.
832 * 3. Root class is overlimit.
833 *
834 * Reset the 2nd and 3rd conditions and retry.
835 *
836 * Note that NS and cbq-2.0 are buggy: peeking
837 * an arbitrary class is appropriate for ancestor-only
838 * sharing, but not for toplevel algorithm.
839 *
840 * Our version is better, but slower, because it requires
841 * two passes, but it is unavoidable with top-level sharing.
842 */
1da177e4
LT
843
844 if (q->toplevel == TC_CBQ_MAXLEVEL &&
a084980d 845 q->link.undertime == PSCHED_PASTPERFECT)
1da177e4
LT
846 break;
847
848 q->toplevel = TC_CBQ_MAXLEVEL;
a084980d 849 q->link.undertime = PSCHED_PASTPERFECT;
1da177e4
LT
850 }
851
852 /* No packets in scheduler or nobody wants to give them to us :-(
cc7ec456
ED
853 * Sigh... start watchdog timer in the last case.
854 */
1da177e4
LT
855
856 if (sch->q.qlen) {
25331d6c 857 qdisc_qstats_overlimit(sch);
88a99354
PM
858 if (q->wd_expires)
859 qdisc_watchdog_schedule(&q->watchdog,
bb239acf 860 now + q->wd_expires);
1da177e4
LT
861 }
862 return NULL;
863}
864
865/* CBQ class maintenance routines */
866
867static void cbq_adjust_levels(struct cbq_class *this)
868{
869 if (this == NULL)
870 return;
871
872 do {
873 int level = 0;
874 struct cbq_class *cl;
875
cc7ec456
ED
876 cl = this->children;
877 if (cl) {
1da177e4
LT
878 do {
879 if (cl->level > level)
880 level = cl->level;
881 } while ((cl = cl->sibling) != this->children);
882 }
cc7ec456 883 this->level = level + 1;
1da177e4
LT
884 } while ((this = this->tparent) != NULL);
885}
886
887static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio)
888{
889 struct cbq_class *cl;
d77fea2e 890 unsigned int h;
1da177e4
LT
891
892 if (q->quanta[prio] == 0)
893 return;
894
d77fea2e 895 for (h = 0; h < q->clhash.hashsize; h++) {
b67bfe0d 896 hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
1da177e4 897 /* BUGGGG... Beware! This expression suffers from
cc7ec456 898 * arithmetic overflows!
1da177e4
LT
899 */
900 if (cl->priority == prio) {
901 cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/
902 q->quanta[prio];
903 }
833fa743
YY
904 if (cl->quantum <= 0 ||
905 cl->quantum > 32*qdisc_dev(cl->qdisc)->mtu) {
c17988a9
YY
906 pr_warn("CBQ: class %08x has bad quantum==%ld, repaired.\n",
907 cl->common.classid, cl->quantum);
5ce2d488 908 cl->quantum = qdisc_dev(cl->qdisc)->mtu/2 + 1;
1da177e4
LT
909 }
910 }
911 }
912}
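/* Added worked example for the quantum formula above (figures are
 * assumptions): two classes at one priority with weights 100 and 300 give
 * q->quanta[prio] = 400 and q->nclasses[prio] = 2; with allot = 1514 the
 * quanta become 100*1514*2/400 = 757 and 300*1514*2/400 = 2271 bytes per
 * WRR round, i.e. allot scaled by weight relative to the average weight.
 */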
913
914static void cbq_sync_defmap(struct cbq_class *cl)
915{
916 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
917 struct cbq_class *split = cl->split;
cc7ec456 918 unsigned int h;
1da177e4
LT
919 int i;
920
921 if (split == NULL)
922 return;
923
cc7ec456
ED
924 for (i = 0; i <= TC_PRIO_MAX; i++) {
925 if (split->defaults[i] == cl && !(cl->defmap & (1<<i)))
1da177e4
LT
926 split->defaults[i] = NULL;
927 }
928
cc7ec456 929 for (i = 0; i <= TC_PRIO_MAX; i++) {
1da177e4
LT
930 int level = split->level;
931
932 if (split->defaults[i])
933 continue;
934
d77fea2e 935 for (h = 0; h < q->clhash.hashsize; h++) {
1da177e4
LT
936 struct cbq_class *c;
937
b67bfe0d 938 hlist_for_each_entry(c, &q->clhash.hash[h],
d77fea2e 939 common.hnode) {
1da177e4 940 if (c->split == split && c->level < level &&
cc7ec456 941 c->defmap & (1<<i)) {
1da177e4
LT
942 split->defaults[i] = c;
943 level = c->level;
944 }
945 }
946 }
947 }
948}
949
950static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 mask)
951{
952 struct cbq_class *split = NULL;
953
954 if (splitid == 0) {
cc7ec456
ED
955 split = cl->split;
956 if (!split)
1da177e4 957 return;
d77fea2e 958 splitid = split->common.classid;
1da177e4
LT
959 }
960
d77fea2e 961 if (split == NULL || split->common.classid != splitid) {
1da177e4 962 for (split = cl->tparent; split; split = split->tparent)
d77fea2e 963 if (split->common.classid == splitid)
1da177e4
LT
964 break;
965 }
966
967 if (split == NULL)
968 return;
969
970 if (cl->split != split) {
971 cl->defmap = 0;
972 cbq_sync_defmap(cl);
973 cl->split = split;
cc7ec456 974 cl->defmap = def & mask;
1da177e4 975 } else
cc7ec456 976 cl->defmap = (cl->defmap & ~mask) | (def & mask);
1da177e4
LT
977
978 cbq_sync_defmap(cl);
979}
980
981static void cbq_unlink_class(struct cbq_class *this)
982{
983 struct cbq_class *cl, **clp;
984 struct cbq_sched_data *q = qdisc_priv(this->qdisc);
985
d77fea2e 986 qdisc_class_hash_remove(&q->clhash, &this->common);
1da177e4
LT
987
988 if (this->tparent) {
cc7ec456 989 clp = &this->sibling;
1da177e4
LT
990 cl = *clp;
991 do {
992 if (cl == this) {
993 *clp = cl->sibling;
994 break;
995 }
996 clp = &cl->sibling;
997 } while ((cl = *clp) != this->sibling);
998
999 if (this->tparent->children == this) {
1000 this->tparent->children = this->sibling;
1001 if (this->sibling == this)
1002 this->tparent->children = NULL;
1003 }
1004 } else {
547b792c 1005 WARN_ON(this->sibling != this);
1da177e4
LT
1006 }
1007}
1008
1009static void cbq_link_class(struct cbq_class *this)
1010{
1011 struct cbq_sched_data *q = qdisc_priv(this->qdisc);
1da177e4
LT
1012 struct cbq_class *parent = this->tparent;
1013
1014 this->sibling = this;
d77fea2e 1015 qdisc_class_hash_insert(&q->clhash, &this->common);
1da177e4
LT
1016
1017 if (parent == NULL)
1018 return;
1019
1020 if (parent->children == NULL) {
1021 parent->children = this;
1022 } else {
1023 this->sibling = parent->children->sibling;
1024 parent->children->sibling = this;
1025 }
1026}
1027
1da177e4 1028static void
cc7ec456 1029cbq_reset(struct Qdisc *sch)
1da177e4
LT
1030{
1031 struct cbq_sched_data *q = qdisc_priv(sch);
1032 struct cbq_class *cl;
1033 int prio;
cc7ec456 1034 unsigned int h;
1da177e4
LT
1035
1036 q->activemask = 0;
1037 q->pmask = 0;
1038 q->tx_class = NULL;
1039 q->tx_borrowed = NULL;
88a99354 1040 qdisc_watchdog_cancel(&q->watchdog);
2fbd3da3 1041 hrtimer_cancel(&q->delay_timer);
1da177e4 1042 q->toplevel = TC_CBQ_MAXLEVEL;
3bebcda2 1043 q->now = psched_get_time();
1da177e4
LT
1044
1045 for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++)
1046 q->active[prio] = NULL;
1047
d77fea2e 1048 for (h = 0; h < q->clhash.hashsize; h++) {
b67bfe0d 1049 hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
1da177e4
LT
1050 qdisc_reset(cl->q);
1051
1052 cl->next_alive = NULL;
a084980d 1053 cl->undertime = PSCHED_PASTPERFECT;
1da177e4
LT
1054 cl->avgidle = cl->maxidle;
1055 cl->deficit = cl->quantum;
1056 cl->cpriority = cl->priority;
1057 }
1058 }
1059 sch->q.qlen = 0;
1060}
1061
1062
1063static int cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss)
1064{
cc7ec456
ED
1065 if (lss->change & TCF_CBQ_LSS_FLAGS) {
1066 cl->share = (lss->flags & TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent;
1067 cl->borrow = (lss->flags & TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent;
1da177e4 1068 }
cc7ec456 1069 if (lss->change & TCF_CBQ_LSS_EWMA)
1da177e4 1070 cl->ewma_log = lss->ewma_log;
cc7ec456 1071 if (lss->change & TCF_CBQ_LSS_AVPKT)
1da177e4 1072 cl->avpkt = lss->avpkt;
cc7ec456 1073 if (lss->change & TCF_CBQ_LSS_MINIDLE)
1da177e4 1074 cl->minidle = -(long)lss->minidle;
cc7ec456 1075 if (lss->change & TCF_CBQ_LSS_MAXIDLE) {
1da177e4
LT
1076 cl->maxidle = lss->maxidle;
1077 cl->avgidle = lss->maxidle;
1078 }
cc7ec456 1079 if (lss->change & TCF_CBQ_LSS_OFFTIME)
1da177e4
LT
1080 cl->offtime = lss->offtime;
1081 return 0;
1082}
1083
1084static void cbq_rmprio(struct cbq_sched_data *q, struct cbq_class *cl)
1085{
1086 q->nclasses[cl->priority]--;
1087 q->quanta[cl->priority] -= cl->weight;
1088 cbq_normalize_quanta(q, cl->priority);
1089}
1090
1091static void cbq_addprio(struct cbq_sched_data *q, struct cbq_class *cl)
1092{
1093 q->nclasses[cl->priority]++;
1094 q->quanta[cl->priority] += cl->weight;
1095 cbq_normalize_quanta(q, cl->priority);
1096}
1097
1098static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr)
1099{
1100 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
1101
1102 if (wrr->allot)
1103 cl->allot = wrr->allot;
1104 if (wrr->weight)
1105 cl->weight = wrr->weight;
1106 if (wrr->priority) {
cc7ec456 1107 cl->priority = wrr->priority - 1;
1da177e4
LT
1108 cl->cpriority = cl->priority;
1109 if (cl->priority >= cl->priority2)
cc7ec456 1110 cl->priority2 = TC_CBQ_MAXPRIO - 1;
1da177e4
LT
1111 }
1112
1113 cbq_addprio(q, cl);
1114 return 0;
1115}
1116
1da177e4
LT
1117static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt)
1118{
1119 cbq_change_defmap(cl, fopt->split, fopt->defmap, fopt->defchange);
1120 return 0;
1121}
1122
27a3421e
PM
1123static const struct nla_policy cbq_policy[TCA_CBQ_MAX + 1] = {
1124 [TCA_CBQ_LSSOPT] = { .len = sizeof(struct tc_cbq_lssopt) },
1125 [TCA_CBQ_WRROPT] = { .len = sizeof(struct tc_cbq_wrropt) },
1126 [TCA_CBQ_FOPT] = { .len = sizeof(struct tc_cbq_fopt) },
1127 [TCA_CBQ_OVL_STRATEGY] = { .len = sizeof(struct tc_cbq_ovl) },
1128 [TCA_CBQ_RATE] = { .len = sizeof(struct tc_ratespec) },
1129 [TCA_CBQ_RTAB] = { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
1130 [TCA_CBQ_POLICE] = { .len = sizeof(struct tc_cbq_police) },
1131};
1132
1e90474c 1133static int cbq_init(struct Qdisc *sch, struct nlattr *opt)
1da177e4
LT
1134{
1135 struct cbq_sched_data *q = qdisc_priv(sch);
1e90474c 1136 struct nlattr *tb[TCA_CBQ_MAX + 1];
1da177e4 1137 struct tc_ratespec *r;
cee63723
PM
1138 int err;
1139
fceb6435 1140 err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy, NULL);
cee63723
PM
1141 if (err < 0)
1142 return err;
1da177e4 1143
27a3421e 1144 if (tb[TCA_CBQ_RTAB] == NULL || tb[TCA_CBQ_RATE] == NULL)
1da177e4
LT
1145 return -EINVAL;
1146
1e90474c 1147 r = nla_data(tb[TCA_CBQ_RATE]);
1da177e4 1148
1e90474c 1149 if ((q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB])) == NULL)
1da177e4
LT
1150 return -EINVAL;
1151
d77fea2e
PM
1152 err = qdisc_class_hash_init(&q->clhash);
1153 if (err < 0)
1154 goto put_rtab;
1155
1da177e4
LT
1156 q->link.refcnt = 1;
1157 q->link.sibling = &q->link;
d77fea2e 1158 q->link.common.classid = sch->handle;
1da177e4 1159 q->link.qdisc = sch;
3511c913
CG
1160 q->link.q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
1161 sch->handle);
1162 if (!q->link.q)
1da177e4 1163 q->link.q = &noop_qdisc;
49b49971
JK
1164 else
1165 qdisc_hash_add(q->link.q, true);
1da177e4 1166
cc7ec456
ED
1167 q->link.priority = TC_CBQ_MAXPRIO - 1;
1168 q->link.priority2 = TC_CBQ_MAXPRIO - 1;
1169 q->link.cpriority = TC_CBQ_MAXPRIO - 1;
5ce2d488 1170 q->link.allot = psched_mtu(qdisc_dev(sch));
1da177e4
LT
1171 q->link.quantum = q->link.allot;
1172 q->link.weight = q->link.R_tab->rate.rate;
1173
1174 q->link.ewma_log = TC_CBQ_DEF_EWMA;
1175 q->link.avpkt = q->link.allot/2;
1176 q->link.minidle = -0x7FFFFFFF;
1da177e4 1177
88a99354 1178 qdisc_watchdog_init(&q->watchdog, sch);
4a8e320c 1179 hrtimer_init(&q->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
1da177e4
LT
1180 q->delay_timer.function = cbq_undelay;
1181 q->toplevel = TC_CBQ_MAXLEVEL;
3bebcda2 1182 q->now = psched_get_time();
1da177e4
LT
1183
1184 cbq_link_class(&q->link);
1185
1e90474c
PM
1186 if (tb[TCA_CBQ_LSSOPT])
1187 cbq_set_lss(&q->link, nla_data(tb[TCA_CBQ_LSSOPT]));
1da177e4
LT
1188
1189 cbq_addprio(q, &q->link);
1190 return 0;
d77fea2e
PM
1191
1192put_rtab:
1193 qdisc_put_rtab(q->link.R_tab);
1194 return err;
1da177e4
LT
1195}
1196
cc7ec456 1197static int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl)
1da177e4 1198{
27a884dc 1199 unsigned char *b = skb_tail_pointer(skb);
1da177e4 1200
1b34ec43
DM
1201 if (nla_put(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate))
1202 goto nla_put_failure;
1da177e4
LT
1203 return skb->len;
1204
1e90474c 1205nla_put_failure:
dc5fc579 1206 nlmsg_trim(skb, b);
1da177e4
LT
1207 return -1;
1208}
1209
cc7ec456 1210static int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl)
1da177e4 1211{
27a884dc 1212 unsigned char *b = skb_tail_pointer(skb);
1da177e4
LT
1213 struct tc_cbq_lssopt opt;
1214
1215 opt.flags = 0;
1216 if (cl->borrow == NULL)
1217 opt.flags |= TCF_CBQ_LSS_BOUNDED;
1218 if (cl->share == NULL)
1219 opt.flags |= TCF_CBQ_LSS_ISOLATED;
1220 opt.ewma_log = cl->ewma_log;
1221 opt.level = cl->level;
1222 opt.avpkt = cl->avpkt;
1223 opt.maxidle = cl->maxidle;
1224 opt.minidle = (u32)(-cl->minidle);
1225 opt.offtime = cl->offtime;
1226 opt.change = ~0;
1b34ec43
DM
1227 if (nla_put(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt))
1228 goto nla_put_failure;
1da177e4
LT
1229 return skb->len;
1230
1e90474c 1231nla_put_failure:
dc5fc579 1232 nlmsg_trim(skb, b);
1da177e4
LT
1233 return -1;
1234}
1235
cc7ec456 1236static int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl)
1da177e4 1237{
27a884dc 1238 unsigned char *b = skb_tail_pointer(skb);
1da177e4
LT
1239 struct tc_cbq_wrropt opt;
1240
a0db856a 1241 memset(&opt, 0, sizeof(opt));
1da177e4
LT
1242 opt.flags = 0;
1243 opt.allot = cl->allot;
cc7ec456
ED
1244 opt.priority = cl->priority + 1;
1245 opt.cpriority = cl->cpriority + 1;
1da177e4 1246 opt.weight = cl->weight;
1b34ec43
DM
1247 if (nla_put(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt))
1248 goto nla_put_failure;
1da177e4
LT
1249 return skb->len;
1250
1e90474c 1251nla_put_failure:
dc5fc579 1252 nlmsg_trim(skb, b);
1da177e4
LT
1253 return -1;
1254}
1255
cc7ec456 1256static int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl)
1da177e4 1257{
27a884dc 1258 unsigned char *b = skb_tail_pointer(skb);
1da177e4
LT
1259 struct tc_cbq_fopt opt;
1260
1261 if (cl->split || cl->defmap) {
d77fea2e 1262 opt.split = cl->split ? cl->split->common.classid : 0;
1da177e4
LT
1263 opt.defmap = cl->defmap;
1264 opt.defchange = ~0;
1b34ec43
DM
1265 if (nla_put(skb, TCA_CBQ_FOPT, sizeof(opt), &opt))
1266 goto nla_put_failure;
1da177e4
LT
1267 }
1268 return skb->len;
1269
1e90474c 1270nla_put_failure:
dc5fc579 1271 nlmsg_trim(skb, b);
1da177e4
LT
1272 return -1;
1273}
1274
1da177e4
LT
1275static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl)
1276{
1277 if (cbq_dump_lss(skb, cl) < 0 ||
1278 cbq_dump_rate(skb, cl) < 0 ||
1279 cbq_dump_wrr(skb, cl) < 0 ||
1da177e4
LT
1280 cbq_dump_fopt(skb, cl) < 0)
1281 return -1;
1282 return 0;
1283}
1284
1285static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb)
1286{
1287 struct cbq_sched_data *q = qdisc_priv(sch);
4b3550ef 1288 struct nlattr *nest;
1da177e4 1289
4b3550ef
PM
1290 nest = nla_nest_start(skb, TCA_OPTIONS);
1291 if (nest == NULL)
1292 goto nla_put_failure;
1da177e4 1293 if (cbq_dump_attr(skb, &q->link) < 0)
1e90474c 1294 goto nla_put_failure;
d59b7d80 1295 return nla_nest_end(skb, nest);
1da177e4 1296
1e90474c 1297nla_put_failure:
4b3550ef 1298 nla_nest_cancel(skb, nest);
1da177e4
LT
1299 return -1;
1300}
1301
1302static int
1303cbq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
1304{
1305 struct cbq_sched_data *q = qdisc_priv(sch);
1306
1307 q->link.xstats.avgidle = q->link.avgidle;
1308 return gnet_stats_copy_app(d, &q->link.xstats, sizeof(q->link.xstats));
1309}
1310
1311static int
1312cbq_dump_class(struct Qdisc *sch, unsigned long arg,
1313 struct sk_buff *skb, struct tcmsg *tcm)
1314{
cc7ec456 1315 struct cbq_class *cl = (struct cbq_class *)arg;
4b3550ef 1316 struct nlattr *nest;
1da177e4
LT
1317
1318 if (cl->tparent)
d77fea2e 1319 tcm->tcm_parent = cl->tparent->common.classid;
1da177e4
LT
1320 else
1321 tcm->tcm_parent = TC_H_ROOT;
d77fea2e 1322 tcm->tcm_handle = cl->common.classid;
1da177e4
LT
1323 tcm->tcm_info = cl->q->handle;
1324
4b3550ef
PM
1325 nest = nla_nest_start(skb, TCA_OPTIONS);
1326 if (nest == NULL)
1327 goto nla_put_failure;
1da177e4 1328 if (cbq_dump_attr(skb, cl) < 0)
1e90474c 1329 goto nla_put_failure;
d59b7d80 1330 return nla_nest_end(skb, nest);
1da177e4 1331
1e90474c 1332nla_put_failure:
4b3550ef 1333 nla_nest_cancel(skb, nest);
1da177e4
LT
1334 return -1;
1335}
1336
1337static int
1338cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
1339 struct gnet_dump *d)
1340{
1341 struct cbq_sched_data *q = qdisc_priv(sch);
cc7ec456 1342 struct cbq_class *cl = (struct cbq_class *)arg;
1da177e4 1343
1da177e4
LT
1344 cl->xstats.avgidle = cl->avgidle;
1345 cl->xstats.undertime = 0;
1346
a084980d 1347 if (cl->undertime != PSCHED_PASTPERFECT)
8edc0c31 1348 cl->xstats.undertime = cl->undertime - q->now;
1da177e4 1349
edb09eb1
ED
1350 if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
1351 d, NULL, &cl->bstats) < 0 ||
1c0d32fd 1352 gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
b0ab6f92 1353 gnet_stats_copy_queue(d, NULL, &cl->qstats, cl->q->q.qlen) < 0)
1da177e4
LT
1354 return -1;
1355
1356 return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
1357}
1358
1359static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1360 struct Qdisc **old)
1361{
cc7ec456 1362 struct cbq_class *cl = (struct cbq_class *)arg;
1da177e4 1363
5b9a9ccf 1364 if (new == NULL) {
3511c913 1365 new = qdisc_create_dflt(sch->dev_queue,
5b9a9ccf
PM
1366 &pfifo_qdisc_ops, cl->common.classid);
1367 if (new == NULL)
1368 return -ENOBUFS;
1da177e4 1369 }
5b9a9ccf 1370
86a7996c 1371 *old = qdisc_replace(sch, new, &cl->q);
5b9a9ccf 1372 return 0;
1da177e4
LT
1373}
1374
cc7ec456 1375static struct Qdisc *cbq_leaf(struct Qdisc *sch, unsigned long arg)
1da177e4 1376{
cc7ec456 1377 struct cbq_class *cl = (struct cbq_class *)arg;
1da177e4 1378
5b9a9ccf 1379 return cl->q;
1da177e4
LT
1380}
1381
a37ef2e3
JP
1382static void cbq_qlen_notify(struct Qdisc *sch, unsigned long arg)
1383{
1384 struct cbq_class *cl = (struct cbq_class *)arg;
1385
1386 if (cl->q->q.qlen == 0)
1387 cbq_deactivate_class(cl);
1388}
1389
1da177e4
LT
1390static unsigned long cbq_get(struct Qdisc *sch, u32 classid)
1391{
1392 struct cbq_sched_data *q = qdisc_priv(sch);
1393 struct cbq_class *cl = cbq_class_lookup(q, classid);
1394
1395 if (cl) {
1396 cl->refcnt++;
1397 return (unsigned long)cl;
1398 }
1399 return 0;
1400}
1401
1da177e4
LT
1402static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
1403{
1404 struct cbq_sched_data *q = qdisc_priv(sch);
1405
547b792c 1406 WARN_ON(cl->filters);
1da177e4 1407
ff31ab56 1408 tcf_destroy_chain(&cl->filter_list);
1da177e4
LT
1409 qdisc_destroy(cl->q);
1410 qdisc_put_rtab(cl->R_tab);
1c0d32fd 1411 gen_kill_estimator(&cl->rate_est);
1da177e4
LT
1412 if (cl != &q->link)
1413 kfree(cl);
1414}
1415
cc7ec456 1416static void cbq_destroy(struct Qdisc *sch)
1da177e4
LT
1417{
1418 struct cbq_sched_data *q = qdisc_priv(sch);
b67bfe0d 1419 struct hlist_node *next;
1da177e4 1420 struct cbq_class *cl;
cc7ec456 1421 unsigned int h;
1da177e4 1422
c3bc7cff 1423#ifdef CONFIG_NET_CLS_ACT
1da177e4
LT
1424 q->rx_class = NULL;
1425#endif
1426 /*
1427 * Filters must be destroyed first because we don't destroy the
1428 * classes from root to leaves, which means that filters can still
1429 * be bound to classes which have been destroyed already. --TGR '04
1430 */
d77fea2e 1431 for (h = 0; h < q->clhash.hashsize; h++) {
b67bfe0d 1432 hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode)
ff31ab56 1433 tcf_destroy_chain(&cl->filter_list);
b00b4bf9 1434 }
d77fea2e 1435 for (h = 0; h < q->clhash.hashsize; h++) {
b67bfe0d 1436 hlist_for_each_entry_safe(cl, next, &q->clhash.hash[h],
d77fea2e 1437 common.hnode)
1da177e4 1438 cbq_destroy_class(sch, cl);
1da177e4 1439 }
d77fea2e 1440 qdisc_class_hash_destroy(&q->clhash);
1da177e4
LT
1441}
1442
1443static void cbq_put(struct Qdisc *sch, unsigned long arg)
1444{
cc7ec456 1445 struct cbq_class *cl = (struct cbq_class *)arg;
1da177e4
LT
1446
1447 if (--cl->refcnt == 0) {
c3bc7cff 1448#ifdef CONFIG_NET_CLS_ACT
102396ae 1449 spinlock_t *root_lock = qdisc_root_sleeping_lock(sch);
1da177e4
LT
1450 struct cbq_sched_data *q = qdisc_priv(sch);
1451
7698b4fc 1452 spin_lock_bh(root_lock);
1da177e4
LT
1453 if (q->rx_class == cl)
1454 q->rx_class = NULL;
7698b4fc 1455 spin_unlock_bh(root_lock);
1da177e4
LT
1456#endif
1457
1458 cbq_destroy_class(sch, cl);
1459 }
1460}
1461
1462static int
1e90474c 1463cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **tca,
1da177e4
LT
1464 unsigned long *arg)
1465{
1466 int err;
1467 struct cbq_sched_data *q = qdisc_priv(sch);
cc7ec456 1468 struct cbq_class *cl = (struct cbq_class *)*arg;
1e90474c
PM
1469 struct nlattr *opt = tca[TCA_OPTIONS];
1470 struct nlattr *tb[TCA_CBQ_MAX + 1];
1da177e4
LT
1471 struct cbq_class *parent;
1472 struct qdisc_rate_table *rtab = NULL;
1473
cee63723 1474 if (opt == NULL)
1da177e4
LT
1475 return -EINVAL;
1476
fceb6435 1477 err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy, NULL);
cee63723
PM
1478 if (err < 0)
1479 return err;
1480
dd47c1fa 1481 if (tb[TCA_CBQ_OVL_STRATEGY] || tb[TCA_CBQ_POLICE])
c3498d34
FW
1482 return -EOPNOTSUPP;
1483
1da177e4
LT
1484 if (cl) {
1485 /* Check parent */
1486 if (parentid) {
d77fea2e
PM
1487 if (cl->tparent &&
1488 cl->tparent->common.classid != parentid)
1da177e4
LT
1489 return -EINVAL;
1490 if (!cl->tparent && parentid != TC_H_ROOT)
1491 return -EINVAL;
1492 }
1493
1e90474c 1494 if (tb[TCA_CBQ_RATE]) {
71bcb09a
SH
1495 rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]),
1496 tb[TCA_CBQ_RTAB]);
1da177e4
LT
1497 if (rtab == NULL)
1498 return -EINVAL;
1499 }
1500
71bcb09a 1501 if (tca[TCA_RATE]) {
22e0f8b9
JF
1502 err = gen_replace_estimator(&cl->bstats, NULL,
1503 &cl->rate_est,
edb09eb1
ED
1504 NULL,
1505 qdisc_root_sleeping_running(sch),
71bcb09a
SH
1506 tca[TCA_RATE]);
1507 if (err) {
79c11f2e 1508 qdisc_put_rtab(rtab);
71bcb09a
SH
1509 return err;
1510 }
1511 }
1512
1da177e4
LT
1513 /* Change class parameters */
1514 sch_tree_lock(sch);
1515
1516 if (cl->next_alive != NULL)
1517 cbq_deactivate_class(cl);
1518
1519 if (rtab) {
b94c8afc
PM
1520 qdisc_put_rtab(cl->R_tab);
1521 cl->R_tab = rtab;
1da177e4
LT
1522 }
1523
1e90474c
PM
1524 if (tb[TCA_CBQ_LSSOPT])
1525 cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
1da177e4 1526
1e90474c 1527 if (tb[TCA_CBQ_WRROPT]) {
1da177e4 1528 cbq_rmprio(q, cl);
1e90474c 1529 cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
1da177e4
LT
1530 }
1531
1e90474c
PM
1532 if (tb[TCA_CBQ_FOPT])
1533 cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
1da177e4
LT
1534
1535 if (cl->q->q.qlen)
1536 cbq_activate_class(cl);
1537
1538 sch_tree_unlock(sch);
1539
1da177e4
LT
1540 return 0;
1541 }
1542
1543 if (parentid == TC_H_ROOT)
1544 return -EINVAL;
1545
1e90474c
PM
1546 if (tb[TCA_CBQ_WRROPT] == NULL || tb[TCA_CBQ_RATE] == NULL ||
1547 tb[TCA_CBQ_LSSOPT] == NULL)
1da177e4
LT
1548 return -EINVAL;
1549
1e90474c 1550 rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]), tb[TCA_CBQ_RTAB]);
1da177e4
LT
1551 if (rtab == NULL)
1552 return -EINVAL;
1553
1554 if (classid) {
1555 err = -EINVAL;
cc7ec456
ED
1556 if (TC_H_MAJ(classid ^ sch->handle) ||
1557 cbq_class_lookup(q, classid))
1da177e4
LT
1558 goto failure;
1559 } else {
1560 int i;
cc7ec456 1561 classid = TC_H_MAKE(sch->handle, 0x8000);
1da177e4 1562
cc7ec456 1563 for (i = 0; i < 0x8000; i++) {
1da177e4
LT
1564 if (++q->hgenerator >= 0x8000)
1565 q->hgenerator = 1;
1566 if (cbq_class_lookup(q, classid|q->hgenerator) == NULL)
1567 break;
1568 }
1569 err = -ENOSR;
1570 if (i >= 0x8000)
1571 goto failure;
1572 classid = classid|q->hgenerator;
1573 }
1574
1575 parent = &q->link;
1576 if (parentid) {
1577 parent = cbq_class_lookup(q, parentid);
1578 err = -EINVAL;
1579 if (parent == NULL)
1580 goto failure;
1581 }
1582
1583 err = -ENOBUFS;
0da974f4 1584 cl = kzalloc(sizeof(*cl), GFP_KERNEL);
1da177e4
LT
1585 if (cl == NULL)
1586 goto failure;
71bcb09a
SH
1587
1588 if (tca[TCA_RATE]) {
22e0f8b9 1589 err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est,
edb09eb1
ED
1590 NULL,
1591 qdisc_root_sleeping_running(sch),
71bcb09a
SH
1592 tca[TCA_RATE]);
1593 if (err) {
1594 kfree(cl);
1595 goto failure;
1596 }
1597 }
1598
1da177e4
LT
1599 cl->R_tab = rtab;
1600 rtab = NULL;
1601 cl->refcnt = 1;
3511c913
CG
1602 cl->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid);
1603 if (!cl->q)
1da177e4 1604 cl->q = &noop_qdisc;
49b49971
JK
1605 else
1606 qdisc_hash_add(cl->q, true);
1607
d77fea2e 1608 cl->common.classid = classid;
1da177e4
LT
1609 cl->tparent = parent;
1610 cl->qdisc = sch;
1611 cl->allot = parent->allot;
1612 cl->quantum = cl->allot;
1613 cl->weight = cl->R_tab->rate.rate;
1da177e4
LT
1614
1615 sch_tree_lock(sch);
1616 cbq_link_class(cl);
1617 cl->borrow = cl->tparent;
1618 if (cl->tparent != &q->link)
1619 cl->share = cl->tparent;
1620 cbq_adjust_levels(parent);
1621 cl->minidle = -0x7FFFFFFF;
1e90474c
PM
1622 cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
1623 cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
cc7ec456 1624 if (cl->ewma_log == 0)
1da177e4 1625 cl->ewma_log = q->link.ewma_log;
cc7ec456 1626 if (cl->maxidle == 0)
1da177e4 1627 cl->maxidle = q->link.maxidle;
cc7ec456 1628 if (cl->avpkt == 0)
1da177e4 1629 cl->avpkt = q->link.avpkt;
1e90474c
PM
1630 if (tb[TCA_CBQ_FOPT])
1631 cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
1da177e4
LT
1632 sch_tree_unlock(sch);
1633
d77fea2e
PM
1634 qdisc_class_hash_grow(sch, &q->clhash);
1635
1da177e4
LT
1636 *arg = (unsigned long)cl;
1637 return 0;
1638
1639failure:
1640 qdisc_put_rtab(rtab);
1641 return err;
1642}
1643
1644static int cbq_delete(struct Qdisc *sch, unsigned long arg)
1645{
1646 struct cbq_sched_data *q = qdisc_priv(sch);
cc7ec456 1647 struct cbq_class *cl = (struct cbq_class *)arg;
2ccccf5f 1648 unsigned int qlen, backlog;
1da177e4
LT
1649
1650 if (cl->filters || cl->children || cl == &q->link)
1651 return -EBUSY;
1652
1653 sch_tree_lock(sch);
1654
a37ef2e3 1655 qlen = cl->q->q.qlen;
2ccccf5f 1656 backlog = cl->q->qstats.backlog;
a37ef2e3 1657 qdisc_reset(cl->q);
2ccccf5f 1658 qdisc_tree_reduce_backlog(cl->q, qlen, backlog);
a37ef2e3 1659
1da177e4
LT
1660 if (cl->next_alive)
1661 cbq_deactivate_class(cl);
1662
1663 if (q->tx_borrowed == cl)
1664 q->tx_borrowed = q->tx_class;
1665 if (q->tx_class == cl) {
1666 q->tx_class = NULL;
1667 q->tx_borrowed = NULL;
1668 }
c3bc7cff 1669#ifdef CONFIG_NET_CLS_ACT
1da177e4
LT
1670 if (q->rx_class == cl)
1671 q->rx_class = NULL;
1672#endif
1673
1674 cbq_unlink_class(cl);
1675 cbq_adjust_levels(cl->tparent);
1676 cl->defmap = 0;
1677 cbq_sync_defmap(cl);
1678
1679 cbq_rmprio(q, cl);
1680 sch_tree_unlock(sch);
1681
7cd0a638
JP
1682 BUG_ON(--cl->refcnt == 0);
1683 /*
1684 * This shouldn't happen: we "hold" one cops->get() when called
1685 * from tc_ctl_tclass; the destroy method is done from cops->put().
1686 */
1da177e4
LT
1687
1688 return 0;
1689}
1690
25d8c0d5
JF
1691static struct tcf_proto __rcu **cbq_find_tcf(struct Qdisc *sch,
1692 unsigned long arg)
1da177e4
LT
1693{
1694 struct cbq_sched_data *q = qdisc_priv(sch);
1695 struct cbq_class *cl = (struct cbq_class *)arg;
1696
1697 if (cl == NULL)
1698 cl = &q->link;
1699
1700 return &cl->filter_list;
1701}
1702
1703static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent,
1704 u32 classid)
1705{
1706 struct cbq_sched_data *q = qdisc_priv(sch);
cc7ec456 1707 struct cbq_class *p = (struct cbq_class *)parent;
1da177e4
LT
1708 struct cbq_class *cl = cbq_class_lookup(q, classid);
1709
1710 if (cl) {
1711 if (p && p->level <= cl->level)
1712 return 0;
1713 cl->filters++;
1714 return (unsigned long)cl;
1715 }
1716 return 0;
1717}
1718
1719static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg)
1720{
cc7ec456 1721 struct cbq_class *cl = (struct cbq_class *)arg;
1da177e4
LT
1722
1723 cl->filters--;
1724}
1725
1726static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
1727{
1728 struct cbq_sched_data *q = qdisc_priv(sch);
d77fea2e 1729 struct cbq_class *cl;
cc7ec456 1730 unsigned int h;
1da177e4
LT
1731
1732 if (arg->stop)
1733 return;
1734
d77fea2e 1735 for (h = 0; h < q->clhash.hashsize; h++) {
b67bfe0d 1736 hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
1da177e4
LT
1737 if (arg->count < arg->skip) {
1738 arg->count++;
1739 continue;
1740 }
1741 if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
1742 arg->stop = 1;
1743 return;
1744 }
1745 arg->count++;
1746 }
1747 }
1748}
1749
20fea08b 1750static const struct Qdisc_class_ops cbq_class_ops = {
1da177e4
LT
1751 .graft = cbq_graft,
1752 .leaf = cbq_leaf,
a37ef2e3 1753 .qlen_notify = cbq_qlen_notify,
1da177e4
LT
1754 .get = cbq_get,
1755 .put = cbq_put,
1756 .change = cbq_change_class,
1757 .delete = cbq_delete,
1758 .walk = cbq_walk,
1759 .tcf_chain = cbq_find_tcf,
1760 .bind_tcf = cbq_bind_filter,
1761 .unbind_tcf = cbq_unbind_filter,
1762 .dump = cbq_dump_class,
1763 .dump_stats = cbq_dump_class_stats,
1764};
1765
20fea08b 1766static struct Qdisc_ops cbq_qdisc_ops __read_mostly = {
1da177e4
LT
1767 .next = NULL,
1768 .cl_ops = &cbq_class_ops,
1769 .id = "cbq",
1770 .priv_size = sizeof(struct cbq_sched_data),
1771 .enqueue = cbq_enqueue,
1772 .dequeue = cbq_dequeue,
77be155c 1773 .peek = qdisc_peek_dequeued,
1da177e4
LT
1774 .init = cbq_init,
1775 .reset = cbq_reset,
1776 .destroy = cbq_destroy,
1777 .change = NULL,
1778 .dump = cbq_dump,
1779 .dump_stats = cbq_dump_stats,
1780 .owner = THIS_MODULE,
1781};
1782
1783static int __init cbq_module_init(void)
1784{
1785 return register_qdisc(&cbq_qdisc_ops);
1786}
10297b99 1787static void __exit cbq_module_exit(void)
1da177e4
LT
1788{
1789 unregister_qdisc(&cbq_qdisc_ops);
1790}
1791module_init(cbq_module_init)
1792module_exit(cbq_module_exit)
1793MODULE_LICENSE("GPL");
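/*
 * Illustrative configuration (iproute2 "tc" syntax; the figures are an
 * example only, not taken from this file):
 *
 *   tc qdisc add dev eth0 root handle 1: cbq bandwidth 10Mbit \
 *           avpkt 1000 cell 8
 *   tc class add dev eth0 parent 1: classid 1:1 cbq bandwidth 10Mbit \
 *           rate 6Mbit weight 0.6Mbit prio 5 allot 1514 cell 8 \
 *           maxburst 20 avpkt 1000 bounded
 *
 * The root qdisc corresponds to q->link above; child classes created via
 * cbq_change_class() may borrow from their parent unless "bounded" is given
 * (TCF_CBQ_LSS_BOUNDED), and are excluded from link sharing with the parent
 * when "isolated" is given (TCF_CBQ_LSS_ISOLATED).
 */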