1 /*
2 * net/sched/sch_cbq.c Class-Based Queueing discipline.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10 *
11 */
12
13 #include <linux/module.h>
14 #include <linux/slab.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <linux/string.h>
18 #include <linux/errno.h>
19 #include <linux/skbuff.h>
20 #include <net/netlink.h>
21 #include <net/pkt_sched.h>
22 #include <net/pkt_cls.h>
23
24
25 /* Class-Based Queueing (CBQ) algorithm.
26 =======================================
27
28 Sources: [1] Sally Floyd and Van Jacobson, "Link-sharing and Resource
29 Management Models for Packet Networks",
30 IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995
31
32 [2] Sally Floyd, "Notes on CBQ and Guaranteed Service", 1995
33
34 [3] Sally Floyd, "Notes on Class-Based Queueing: Setting
35 Parameters", 1996
36
37 [4] Sally Floyd and Michael Speer, "Experimental Results
38 for Class-Based Queueing", 1998, not published.
39
40 -----------------------------------------------------------------------
41
42 The algorithm skeleton was taken from the NS simulator's cbq.cc.
43 If someone wants to check this code against the LBL version,
44 they should take into account that ONLY the skeleton was borrowed;
45 the implementation is different. In particular:
46
47 --- The WRR algorithm is different. Our version looks more
48 reasonable (I hope) and works when quanta are allowed to be
49 less than MTU, which is always the case when real time classes
50 have small rates. Note that the statement in [3] is
51 incomplete: delay may actually be estimated even if the class
52 per-round allotment is less than the MTU. Namely, if the per-round
53 allotment is W*r_i, and r_1+...+r_k = r < 1
54
55 delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B
56
57 In the worst case we have IntServ estimate with D = W*r+k*MTU
58 and C = MTU*r. The proof (if correct at all) is trivial.
59
60
61 --- It seems that cbq-2.0 is not very accurate. At least, I cannot
62 interpret some places, which look like wrong translations
63 from NS. Anyone is advised to find these differences
64 and explain to me why I am wrong 8).
65
66 --- Linux has no EOI event, so we cannot estimate the true class
67 idle time. The workaround is to treat the next dequeue event
68 as a sign that the previous packet has finished. This is wrong because of
69 internal device queueing, but on a permanently loaded link it holds.
70 Moreover, combined with the clock integrator, this scheme looks
71 very close to an ideal solution. */
72
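/* Illustrative sketch only; the helper and its parameter names below are
 * ours and are not used by the scheduler. It merely spells out the bound
 * quoted above,
 *
 *	delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B,
 *
 * with per_round_allot = W*r_i and round_total = W*r in bytes, k backlogged
 * classes and link bandwidth B in bytes per second; the result is in whole
 * seconds, so it is only a rough figure.
 */
static inline unsigned long
cbq_example_delay_bound(unsigned long mtu, unsigned long per_round_allot,
			unsigned long round_total, unsigned long k,
			unsigned long B)
{
	unsigned long rounds = (mtu + per_round_allot - 1) / per_round_allot;

	return (rounds * round_total + round_total + k * mtu) / B;
}
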
73 struct cbq_sched_data;
74
75
76 struct cbq_class {
77 struct Qdisc_class_common common;
78 struct cbq_class *next_alive; /* next class with backlog in this priority band */
79
80 /* Parameters */
81 unsigned char priority; /* class priority */
82 unsigned char priority2; /* priority to be used after overlimit */
83 unsigned char ewma_log; /* time constant for idle time calculation */
84
85 u32 defmap;
86
87 /* Link-sharing scheduler parameters */
88 long maxidle; /* Class parameters: see below. */
89 long offtime;
90 long minidle;
91 u32 avpkt;
92 struct qdisc_rate_table *R_tab;
93
94 /* General scheduler (WRR) parameters */
95 long allot;
96 long quantum; /* Allotment per WRR round */
97 long weight; /* Relative allotment: see below */
98
99 struct Qdisc *qdisc; /* Ptr to CBQ discipline */
100 struct cbq_class *split; /* Ptr to split node */
101 struct cbq_class *share; /* Ptr to LS parent in the class tree */
102 struct cbq_class *tparent; /* Ptr to tree parent in the class tree */
103 struct cbq_class *borrow; /* NULL if class is bandwidth limited;
104 parent otherwise */
105 struct cbq_class *sibling; /* Sibling chain */
106 struct cbq_class *children; /* Pointer to children chain */
107
108 struct Qdisc *q; /* Elementary queueing discipline */
109
110
111 /* Variables */
112 unsigned char cpriority; /* Effective priority */
113 unsigned char delayed;
114 unsigned char level; /* level of the class in hierarchy:
115 0 for leaf classes, and maximal
116 level of children + 1 for nodes.
117 */
118
119 psched_time_t last; /* Last end of service */
120 psched_time_t undertime;
121 long avgidle;
122 long deficit; /* Saved deficit for WRR */
123 psched_time_t penalized;
124 struct gnet_stats_basic_packed bstats;
125 struct gnet_stats_queue qstats;
126 struct net_rate_estimator __rcu *rate_est;
127 struct tc_cbq_xstats xstats;
128
129 struct tcf_proto __rcu *filter_list;
130
131 int refcnt;
132 int filters;
133
134 struct cbq_class *defaults[TC_PRIO_MAX + 1];
135 };
136
137 struct cbq_sched_data {
138 struct Qdisc_class_hash clhash; /* Hash table of all classes */
139 int nclasses[TC_CBQ_MAXPRIO + 1];
140 unsigned int quanta[TC_CBQ_MAXPRIO + 1];
141
142 struct cbq_class link;
143
144 unsigned int activemask;
145 struct cbq_class *active[TC_CBQ_MAXPRIO + 1]; /* List of all classes
146 with backlog */
147
148 #ifdef CONFIG_NET_CLS_ACT
149 struct cbq_class *rx_class;
150 #endif
151 struct cbq_class *tx_class;
152 struct cbq_class *tx_borrowed;
153 int tx_len;
154 psched_time_t now; /* Cached timestamp */
155 unsigned int pmask;
156
157 struct hrtimer delay_timer;
158 struct qdisc_watchdog watchdog; /* Watchdog timer,
159 started when CBQ has
160 backlog, but cannot
161 transmit just now */
162 psched_tdiff_t wd_expires;
163 int toplevel;
164 u32 hgenerator;
165 };
166
167
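/* L2T ("length to time"): look up, in the class's user-supplied rate table,
 * the time in psched ticks needed to transmit a packet of 'len' bytes at
 * the class's configured rate.
 */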
168 #define L2T(cl, len) qdisc_l2t((cl)->R_tab, len)
169
170 static inline struct cbq_class *
171 cbq_class_lookup(struct cbq_sched_data *q, u32 classid)
172 {
173 struct Qdisc_class_common *clc;
174
175 clc = qdisc_class_find(&q->clhash, classid);
176 if (clc == NULL)
177 return NULL;
178 return container_of(clc, struct cbq_class, common);
179 }
180
181 #ifdef CONFIG_NET_CLS_ACT
182
183 static struct cbq_class *
184 cbq_reclassify(struct sk_buff *skb, struct cbq_class *this)
185 {
186 struct cbq_class *cl;
187
188 for (cl = this->tparent; cl; cl = cl->tparent) {
189 struct cbq_class *new = cl->defaults[TC_PRIO_BESTEFFORT];
190
191 if (new != NULL && new != this)
192 return new;
193 }
194 return NULL;
195 }
196
197 #endif
198
199 /* Classify packet. The procedure is pretty complicated, but
200 * it allows us to combine link sharing and priority scheduling
201 * transparently.
202 *
203 * Namely, you can put link sharing rules (e.g. route based) at the root of CBQ,
204 * so that it resolves to split nodes. Then packets are classified
205 * by logical priority, or a more specific classifier may be attached
206 * to the split node.
207 */
208
209 static struct cbq_class *
210 cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
211 {
212 struct cbq_sched_data *q = qdisc_priv(sch);
213 struct cbq_class *head = &q->link;
214 struct cbq_class **defmap;
215 struct cbq_class *cl = NULL;
216 u32 prio = skb->priority;
217 struct tcf_proto *fl;
218 struct tcf_result res;
219
220 /*
221 * Step 1. If skb->priority points to one of our classes, use it.
222 */
223 if (TC_H_MAJ(prio ^ sch->handle) == 0 &&
224 (cl = cbq_class_lookup(q, prio)) != NULL)
225 return cl;
226
227 *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
228 for (;;) {
229 int result = 0;
230 defmap = head->defaults;
231
232 fl = rcu_dereference_bh(head->filter_list);
233 /*
234 * Step 2+n. Apply classifier.
235 */
236 result = tc_classify(skb, fl, &res, true);
237 if (!fl || result < 0)
238 goto fallback;
239
240 cl = (void *)res.class;
241 if (!cl) {
242 if (TC_H_MAJ(res.classid))
243 cl = cbq_class_lookup(q, res.classid);
244 else if ((cl = defmap[res.classid & TC_PRIO_MAX]) == NULL)
245 cl = defmap[TC_PRIO_BESTEFFORT];
246
247 if (cl == NULL)
248 goto fallback;
249 }
250 if (cl->level >= head->level)
251 goto fallback;
252 #ifdef CONFIG_NET_CLS_ACT
253 switch (result) {
254 case TC_ACT_QUEUED:
255 case TC_ACT_STOLEN:
256 *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
257 case TC_ACT_SHOT:
258 return NULL;
259 case TC_ACT_RECLASSIFY:
260 return cbq_reclassify(skb, cl);
261 }
262 #endif
263 if (cl->level == 0)
264 return cl;
265
266 /*
267 * Step 3+n. If the classifier selected a link sharing class,
268 * apply the agency-specific classifier.
269 * Repeat this procedure until we hit a leaf node.
270 */
271 head = cl;
272 }
273
274 fallback:
275 cl = head;
276
277 /*
278 * Step 4. No success...
279 */
280 if (TC_H_MAJ(prio) == 0 &&
281 !(cl = head->defaults[prio & TC_PRIO_MAX]) &&
282 !(cl = head->defaults[TC_PRIO_BESTEFFORT]))
283 return head;
284
285 return cl;
286 }
287
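/* A minimal sketch (the helper name is ours, nothing calls it) of the
 * step 1 test above: skb->priority is taken as a direct class reference
 * only when its major number matches the qdisc handle, e.g. priority
 * 0x00010002 selects class 1:2 on qdisc 1:0.
 */
static inline bool cbq_example_prio_is_classid(u32 prio, u32 handle)
{
	return TC_H_MAJ(prio ^ handle) == 0;
}
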
288 /*
289 * A packet has just been enqueued on an empty class.
290 * cbq_activate_class adds it to the tail of the active class list
291 * of its priority band.
292 */
293
294 static inline void cbq_activate_class(struct cbq_class *cl)
295 {
296 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
297 int prio = cl->cpriority;
298 struct cbq_class *cl_tail;
299
300 cl_tail = q->active[prio];
301 q->active[prio] = cl;
302
303 if (cl_tail != NULL) {
304 cl->next_alive = cl_tail->next_alive;
305 cl_tail->next_alive = cl;
306 } else {
307 cl->next_alive = cl;
308 q->activemask |= (1<<prio);
309 }
310 }
311
312 /*
313 * Unlink a class from the active chain.
314 * Note that the same unlinking is also done directly in cbq_dequeue*
315 * during the round-robin procedure.
316 */
317
318 static void cbq_deactivate_class(struct cbq_class *this)
319 {
320 struct cbq_sched_data *q = qdisc_priv(this->qdisc);
321 int prio = this->cpriority;
322 struct cbq_class *cl;
323 struct cbq_class *cl_prev = q->active[prio];
324
325 do {
326 cl = cl_prev->next_alive;
327 if (cl == this) {
328 cl_prev->next_alive = cl->next_alive;
329 cl->next_alive = NULL;
330
331 if (cl == q->active[prio]) {
332 q->active[prio] = cl_prev;
333 if (cl == q->active[prio]) {
334 q->active[prio] = NULL;
335 q->activemask &= ~(1<<prio);
336 return;
337 }
338 }
339 return;
340 }
341 } while ((cl_prev = cl) != q->active[prio]);
342 }
343
344 static void
345 cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
346 {
347 int toplevel = q->toplevel;
348
349 if (toplevel > cl->level) {
350 psched_time_t now = psched_get_time();
351
352 do {
353 if (cl->undertime < now) {
354 q->toplevel = cl->level;
355 return;
356 }
357 } while ((cl = cl->borrow) != NULL && toplevel > cl->level);
358 }
359 }
360
361 static int
362 cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
363 struct sk_buff **to_free)
364 {
365 struct cbq_sched_data *q = qdisc_priv(sch);
366 int uninitialized_var(ret);
367 struct cbq_class *cl = cbq_classify(skb, sch, &ret);
368
369 #ifdef CONFIG_NET_CLS_ACT
370 q->rx_class = cl;
371 #endif
372 if (cl == NULL) {
373 if (ret & __NET_XMIT_BYPASS)
374 qdisc_qstats_drop(sch);
375 __qdisc_drop(skb, to_free);
376 return ret;
377 }
378
379 ret = qdisc_enqueue(skb, cl->q, to_free);
380 if (ret == NET_XMIT_SUCCESS) {
381 sch->q.qlen++;
382 cbq_mark_toplevel(q, cl);
383 if (!cl->next_alive)
384 cbq_activate_class(cl);
385 return ret;
386 }
387
388 if (net_xmit_drop_count(ret)) {
389 qdisc_qstats_drop(sch);
390 cbq_mark_toplevel(q, cl);
391 cl->qstats.drops++;
392 }
393 return ret;
394 }
395
396 /* Overlimit action: penalize leaf class by adding offtime */
397 static void cbq_overlimit(struct cbq_class *cl)
398 {
399 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
400 psched_tdiff_t delay = cl->undertime - q->now;
401
402 if (!cl->delayed) {
403 delay += cl->offtime;
404
405 /*
406 * The class goes to sleep, so it will have no
407 * chance to work off its avgidle. Let's forgive it 8)
408 *
409 * BTW cbq-2.0 has a bug in this
410 * place: apparently they forgot to shift it by cl->ewma_log.
411 */
412 if (cl->avgidle < 0)
413 delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
414 if (cl->avgidle < cl->minidle)
415 cl->avgidle = cl->minidle;
416 if (delay <= 0)
417 delay = 1;
418 cl->undertime = q->now + delay;
419
420 cl->xstats.overactions++;
421 cl->delayed = 1;
422 }
423 if (q->wd_expires == 0 || q->wd_expires > delay)
424 q->wd_expires = delay;
425
426 /* Dirty work! We must schedule wakeups based on the
427 * real available rate, rather than the leaf rate,
428 * which may be tiny (even zero).
429 */
430 if (q->toplevel == TC_CBQ_MAXLEVEL) {
431 struct cbq_class *b;
432 psched_tdiff_t base_delay = q->wd_expires;
433
434 for (b = cl->borrow; b; b = b->borrow) {
435 delay = b->undertime - q->now;
436 if (delay < base_delay) {
437 if (delay <= 0)
438 delay = 1;
439 base_delay = delay;
440 }
441 }
442
443 q->wd_expires = base_delay;
444 }
445 }
446
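/* Illustrative sketch (nothing calls this helper): the (1 - W)*(-avgidle)
 * term, with W = 2^-ewma_log, that cbq_update() below adds to undertime
 * when a class runs a negative avgidle and that cbq_overlimit() above
 * forgives again before putting the class to sleep for cl->offtime.
 */
static inline long cbq_example_avgidle_penalty(long avgidle, unsigned char ewma_log)
{
	if (avgidle >= 0)
		return 0;
	return (-avgidle) - ((-avgidle) >> ewma_log);
}
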
447 static psched_tdiff_t cbq_undelay_prio(struct cbq_sched_data *q, int prio,
448 psched_time_t now)
449 {
450 struct cbq_class *cl;
451 struct cbq_class *cl_prev = q->active[prio];
452 psched_time_t sched = now;
453
454 if (cl_prev == NULL)
455 return 0;
456
457 do {
458 cl = cl_prev->next_alive;
459 if (now - cl->penalized > 0) {
460 cl_prev->next_alive = cl->next_alive;
461 cl->next_alive = NULL;
462 cl->cpriority = cl->priority;
463 cl->delayed = 0;
464 cbq_activate_class(cl);
465
466 if (cl == q->active[prio]) {
467 q->active[prio] = cl_prev;
468 if (cl == q->active[prio]) {
469 q->active[prio] = NULL;
470 return 0;
471 }
472 }
473
474 cl = cl_prev->next_alive;
475 } else if (sched - cl->penalized > 0)
476 sched = cl->penalized;
477 } while ((cl_prev = cl) != q->active[prio]);
478
479 return sched - now;
480 }
481
482 static enum hrtimer_restart cbq_undelay(struct hrtimer *timer)
483 {
484 struct cbq_sched_data *q = container_of(timer, struct cbq_sched_data,
485 delay_timer);
486 struct Qdisc *sch = q->watchdog.qdisc;
487 psched_time_t now;
488 psched_tdiff_t delay = 0;
489 unsigned int pmask;
490
491 now = psched_get_time();
492
493 pmask = q->pmask;
494 q->pmask = 0;
495
496 while (pmask) {
497 int prio = ffz(~pmask);
498 psched_tdiff_t tmp;
499
500 pmask &= ~(1<<prio);
501
502 tmp = cbq_undelay_prio(q, prio, now);
503 if (tmp > 0) {
504 q->pmask |= 1<<prio;
505 if (tmp < delay || delay == 0)
506 delay = tmp;
507 }
508 }
509
510 if (delay) {
511 ktime_t time;
512
513 time = 0;
514 time = ktime_add_ns(time, PSCHED_TICKS2NS(now + delay));
515 hrtimer_start(&q->delay_timer, time, HRTIMER_MODE_ABS_PINNED);
516 }
517
518 __netif_schedule(qdisc_root(sch));
519 return HRTIMER_NORESTART;
520 }
521
522 /*
523 * This is a mission-critical procedure.
524 *
525 * We "regenerate" the toplevel cutoff if the transmitting class
526 * has backlog and is not regulated. This is not part of the
527 * original CBQ description, but it looks more reasonable.
528 * It may well be wrong; the question needs further investigation.
529 */
530
531 static inline void
532 cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl,
533 struct cbq_class *borrowed)
534 {
535 if (cl && q->toplevel >= borrowed->level) {
536 if (cl->q->q.qlen > 1) {
537 do {
538 if (borrowed->undertime == PSCHED_PASTPERFECT) {
539 q->toplevel = borrowed->level;
540 return;
541 }
542 } while ((borrowed = borrowed->borrow) != NULL);
543 }
544 #if 0
545 /* It is not necessary now. Uncommenting it
546 will save CPU cycles, but decrease fairness.
547 */
548 q->toplevel = TC_CBQ_MAXLEVEL;
549 #endif
550 }
551 }
552
553 static void
554 cbq_update(struct cbq_sched_data *q)
555 {
556 struct cbq_class *this = q->tx_class;
557 struct cbq_class *cl = this;
558 int len = q->tx_len;
559 psched_time_t now;
560
561 q->tx_class = NULL;
562 /* Time integrator. We calculate the EOS (end of service) time
563 * by adding the expected packet transmission time.
564 */
565 now = q->now + L2T(&q->link, len);
566
567 for ( ; cl; cl = cl->share) {
568 long avgidle = cl->avgidle;
569 long idle;
570
571 cl->bstats.packets++;
572 cl->bstats.bytes += len;
573
574 /*
575 * (now - last) is total time between packet right edges.
576 * (last_pktlen/rate) is "virtual" busy time, so that
577 *
578 * idle = (now - last) - last_pktlen/rate
579 */
580
581 idle = now - cl->last;
582 if ((unsigned long)idle > 128*1024*1024) {
583 avgidle = cl->maxidle;
584 } else {
585 idle -= L2T(cl, len);
586
587 /* true_avgidle := (1-W)*true_avgidle + W*idle,
588 * where W=2^{-ewma_log}. But cl->avgidle is scaled:
589 * cl->avgidle == true_avgidle/W,
590 * hence:
591 */
592 avgidle += idle - (avgidle>>cl->ewma_log);
593 }
594
595 if (avgidle <= 0) {
596 /* Overlimit or at-limit */
597
598 if (avgidle < cl->minidle)
599 avgidle = cl->minidle;
600
601 cl->avgidle = avgidle;
602
603 /* Calculate the expected time when this class
604 * will be allowed to send again.
605 * That happens when:
606 * (1-W)*true_avgidle + W*idle = 0, i.e.
607 * idle = (1/W - 1)*(-true_avgidle)
608 * or
609 * idle = (1 - W)*(-cl->avgidle);
610 */
611 idle = (-avgidle) - ((-avgidle) >> cl->ewma_log);
612
613 /*
614 * That is not all.
615 * To maintain the rate allocated to the class,
616 * we also add to undertime the virtual clock time
617 * needed to complete the transmitted packet at the class rate.
618 * (len/phys_bandwidth has already passed by the
619 * time cbq_update is called.)
620 */
621
622 idle -= L2T(&q->link, len);
623 idle += L2T(cl, len);
624
625 cl->undertime = now + idle;
626 } else {
627 /* Underlimit */
628
629 cl->undertime = PSCHED_PASTPERFECT;
630 if (avgidle > cl->maxidle)
631 cl->avgidle = cl->maxidle;
632 else
633 cl->avgidle = avgidle;
634 }
635 if ((s64)(now - cl->last) > 0)
636 cl->last = now;
637 }
638
639 cbq_update_toplevel(q, this, q->tx_borrowed);
640 }
641
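/* A minimal sketch (helper name ours, not used by the code) of the scaled
 * EWMA used in cbq_update() above: with W = 2^-ewma_log the class stores
 * avgidle = true_avgidle/W, so the update
 *
 *	true_avgidle := (1-W)*true_avgidle + W*idle
 *
 * divided through by W becomes
 *
 *	avgidle := avgidle + idle - (avgidle >> ewma_log),
 *
 * which needs no multiplication or division on the dequeue fast path.
 */
static inline long cbq_example_avgidle_update(long avgidle, long idle,
					      unsigned char ewma_log)
{
	return avgidle + idle - (avgidle >> ewma_log);
}
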
642 static inline struct cbq_class *
643 cbq_under_limit(struct cbq_class *cl)
644 {
645 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
646 struct cbq_class *this_cl = cl;
647
648 if (cl->tparent == NULL)
649 return cl;
650
651 if (cl->undertime == PSCHED_PASTPERFECT || q->now >= cl->undertime) {
652 cl->delayed = 0;
653 return cl;
654 }
655
656 do {
657 /* This is a very suspicious place. Currently the overlimit
658 * action is generated for non-bounded classes
659 * only if the link is completely congested.
660 * Though this agrees with the ancestor-only paradigm,
661 * it looks very dubious. In particular,
662 * it means that this chunk of code will either
663 * never be called or result in strong amplification
664 * of burstiness. Dangerous and silly, but
665 * no other solution exists.
666 */
667 cl = cl->borrow;
668 if (!cl) {
669 this_cl->qstats.overlimits++;
670 cbq_overlimit(this_cl);
671 return NULL;
672 }
673 if (cl->level > q->toplevel)
674 return NULL;
675 } while (cl->undertime != PSCHED_PASTPERFECT && q->now < cl->undertime);
676
677 cl->delayed = 0;
678 return cl;
679 }
680
681 static inline struct sk_buff *
682 cbq_dequeue_prio(struct Qdisc *sch, int prio)
683 {
684 struct cbq_sched_data *q = qdisc_priv(sch);
685 struct cbq_class *cl_tail, *cl_prev, *cl;
686 struct sk_buff *skb;
687 int deficit;
688
689 cl_tail = cl_prev = q->active[prio];
690 cl = cl_prev->next_alive;
691
692 do {
693 deficit = 0;
694
695 /* Start round */
696 do {
697 struct cbq_class *borrow = cl;
698
699 if (cl->q->q.qlen &&
700 (borrow = cbq_under_limit(cl)) == NULL)
701 goto skip_class;
702
703 if (cl->deficit <= 0) {
704 /* Class exhausted its allotment per
705 * this round. Switch to the next one.
706 */
707 deficit = 1;
708 cl->deficit += cl->quantum;
709 goto next_class;
710 }
711
712 skb = cl->q->dequeue(cl->q);
713
714 /* The class did not give us any skb :-(
715 * This can occur even if cl->q->q.qlen != 0,
716 * e.g. if cl->q == "tbf"
717 */
718 if (skb == NULL)
719 goto skip_class;
720
721 cl->deficit -= qdisc_pkt_len(skb);
722 q->tx_class = cl;
723 q->tx_borrowed = borrow;
724 if (borrow != cl) {
725 #ifndef CBQ_XSTATS_BORROWS_BYTES
726 borrow->xstats.borrows++;
727 cl->xstats.borrows++;
728 #else
729 borrow->xstats.borrows += qdisc_pkt_len(skb);
730 cl->xstats.borrows += qdisc_pkt_len(skb);
731 #endif
732 }
733 q->tx_len = qdisc_pkt_len(skb);
734
735 if (cl->deficit <= 0) {
736 q->active[prio] = cl;
737 cl = cl->next_alive;
738 cl->deficit += cl->quantum;
739 }
740 return skb;
741
742 skip_class:
743 if (cl->q->q.qlen == 0 || prio != cl->cpriority) {
744 /* Class is empty or penalized.
745 * Unlink it from active chain.
746 */
747 cl_prev->next_alive = cl->next_alive;
748 cl->next_alive = NULL;
749
750 /* Did cl_tail point to it? */
751 if (cl == cl_tail) {
752 /* Repair it! */
753 cl_tail = cl_prev;
754
755 /* Was it the last class in this band? */
756 if (cl == cl_tail) {
757 /* Kill the band! */
758 q->active[prio] = NULL;
759 q->activemask &= ~(1<<prio);
760 if (cl->q->q.qlen)
761 cbq_activate_class(cl);
762 return NULL;
763 }
764
765 q->active[prio] = cl_tail;
766 }
767 if (cl->q->q.qlen)
768 cbq_activate_class(cl);
769
770 cl = cl_prev;
771 }
772
773 next_class:
774 cl_prev = cl;
775 cl = cl->next_alive;
776 } while (cl_prev != cl_tail);
777 } while (deficit);
778
779 q->active[prio] = cl_prev;
780
781 return NULL;
782 }
783
784 static inline struct sk_buff *
785 cbq_dequeue_1(struct Qdisc *sch)
786 {
787 struct cbq_sched_data *q = qdisc_priv(sch);
788 struct sk_buff *skb;
789 unsigned int activemask;
790
791 activemask = q->activemask & 0xFF;
792 while (activemask) {
793 int prio = ffz(~activemask);
794 activemask &= ~(1<<prio);
795 skb = cbq_dequeue_prio(sch, prio);
796 if (skb)
797 return skb;
798 }
799 return NULL;
800 }
801
802 static struct sk_buff *
803 cbq_dequeue(struct Qdisc *sch)
804 {
805 struct sk_buff *skb;
806 struct cbq_sched_data *q = qdisc_priv(sch);
807 psched_time_t now;
808
809 now = psched_get_time();
810
811 if (q->tx_class)
812 cbq_update(q);
813
814 q->now = now;
815
816 for (;;) {
817 q->wd_expires = 0;
818
819 skb = cbq_dequeue_1(sch);
820 if (skb) {
821 qdisc_bstats_update(sch, skb);
822 sch->q.qlen--;
823 return skb;
824 }
825
826 /* All the classes are overlimit.
827 *
828 * This is possible if:
829 *
830 * 1. The scheduler is empty.
831 * 2. The toplevel cutoff inhibited borrowing.
832 * 3. The root class is overlimit.
833 *
834 * Reset conditions 2 and 3 and retry.
835 *
836 * Note that NS and cbq-2.0 are buggy here: picking
837 * an arbitrary class is appropriate for ancestor-only
838 * sharing, but not for the toplevel algorithm.
839 *
840 * Our version is better but slower, because it requires
841 * two passes; that is unavoidable with top-level sharing.
842 */
843
844 if (q->toplevel == TC_CBQ_MAXLEVEL &&
845 q->link.undertime == PSCHED_PASTPERFECT)
846 break;
847
848 q->toplevel = TC_CBQ_MAXLEVEL;
849 q->link.undertime = PSCHED_PASTPERFECT;
850 }
851
852 /* No packets in the scheduler, or nobody wants to give them to us :-(
853 * Sigh... start the watchdog timer in the latter case.
854 */
855
856 if (sch->q.qlen) {
857 qdisc_qstats_overlimit(sch);
858 if (q->wd_expires)
859 qdisc_watchdog_schedule(&q->watchdog,
860 now + q->wd_expires);
861 }
862 return NULL;
863 }
864
865 /* CBQ class maintenance routines */
866
867 static void cbq_adjust_levels(struct cbq_class *this)
868 {
869 if (this == NULL)
870 return;
871
872 do {
873 int level = 0;
874 struct cbq_class *cl;
875
876 cl = this->children;
877 if (cl) {
878 do {
879 if (cl->level > level)
880 level = cl->level;
881 } while ((cl = cl->sibling) != this->children);
882 }
883 this->level = level + 1;
884 } while ((this = this->tparent) != NULL);
885 }
886
887 static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio)
888 {
889 struct cbq_class *cl;
890 unsigned int h;
891
892 if (q->quanta[prio] == 0)
893 return;
894
895 for (h = 0; h < q->clhash.hashsize; h++) {
896 hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
897 /* BUGGGG... Beware! This expression suffers from
898 * arithmetic overflow!
899 */
900 if (cl->priority == prio) {
901 cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/
902 q->quanta[prio];
903 }
904 if (cl->quantum <= 0 ||
905 cl->quantum > 32*qdisc_dev(cl->qdisc)->mtu) {
906 pr_warn("CBQ: class %08x has bad quantum==%ld, repaired.\n",
907 cl->common.classid, cl->quantum);
908 cl->quantum = qdisc_dev(cl->qdisc)->mtu/2 + 1;
909 }
910 }
911 }
912 }
913
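/* Illustrative sketch only (standard C, not wired into the code): the same
 * quantum formula with a 64-bit intermediate, showing one way around the
 * overflow the comment above warns about; a 32-bit kernel would use
 * do_div() for the 64-bit division instead of '/'.
 */
static inline long cbq_example_quantum(long weight, long allot,
				       int nclasses, unsigned int quanta_sum)
{
	unsigned long long num = (unsigned long long)weight * allot * nclasses;

	return quanta_sum ? (long)(num / quanta_sum) : 0;
}
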
914 static void cbq_sync_defmap(struct cbq_class *cl)
915 {
916 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
917 struct cbq_class *split = cl->split;
918 unsigned int h;
919 int i;
920
921 if (split == NULL)
922 return;
923
924 for (i = 0; i <= TC_PRIO_MAX; i++) {
925 if (split->defaults[i] == cl && !(cl->defmap & (1<<i)))
926 split->defaults[i] = NULL;
927 }
928
929 for (i = 0; i <= TC_PRIO_MAX; i++) {
930 int level = split->level;
931
932 if (split->defaults[i])
933 continue;
934
935 for (h = 0; h < q->clhash.hashsize; h++) {
936 struct cbq_class *c;
937
938 hlist_for_each_entry(c, &q->clhash.hash[h],
939 common.hnode) {
940 if (c->split == split && c->level < level &&
941 c->defmap & (1<<i)) {
942 split->defaults[i] = c;
943 level = c->level;
944 }
945 }
946 }
947 }
948 }
949
950 static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 mask)
951 {
952 struct cbq_class *split = NULL;
953
954 if (splitid == 0) {
955 split = cl->split;
956 if (!split)
957 return;
958 splitid = split->common.classid;
959 }
960
961 if (split == NULL || split->common.classid != splitid) {
962 for (split = cl->tparent; split; split = split->tparent)
963 if (split->common.classid == splitid)
964 break;
965 }
966
967 if (split == NULL)
968 return;
969
970 if (cl->split != split) {
971 cl->defmap = 0;
972 cbq_sync_defmap(cl);
973 cl->split = split;
974 cl->defmap = def & mask;
975 } else
976 cl->defmap = (cl->defmap & ~mask) | (def & mask);
977
978 cbq_sync_defmap(cl);
979 }
980
981 static void cbq_unlink_class(struct cbq_class *this)
982 {
983 struct cbq_class *cl, **clp;
984 struct cbq_sched_data *q = qdisc_priv(this->qdisc);
985
986 qdisc_class_hash_remove(&q->clhash, &this->common);
987
988 if (this->tparent) {
989 clp = &this->sibling;
990 cl = *clp;
991 do {
992 if (cl == this) {
993 *clp = cl->sibling;
994 break;
995 }
996 clp = &cl->sibling;
997 } while ((cl = *clp) != this->sibling);
998
999 if (this->tparent->children == this) {
1000 this->tparent->children = this->sibling;
1001 if (this->sibling == this)
1002 this->tparent->children = NULL;
1003 }
1004 } else {
1005 WARN_ON(this->sibling != this);
1006 }
1007 }
1008
1009 static void cbq_link_class(struct cbq_class *this)
1010 {
1011 struct cbq_sched_data *q = qdisc_priv(this->qdisc);
1012 struct cbq_class *parent = this->tparent;
1013
1014 this->sibling = this;
1015 qdisc_class_hash_insert(&q->clhash, &this->common);
1016
1017 if (parent == NULL)
1018 return;
1019
1020 if (parent->children == NULL) {
1021 parent->children = this;
1022 } else {
1023 this->sibling = parent->children->sibling;
1024 parent->children->sibling = this;
1025 }
1026 }
1027
1028 static void
1029 cbq_reset(struct Qdisc *sch)
1030 {
1031 struct cbq_sched_data *q = qdisc_priv(sch);
1032 struct cbq_class *cl;
1033 int prio;
1034 unsigned int h;
1035
1036 q->activemask = 0;
1037 q->pmask = 0;
1038 q->tx_class = NULL;
1039 q->tx_borrowed = NULL;
1040 qdisc_watchdog_cancel(&q->watchdog);
1041 hrtimer_cancel(&q->delay_timer);
1042 q->toplevel = TC_CBQ_MAXLEVEL;
1043 q->now = psched_get_time();
1044
1045 for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++)
1046 q->active[prio] = NULL;
1047
1048 for (h = 0; h < q->clhash.hashsize; h++) {
1049 hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
1050 qdisc_reset(cl->q);
1051
1052 cl->next_alive = NULL;
1053 cl->undertime = PSCHED_PASTPERFECT;
1054 cl->avgidle = cl->maxidle;
1055 cl->deficit = cl->quantum;
1056 cl->cpriority = cl->priority;
1057 }
1058 }
1059 sch->q.qlen = 0;
1060 }
1061
1062
1063 static int cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss)
1064 {
1065 if (lss->change & TCF_CBQ_LSS_FLAGS) {
1066 cl->share = (lss->flags & TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent;
1067 cl->borrow = (lss->flags & TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent;
1068 }
1069 if (lss->change & TCF_CBQ_LSS_EWMA)
1070 cl->ewma_log = lss->ewma_log;
1071 if (lss->change & TCF_CBQ_LSS_AVPKT)
1072 cl->avpkt = lss->avpkt;
1073 if (lss->change & TCF_CBQ_LSS_MINIDLE)
1074 cl->minidle = -(long)lss->minidle;
1075 if (lss->change & TCF_CBQ_LSS_MAXIDLE) {
1076 cl->maxidle = lss->maxidle;
1077 cl->avgidle = lss->maxidle;
1078 }
1079 if (lss->change & TCF_CBQ_LSS_OFFTIME)
1080 cl->offtime = lss->offtime;
1081 return 0;
1082 }
1083
1084 static void cbq_rmprio(struct cbq_sched_data *q, struct cbq_class *cl)
1085 {
1086 q->nclasses[cl->priority]--;
1087 q->quanta[cl->priority] -= cl->weight;
1088 cbq_normalize_quanta(q, cl->priority);
1089 }
1090
1091 static void cbq_addprio(struct cbq_sched_data *q, struct cbq_class *cl)
1092 {
1093 q->nclasses[cl->priority]++;
1094 q->quanta[cl->priority] += cl->weight;
1095 cbq_normalize_quanta(q, cl->priority);
1096 }
1097
1098 static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr)
1099 {
1100 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
1101
1102 if (wrr->allot)
1103 cl->allot = wrr->allot;
1104 if (wrr->weight)
1105 cl->weight = wrr->weight;
1106 if (wrr->priority) {
1107 cl->priority = wrr->priority - 1;
1108 cl->cpriority = cl->priority;
1109 if (cl->priority >= cl->priority2)
1110 cl->priority2 = TC_CBQ_MAXPRIO - 1;
1111 }
1112
1113 cbq_addprio(q, cl);
1114 return 0;
1115 }
1116
1117 static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt)
1118 {
1119 cbq_change_defmap(cl, fopt->split, fopt->defmap, fopt->defchange);
1120 return 0;
1121 }
1122
1123 static const struct nla_policy cbq_policy[TCA_CBQ_MAX + 1] = {
1124 [TCA_CBQ_LSSOPT] = { .len = sizeof(struct tc_cbq_lssopt) },
1125 [TCA_CBQ_WRROPT] = { .len = sizeof(struct tc_cbq_wrropt) },
1126 [TCA_CBQ_FOPT] = { .len = sizeof(struct tc_cbq_fopt) },
1127 [TCA_CBQ_OVL_STRATEGY] = { .len = sizeof(struct tc_cbq_ovl) },
1128 [TCA_CBQ_RATE] = { .len = sizeof(struct tc_ratespec) },
1129 [TCA_CBQ_RTAB] = { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
1130 [TCA_CBQ_POLICE] = { .len = sizeof(struct tc_cbq_police) },
1131 };
1132
1133 static int cbq_init(struct Qdisc *sch, struct nlattr *opt)
1134 {
1135 struct cbq_sched_data *q = qdisc_priv(sch);
1136 struct nlattr *tb[TCA_CBQ_MAX + 1];
1137 struct tc_ratespec *r;
1138 int err;
1139
1140 err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy);
1141 if (err < 0)
1142 return err;
1143
1144 if (tb[TCA_CBQ_RTAB] == NULL || tb[TCA_CBQ_RATE] == NULL)
1145 return -EINVAL;
1146
1147 r = nla_data(tb[TCA_CBQ_RATE]);
1148
1149 if ((q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB])) == NULL)
1150 return -EINVAL;
1151
1152 err = qdisc_class_hash_init(&q->clhash);
1153 if (err < 0)
1154 goto put_rtab;
1155
1156 q->link.refcnt = 1;
1157 q->link.sibling = &q->link;
1158 q->link.common.classid = sch->handle;
1159 q->link.qdisc = sch;
1160 q->link.q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
1161 sch->handle);
1162 if (!q->link.q)
1163 q->link.q = &noop_qdisc;
1164
1165 q->link.priority = TC_CBQ_MAXPRIO - 1;
1166 q->link.priority2 = TC_CBQ_MAXPRIO - 1;
1167 q->link.cpriority = TC_CBQ_MAXPRIO - 1;
1168 q->link.allot = psched_mtu(qdisc_dev(sch));
1169 q->link.quantum = q->link.allot;
1170 q->link.weight = q->link.R_tab->rate.rate;
1171
1172 q->link.ewma_log = TC_CBQ_DEF_EWMA;
1173 q->link.avpkt = q->link.allot/2;
1174 q->link.minidle = -0x7FFFFFFF;
1175
1176 qdisc_watchdog_init(&q->watchdog, sch);
1177 hrtimer_init(&q->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
1178 q->delay_timer.function = cbq_undelay;
1179 q->toplevel = TC_CBQ_MAXLEVEL;
1180 q->now = psched_get_time();
1181
1182 cbq_link_class(&q->link);
1183
1184 if (tb[TCA_CBQ_LSSOPT])
1185 cbq_set_lss(&q->link, nla_data(tb[TCA_CBQ_LSSOPT]));
1186
1187 cbq_addprio(q, &q->link);
1188 return 0;
1189
1190 put_rtab:
1191 qdisc_put_rtab(q->link.R_tab);
1192 return err;
1193 }
1194
1195 static int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl)
1196 {
1197 unsigned char *b = skb_tail_pointer(skb);
1198
1199 if (nla_put(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate))
1200 goto nla_put_failure;
1201 return skb->len;
1202
1203 nla_put_failure:
1204 nlmsg_trim(skb, b);
1205 return -1;
1206 }
1207
1208 static int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl)
1209 {
1210 unsigned char *b = skb_tail_pointer(skb);
1211 struct tc_cbq_lssopt opt;
1212
1213 opt.flags = 0;
1214 if (cl->borrow == NULL)
1215 opt.flags |= TCF_CBQ_LSS_BOUNDED;
1216 if (cl->share == NULL)
1217 opt.flags |= TCF_CBQ_LSS_ISOLATED;
1218 opt.ewma_log = cl->ewma_log;
1219 opt.level = cl->level;
1220 opt.avpkt = cl->avpkt;
1221 opt.maxidle = cl->maxidle;
1222 opt.minidle = (u32)(-cl->minidle);
1223 opt.offtime = cl->offtime;
1224 opt.change = ~0;
1225 if (nla_put(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt))
1226 goto nla_put_failure;
1227 return skb->len;
1228
1229 nla_put_failure:
1230 nlmsg_trim(skb, b);
1231 return -1;
1232 }
1233
1234 static int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl)
1235 {
1236 unsigned char *b = skb_tail_pointer(skb);
1237 struct tc_cbq_wrropt opt;
1238
1239 memset(&opt, 0, sizeof(opt));
1240 opt.flags = 0;
1241 opt.allot = cl->allot;
1242 opt.priority = cl->priority + 1;
1243 opt.cpriority = cl->cpriority + 1;
1244 opt.weight = cl->weight;
1245 if (nla_put(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt))
1246 goto nla_put_failure;
1247 return skb->len;
1248
1249 nla_put_failure:
1250 nlmsg_trim(skb, b);
1251 return -1;
1252 }
1253
1254 static int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl)
1255 {
1256 unsigned char *b = skb_tail_pointer(skb);
1257 struct tc_cbq_fopt opt;
1258
1259 if (cl->split || cl->defmap) {
1260 opt.split = cl->split ? cl->split->common.classid : 0;
1261 opt.defmap = cl->defmap;
1262 opt.defchange = ~0;
1263 if (nla_put(skb, TCA_CBQ_FOPT, sizeof(opt), &opt))
1264 goto nla_put_failure;
1265 }
1266 return skb->len;
1267
1268 nla_put_failure:
1269 nlmsg_trim(skb, b);
1270 return -1;
1271 }
1272
1273 static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl)
1274 {
1275 if (cbq_dump_lss(skb, cl) < 0 ||
1276 cbq_dump_rate(skb, cl) < 0 ||
1277 cbq_dump_wrr(skb, cl) < 0 ||
1278 cbq_dump_fopt(skb, cl) < 0)
1279 return -1;
1280 return 0;
1281 }
1282
1283 static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb)
1284 {
1285 struct cbq_sched_data *q = qdisc_priv(sch);
1286 struct nlattr *nest;
1287
1288 nest = nla_nest_start(skb, TCA_OPTIONS);
1289 if (nest == NULL)
1290 goto nla_put_failure;
1291 if (cbq_dump_attr(skb, &q->link) < 0)
1292 goto nla_put_failure;
1293 return nla_nest_end(skb, nest);
1294
1295 nla_put_failure:
1296 nla_nest_cancel(skb, nest);
1297 return -1;
1298 }
1299
1300 static int
1301 cbq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
1302 {
1303 struct cbq_sched_data *q = qdisc_priv(sch);
1304
1305 q->link.xstats.avgidle = q->link.avgidle;
1306 return gnet_stats_copy_app(d, &q->link.xstats, sizeof(q->link.xstats));
1307 }
1308
1309 static int
1310 cbq_dump_class(struct Qdisc *sch, unsigned long arg,
1311 struct sk_buff *skb, struct tcmsg *tcm)
1312 {
1313 struct cbq_class *cl = (struct cbq_class *)arg;
1314 struct nlattr *nest;
1315
1316 if (cl->tparent)
1317 tcm->tcm_parent = cl->tparent->common.classid;
1318 else
1319 tcm->tcm_parent = TC_H_ROOT;
1320 tcm->tcm_handle = cl->common.classid;
1321 tcm->tcm_info = cl->q->handle;
1322
1323 nest = nla_nest_start(skb, TCA_OPTIONS);
1324 if (nest == NULL)
1325 goto nla_put_failure;
1326 if (cbq_dump_attr(skb, cl) < 0)
1327 goto nla_put_failure;
1328 return nla_nest_end(skb, nest);
1329
1330 nla_put_failure:
1331 nla_nest_cancel(skb, nest);
1332 return -1;
1333 }
1334
1335 static int
1336 cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
1337 struct gnet_dump *d)
1338 {
1339 struct cbq_sched_data *q = qdisc_priv(sch);
1340 struct cbq_class *cl = (struct cbq_class *)arg;
1341
1342 cl->xstats.avgidle = cl->avgidle;
1343 cl->xstats.undertime = 0;
1344
1345 if (cl->undertime != PSCHED_PASTPERFECT)
1346 cl->xstats.undertime = cl->undertime - q->now;
1347
1348 if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
1349 d, NULL, &cl->bstats) < 0 ||
1350 gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
1351 gnet_stats_copy_queue(d, NULL, &cl->qstats, cl->q->q.qlen) < 0)
1352 return -1;
1353
1354 return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
1355 }
1356
1357 static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1358 struct Qdisc **old)
1359 {
1360 struct cbq_class *cl = (struct cbq_class *)arg;
1361
1362 if (new == NULL) {
1363 new = qdisc_create_dflt(sch->dev_queue,
1364 &pfifo_qdisc_ops, cl->common.classid);
1365 if (new == NULL)
1366 return -ENOBUFS;
1367 }
1368
1369 *old = qdisc_replace(sch, new, &cl->q);
1370 return 0;
1371 }
1372
1373 static struct Qdisc *cbq_leaf(struct Qdisc *sch, unsigned long arg)
1374 {
1375 struct cbq_class *cl = (struct cbq_class *)arg;
1376
1377 return cl->q;
1378 }
1379
1380 static void cbq_qlen_notify(struct Qdisc *sch, unsigned long arg)
1381 {
1382 struct cbq_class *cl = (struct cbq_class *)arg;
1383
1384 if (cl->q->q.qlen == 0)
1385 cbq_deactivate_class(cl);
1386 }
1387
1388 static unsigned long cbq_get(struct Qdisc *sch, u32 classid)
1389 {
1390 struct cbq_sched_data *q = qdisc_priv(sch);
1391 struct cbq_class *cl = cbq_class_lookup(q, classid);
1392
1393 if (cl) {
1394 cl->refcnt++;
1395 return (unsigned long)cl;
1396 }
1397 return 0;
1398 }
1399
1400 static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
1401 {
1402 struct cbq_sched_data *q = qdisc_priv(sch);
1403
1404 WARN_ON(cl->filters);
1405
1406 tcf_destroy_chain(&cl->filter_list);
1407 qdisc_destroy(cl->q);
1408 qdisc_put_rtab(cl->R_tab);
1409 gen_kill_estimator(&cl->rate_est);
1410 if (cl != &q->link)
1411 kfree(cl);
1412 }
1413
1414 static void cbq_destroy(struct Qdisc *sch)
1415 {
1416 struct cbq_sched_data *q = qdisc_priv(sch);
1417 struct hlist_node *next;
1418 struct cbq_class *cl;
1419 unsigned int h;
1420
1421 #ifdef CONFIG_NET_CLS_ACT
1422 q->rx_class = NULL;
1423 #endif
1424 /*
1425 * Filters must be destroyed first because we don't destroy the
1426 * classes from root to leaves, which means that filters can still
1427 * be bound to classes that have already been destroyed. --TGR '04
1428 */
1429 for (h = 0; h < q->clhash.hashsize; h++) {
1430 hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode)
1431 tcf_destroy_chain(&cl->filter_list);
1432 }
1433 for (h = 0; h < q->clhash.hashsize; h++) {
1434 hlist_for_each_entry_safe(cl, next, &q->clhash.hash[h],
1435 common.hnode)
1436 cbq_destroy_class(sch, cl);
1437 }
1438 qdisc_class_hash_destroy(&q->clhash);
1439 }
1440
1441 static void cbq_put(struct Qdisc *sch, unsigned long arg)
1442 {
1443 struct cbq_class *cl = (struct cbq_class *)arg;
1444
1445 if (--cl->refcnt == 0) {
1446 #ifdef CONFIG_NET_CLS_ACT
1447 spinlock_t *root_lock = qdisc_root_sleeping_lock(sch);
1448 struct cbq_sched_data *q = qdisc_priv(sch);
1449
1450 spin_lock_bh(root_lock);
1451 if (q->rx_class == cl)
1452 q->rx_class = NULL;
1453 spin_unlock_bh(root_lock);
1454 #endif
1455
1456 cbq_destroy_class(sch, cl);
1457 }
1458 }
1459
1460 static int
1461 cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **tca,
1462 unsigned long *arg)
1463 {
1464 int err;
1465 struct cbq_sched_data *q = qdisc_priv(sch);
1466 struct cbq_class *cl = (struct cbq_class *)*arg;
1467 struct nlattr *opt = tca[TCA_OPTIONS];
1468 struct nlattr *tb[TCA_CBQ_MAX + 1];
1469 struct cbq_class *parent;
1470 struct qdisc_rate_table *rtab = NULL;
1471
1472 if (opt == NULL)
1473 return -EINVAL;
1474
1475 err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy);
1476 if (err < 0)
1477 return err;
1478
1479 if (tb[TCA_CBQ_OVL_STRATEGY] || tb[TCA_CBQ_POLICE])
1480 return -EOPNOTSUPP;
1481
1482 if (cl) {
1483 /* Check parent */
1484 if (parentid) {
1485 if (cl->tparent &&
1486 cl->tparent->common.classid != parentid)
1487 return -EINVAL;
1488 if (!cl->tparent && parentid != TC_H_ROOT)
1489 return -EINVAL;
1490 }
1491
1492 if (tb[TCA_CBQ_RATE]) {
1493 rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]),
1494 tb[TCA_CBQ_RTAB]);
1495 if (rtab == NULL)
1496 return -EINVAL;
1497 }
1498
1499 if (tca[TCA_RATE]) {
1500 err = gen_replace_estimator(&cl->bstats, NULL,
1501 &cl->rate_est,
1502 NULL,
1503 qdisc_root_sleeping_running(sch),
1504 tca[TCA_RATE]);
1505 if (err) {
1506 qdisc_put_rtab(rtab);
1507 return err;
1508 }
1509 }
1510
1511 /* Change class parameters */
1512 sch_tree_lock(sch);
1513
1514 if (cl->next_alive != NULL)
1515 cbq_deactivate_class(cl);
1516
1517 if (rtab) {
1518 qdisc_put_rtab(cl->R_tab);
1519 cl->R_tab = rtab;
1520 }
1521
1522 if (tb[TCA_CBQ_LSSOPT])
1523 cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
1524
1525 if (tb[TCA_CBQ_WRROPT]) {
1526 cbq_rmprio(q, cl);
1527 cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
1528 }
1529
1530 if (tb[TCA_CBQ_FOPT])
1531 cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
1532
1533 if (cl->q->q.qlen)
1534 cbq_activate_class(cl);
1535
1536 sch_tree_unlock(sch);
1537
1538 return 0;
1539 }
1540
1541 if (parentid == TC_H_ROOT)
1542 return -EINVAL;
1543
1544 if (tb[TCA_CBQ_WRROPT] == NULL || tb[TCA_CBQ_RATE] == NULL ||
1545 tb[TCA_CBQ_LSSOPT] == NULL)
1546 return -EINVAL;
1547
1548 rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]), tb[TCA_CBQ_RTAB]);
1549 if (rtab == NULL)
1550 return -EINVAL;
1551
1552 if (classid) {
1553 err = -EINVAL;
1554 if (TC_H_MAJ(classid ^ sch->handle) ||
1555 cbq_class_lookup(q, classid))
1556 goto failure;
1557 } else {
1558 int i;
1559 classid = TC_H_MAKE(sch->handle, 0x8000);
1560
1561 for (i = 0; i < 0x8000; i++) {
1562 if (++q->hgenerator >= 0x8000)
1563 q->hgenerator = 1;
1564 if (cbq_class_lookup(q, classid|q->hgenerator) == NULL)
1565 break;
1566 }
1567 err = -ENOSR;
1568 if (i >= 0x8000)
1569 goto failure;
1570 classid = classid|q->hgenerator;
1571 }
1572
1573 parent = &q->link;
1574 if (parentid) {
1575 parent = cbq_class_lookup(q, parentid);
1576 err = -EINVAL;
1577 if (parent == NULL)
1578 goto failure;
1579 }
1580
1581 err = -ENOBUFS;
1582 cl = kzalloc(sizeof(*cl), GFP_KERNEL);
1583 if (cl == NULL)
1584 goto failure;
1585
1586 if (tca[TCA_RATE]) {
1587 err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est,
1588 NULL,
1589 qdisc_root_sleeping_running(sch),
1590 tca[TCA_RATE]);
1591 if (err) {
1592 kfree(cl);
1593 goto failure;
1594 }
1595 }
1596
1597 cl->R_tab = rtab;
1598 rtab = NULL;
1599 cl->refcnt = 1;
1600 cl->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid);
1601 if (!cl->q)
1602 cl->q = &noop_qdisc;
1603 cl->common.classid = classid;
1604 cl->tparent = parent;
1605 cl->qdisc = sch;
1606 cl->allot = parent->allot;
1607 cl->quantum = cl->allot;
1608 cl->weight = cl->R_tab->rate.rate;
1609
1610 sch_tree_lock(sch);
1611 cbq_link_class(cl);
1612 cl->borrow = cl->tparent;
1613 if (cl->tparent != &q->link)
1614 cl->share = cl->tparent;
1615 cbq_adjust_levels(parent);
1616 cl->minidle = -0x7FFFFFFF;
1617 cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
1618 cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
1619 if (cl->ewma_log == 0)
1620 cl->ewma_log = q->link.ewma_log;
1621 if (cl->maxidle == 0)
1622 cl->maxidle = q->link.maxidle;
1623 if (cl->avpkt == 0)
1624 cl->avpkt = q->link.avpkt;
1625 if (tb[TCA_CBQ_FOPT])
1626 cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
1627 sch_tree_unlock(sch);
1628
1629 qdisc_class_hash_grow(sch, &q->clhash);
1630
1631 *arg = (unsigned long)cl;
1632 return 0;
1633
1634 failure:
1635 qdisc_put_rtab(rtab);
1636 return err;
1637 }
1638
1639 static int cbq_delete(struct Qdisc *sch, unsigned long arg)
1640 {
1641 struct cbq_sched_data *q = qdisc_priv(sch);
1642 struct cbq_class *cl = (struct cbq_class *)arg;
1643 unsigned int qlen, backlog;
1644
1645 if (cl->filters || cl->children || cl == &q->link)
1646 return -EBUSY;
1647
1648 sch_tree_lock(sch);
1649
1650 qlen = cl->q->q.qlen;
1651 backlog = cl->q->qstats.backlog;
1652 qdisc_reset(cl->q);
1653 qdisc_tree_reduce_backlog(cl->q, qlen, backlog);
1654
1655 if (cl->next_alive)
1656 cbq_deactivate_class(cl);
1657
1658 if (q->tx_borrowed == cl)
1659 q->tx_borrowed = q->tx_class;
1660 if (q->tx_class == cl) {
1661 q->tx_class = NULL;
1662 q->tx_borrowed = NULL;
1663 }
1664 #ifdef CONFIG_NET_CLS_ACT
1665 if (q->rx_class == cl)
1666 q->rx_class = NULL;
1667 #endif
1668
1669 cbq_unlink_class(cl);
1670 cbq_adjust_levels(cl->tparent);
1671 cl->defmap = 0;
1672 cbq_sync_defmap(cl);
1673
1674 cbq_rmprio(q, cl);
1675 sch_tree_unlock(sch);
1676
1677 BUG_ON(--cl->refcnt == 0);
1678 /*
1679 * This shouldn't happen: we "hold" one cops->get() when called
1680 * from tc_ctl_tclass; the destroy method is done from cops->put().
1681 */
1682
1683 return 0;
1684 }
1685
1686 static struct tcf_proto __rcu **cbq_find_tcf(struct Qdisc *sch,
1687 unsigned long arg)
1688 {
1689 struct cbq_sched_data *q = qdisc_priv(sch);
1690 struct cbq_class *cl = (struct cbq_class *)arg;
1691
1692 if (cl == NULL)
1693 cl = &q->link;
1694
1695 return &cl->filter_list;
1696 }
1697
1698 static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent,
1699 u32 classid)
1700 {
1701 struct cbq_sched_data *q = qdisc_priv(sch);
1702 struct cbq_class *p = (struct cbq_class *)parent;
1703 struct cbq_class *cl = cbq_class_lookup(q, classid);
1704
1705 if (cl) {
1706 if (p && p->level <= cl->level)
1707 return 0;
1708 cl->filters++;
1709 return (unsigned long)cl;
1710 }
1711 return 0;
1712 }
1713
1714 static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg)
1715 {
1716 struct cbq_class *cl = (struct cbq_class *)arg;
1717
1718 cl->filters--;
1719 }
1720
1721 static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
1722 {
1723 struct cbq_sched_data *q = qdisc_priv(sch);
1724 struct cbq_class *cl;
1725 unsigned int h;
1726
1727 if (arg->stop)
1728 return;
1729
1730 for (h = 0; h < q->clhash.hashsize; h++) {
1731 hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
1732 if (arg->count < arg->skip) {
1733 arg->count++;
1734 continue;
1735 }
1736 if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
1737 arg->stop = 1;
1738 return;
1739 }
1740 arg->count++;
1741 }
1742 }
1743 }
1744
1745 static const struct Qdisc_class_ops cbq_class_ops = {
1746 .graft = cbq_graft,
1747 .leaf = cbq_leaf,
1748 .qlen_notify = cbq_qlen_notify,
1749 .get = cbq_get,
1750 .put = cbq_put,
1751 .change = cbq_change_class,
1752 .delete = cbq_delete,
1753 .walk = cbq_walk,
1754 .tcf_chain = cbq_find_tcf,
1755 .bind_tcf = cbq_bind_filter,
1756 .unbind_tcf = cbq_unbind_filter,
1757 .dump = cbq_dump_class,
1758 .dump_stats = cbq_dump_class_stats,
1759 };
1760
1761 static struct Qdisc_ops cbq_qdisc_ops __read_mostly = {
1762 .next = NULL,
1763 .cl_ops = &cbq_class_ops,
1764 .id = "cbq",
1765 .priv_size = sizeof(struct cbq_sched_data),
1766 .enqueue = cbq_enqueue,
1767 .dequeue = cbq_dequeue,
1768 .peek = qdisc_peek_dequeued,
1769 .init = cbq_init,
1770 .reset = cbq_reset,
1771 .destroy = cbq_destroy,
1772 .change = NULL,
1773 .dump = cbq_dump,
1774 .dump_stats = cbq_dump_stats,
1775 .owner = THIS_MODULE,
1776 };
1777
1778 static int __init cbq_module_init(void)
1779 {
1780 return register_qdisc(&cbq_qdisc_ops);
1781 }
1782 static void __exit cbq_module_exit(void)
1783 {
1784 unregister_qdisc(&cbq_qdisc_ops);
1785 }
1786 module_init(cbq_module_init)
1787 module_exit(cbq_module_exit)
1788 MODULE_LICENSE("GPL");