/*
 * net/sched/sch_tbf.c	Token Bucket Filter queue.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *		Dmitry Torokhov <dtor@mail.ru> - allow attaching inner qdiscs -
 *						 original idea by Martin Devera
 *
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <net/netlink.h>
#include <net/sch_generic.h>
#include <net/pkt_sched.h>
#include <net/tcp.h>

/*	Simple Token Bucket Filter.
	=======================================

	SOURCE.
	-------

	None.

	Description.
	------------

	A data flow obeys TBF with rate R and depth B, if for any
	time interval t_i...t_f the number of transmitted bits
	does not exceed B + R*(t_f-t_i).

	Packetized version of this definition:
	The sequence of packets of sizes s_i served at moments t_i
	obeys TBF, if for any i<=k:

	s_i+....+s_k <= B + R*(t_k - t_i)

	Algorithm.
	----------

	Let N(t_i) be B/R initially and N(t) grow continuously with time as:

	N(t+delta) = min{B/R, N(t) + delta}

	If the first packet in queue has length S, it may be
	transmitted only at the time t_* when S/R <= N(t_*),
	and in this case N(t) jumps:

	N(t_* + 0) = N(t_* - 0) - S/R.

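	Illustrative example (numbers are arbitrary, not taken from the
	code): with R = 1 Mbit/s and B = 10 kbit, N(t) saturates at
	B/R = 10 ms. A queued packet of S = 5 kbit needs S/R = 5 ms of
	accumulated credit, so it is released as soon as N(t) >= 5 ms,
	after which N(t) drops by 5 ms.
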
	Actually, QoS requires two TBF to be applied to a data stream.
	One of them controls steady state burst size, another
	one with rate P (peak rate) and depth M (equal to link MTU)
	limits bursts at a smaller time scale.

	It is easy to see that P>R, and B>M. If P is infinity, this double
	TBF is equivalent to a single one.

	When TBF works in reshaping mode, latency is estimated as:

	lat = max ((L-B)/R, (L-M)/P)

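	Illustrative example (arbitrary values, reading L as the
	configured backlog limit in bytes): with L = 10 Kbytes,
	B = 2 Kbytes, R = 1 Mbyte/s, M = 1500 bytes and P = 10 Mbytes/s,
	(L-B)/R is about 8 ms and (L-M)/P is about 0.85 ms, so the
	worst-case reshaping latency is roughly 8 ms.
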
	NOTES.
	------

	If TBF throttles, it starts a watchdog timer, which will wake it up
	when it is ready to transmit.
	Note that the minimal timer resolution is 1/HZ.
	If no new packets arrive during this period,
	or if the device is not awakened by EOI for some previous packet,
	TBF can stop its activity for 1/HZ.


	This means that with depth B, the maximal rate is

	R_crit = B*HZ

	E.g. for 10Mbit ethernet and HZ=100 the minimal allowed B is ~10Kbytes.

	Note that the peak rate TBF is much tougher: with MTU 1500
	P_crit = 150Kbytes/sec. So, if you need greater peak
	rates, use alpha with HZ=1000 :-)

	With classful TBF, limit is just kept for backwards compatibility.
	It is passed to the default bfifo qdisc - if the inner qdisc is
	changed the limit is not effective anymore.
*/
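
/*
 * Illustrative userspace configuration (device name and numbers are
 * arbitrary examples, not taken from this file). A TBF instance shaping
 * to 1 Mbit/s with a 10 Kbyte bucket and roughly 70 ms of queueing
 * latency could be created with something like:
 *
 *	tc qdisc add dev eth0 root tbf rate 1mbit burst 10kb latency 70ms
 *
 * The netlink attributes parsed in tbf_change() below (TCA_TBF_PARMS,
 * the optional TCA_TBF_RTAB/TCA_TBF_PTAB tables and the 64-bit rate
 * attributes) carry this kind of configuration into the kernel.
 */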

struct tbf_sched_data {
/* Parameters */
	u32		limit;		/* Maximal length of backlog: bytes */
	s64		buffer;		/* Token bucket depth/rate: MUST BE >= MTU/B */
	s64		mtu;
	u32		max_size;
	struct psched_ratecfg rate;
	struct psched_ratecfg peak;
	bool peak_present;

/* Variables */
	s64	tokens;			/* Current number of B tokens */
	s64	ptokens;		/* Current number of P tokens */
	s64	t_c;			/* Time check-point */
	struct Qdisc	*qdisc;		/* Inner qdisc, default - bfifo queue */
	struct qdisc_watchdog watchdog;	/* Watchdog timer */
};

/*
 * Return length of individual segments of a gso packet,
 * including all headers (MAC, IP, TCP/UDP)
 */
static unsigned int skb_gso_seglen(const struct sk_buff *skb)
{
	unsigned int hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
	const struct skb_shared_info *shinfo = skb_shinfo(skb);

	if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
		hdr_len += tcp_hdrlen(skb);
	else
		hdr_len += sizeof(struct udphdr);
	return hdr_len + shinfo->gso_size;
}

/* GSO packet is too big, segment it so that tbf can transmit
 * each segment in time
 */
static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	struct sk_buff *segs, *nskb;
	netdev_features_t features = netif_skb_features(skb);
	int ret, nb;

	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);

	if (IS_ERR_OR_NULL(segs))
		return qdisc_reshape_fail(skb, sch);

	nb = 0;
	while (segs) {
		nskb = segs->next;
		segs->next = NULL;
		qdisc_skb_cb(segs)->pkt_len = segs->len;
		ret = qdisc_enqueue(segs, q->qdisc);
		if (ret != NET_XMIT_SUCCESS) {
			if (net_xmit_drop_count(ret))
				sch->qstats.drops++;
		} else {
			nb++;
		}
		segs = nskb;
	}
	sch->q.qlen += nb;
	if (nb > 1)
		qdisc_tree_decrease_qlen(sch, 1 - nb);
	consume_skb(skb);
	return nb > 0 ? NET_XMIT_SUCCESS : NET_XMIT_DROP;
}

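/* Enqueue one packet into the inner qdisc. Packets larger than max_size
 * can never be served by the token bucket: GSO packets whose individual
 * segments would conform are split via tbf_segment(), anything else
 * oversized is handed to qdisc_reshape_fail().
 */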
static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	int ret;

	if (qdisc_pkt_len(skb) > q->max_size) {
		if (skb_is_gso(skb) && skb_gso_seglen(skb) <= q->max_size)
			return tbf_segment(skb, sch);
		return qdisc_reshape_fail(skb, sch);
	}
	ret = qdisc_enqueue(skb, q->qdisc);
	if (ret != NET_XMIT_SUCCESS) {
		if (net_xmit_drop_count(ret))
			sch->qstats.drops++;
		return ret;
	}

	sch->q.qlen++;
	return NET_XMIT_SUCCESS;
}

static unsigned int tbf_drop(struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	unsigned int len = 0;

	if (q->qdisc->ops->drop && (len = q->qdisc->ops->drop(q->qdisc)) != 0) {
		sch->q.qlen--;
		sch->qstats.drops++;
	}
	return len;
}

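/* Dequeue path: peek at the head of the inner qdisc, replenish the byte
 * and (if configured) peak token counters from the time elapsed since
 * t_c, and release the packet only if both counters remain non-negative
 * after charging its length. Otherwise arm the watchdog for the moment
 * the larger deficit will have been repaid.
 */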
static struct sk_buff *tbf_dequeue(struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;

	skb = q->qdisc->ops->peek(q->qdisc);

	if (skb) {
		s64 now;
		s64 toks;
		s64 ptoks = 0;
		unsigned int len = qdisc_pkt_len(skb);

		now = ktime_to_ns(ktime_get());
		toks = min_t(s64, now - q->t_c, q->buffer);

		if (q->peak_present) {
			ptoks = toks + q->ptokens;
			if (ptoks > q->mtu)
				ptoks = q->mtu;
			ptoks -= (s64) psched_l2t_ns(&q->peak, len);
		}
		toks += q->tokens;
		if (toks > q->buffer)
			toks = q->buffer;
		toks -= (s64) psched_l2t_ns(&q->rate, len);

		if ((toks|ptoks) >= 0) {
			skb = qdisc_dequeue_peeked(q->qdisc);
			if (unlikely(!skb))
				return NULL;

			q->t_c = now;
			q->tokens = toks;
			q->ptokens = ptoks;
			sch->q.qlen--;
			qdisc_unthrottled(sch);
			qdisc_bstats_update(sch, skb);
			return skb;
		}

		qdisc_watchdog_schedule_ns(&q->watchdog,
					   now + max_t(long, -toks, -ptoks));

		/* Maybe we have a shorter packet in the queue,
		   which can be sent now. It sounds cool,
		   but, however, this is wrong in principle.
		   We MUST NOT reorder packets under these circumstances.

		   Really, if we split the flow into independent
		   subflows, it would be a very good solution.
		   This is the main idea of all FQ algorithms
		   (cf. CSZ, HPFQ, HFSC)
		 */

		sch->qstats.overlimits++;
	}
	return NULL;
}

static void tbf_reset(struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	qdisc_reset(q->qdisc);
	sch->q.qlen = 0;
	q->t_c = ktime_to_ns(ktime_get());
	q->tokens = q->buffer;
	q->ptokens = q->mtu;
	qdisc_watchdog_cancel(&q->watchdog);
}

static const struct nla_policy tbf_policy[TCA_TBF_MAX + 1] = {
	[TCA_TBF_PARMS]		= { .len = sizeof(struct tc_tbf_qopt) },
	[TCA_TBF_RTAB]		= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
	[TCA_TBF_PTAB]		= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
	[TCA_TBF_RATE64]	= { .type = NLA_U64 },
	[TCA_TBF_PRATE64]	= { .type = NLA_U64 },
};

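/* Parse and apply a new TBF configuration. Derives max_size (the largest
 * packet the bucket can ever pass) from the rate tables, installs or
 * resizes the inner bfifo according to the configured limit, and
 * precomputes the rate/peak parameters used on the fast path.
 */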
static int tbf_change(struct Qdisc *sch, struct nlattr *opt)
{
	int err;
	struct tbf_sched_data *q = qdisc_priv(sch);
	struct nlattr *tb[TCA_TBF_MAX + 1];
	struct tc_tbf_qopt *qopt;
	struct qdisc_rate_table *rtab = NULL;
	struct qdisc_rate_table *ptab = NULL;
	struct Qdisc *child = NULL;
	int max_size, n;
	u64 rate64 = 0, prate64 = 0;

	err = nla_parse_nested(tb, TCA_TBF_MAX, opt, tbf_policy);
	if (err < 0)
		return err;

	err = -EINVAL;
	if (tb[TCA_TBF_PARMS] == NULL)
		goto done;

	qopt = nla_data(tb[TCA_TBF_PARMS]);
	rtab = qdisc_get_rtab(&qopt->rate, tb[TCA_TBF_RTAB]);
	if (rtab == NULL)
		goto done;

	if (qopt->peakrate.rate) {
		if (qopt->peakrate.rate > qopt->rate.rate)
			ptab = qdisc_get_rtab(&qopt->peakrate, tb[TCA_TBF_PTAB]);
		if (ptab == NULL)
			goto done;
	}

	for (n = 0; n < 256; n++)
		if (rtab->data[n] > qopt->buffer)
			break;
	max_size = (n << qopt->rate.cell_log) - 1;
	if (ptab) {
		int size;

		for (n = 0; n < 256; n++)
			if (ptab->data[n] > qopt->mtu)
				break;
		size = (n << qopt->peakrate.cell_log) - 1;
		if (size < max_size)
			max_size = size;
	}
	if (max_size < 0)
		goto done;

	if (max_size < psched_mtu(qdisc_dev(sch)))
		pr_warn_ratelimited("sch_tbf: burst %u is lower than device %s mtu (%u) !\n",
				    max_size, qdisc_dev(sch)->name,
				    psched_mtu(qdisc_dev(sch)));

	if (q->qdisc != &noop_qdisc) {
		err = fifo_set_limit(q->qdisc, qopt->limit);
		if (err)
			goto done;
	} else if (qopt->limit > 0) {
		child = fifo_create_dflt(sch, &bfifo_qdisc_ops, qopt->limit);
		if (IS_ERR(child)) {
			err = PTR_ERR(child);
			goto done;
		}
	}

	sch_tree_lock(sch);
	if (child) {
		qdisc_tree_decrease_qlen(q->qdisc, q->qdisc->q.qlen);
		qdisc_destroy(q->qdisc);
		q->qdisc = child;
	}
	q->limit = qopt->limit;
	q->mtu = PSCHED_TICKS2NS(qopt->mtu);
	q->max_size = max_size;
	q->buffer = PSCHED_TICKS2NS(qopt->buffer);
	q->tokens = q->buffer;
	q->ptokens = q->mtu;

	if (tb[TCA_TBF_RATE64])
		rate64 = nla_get_u64(tb[TCA_TBF_RATE64]);
	psched_ratecfg_precompute(&q->rate, &rtab->rate, rate64);
	if (ptab) {
		if (tb[TCA_TBF_PRATE64])
			prate64 = nla_get_u64(tb[TCA_TBF_PRATE64]);
		psched_ratecfg_precompute(&q->peak, &ptab->rate, prate64);
		q->peak_present = true;
	} else {
		q->peak_present = false;
	}

	sch_tree_unlock(sch);
	err = 0;
done:
	if (rtab)
		qdisc_put_rtab(rtab);
	if (ptab)
		qdisc_put_rtab(ptab);
	return err;
}

static int tbf_init(struct Qdisc *sch, struct nlattr *opt)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	if (opt == NULL)
		return -EINVAL;

	q->t_c = ktime_to_ns(ktime_get());
	qdisc_watchdog_init(&q->watchdog, sch);
	q->qdisc = &noop_qdisc;

	return tbf_change(sch, opt);
}

static void tbf_destroy(struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	qdisc_watchdog_cancel(&q->watchdog);
	qdisc_destroy(q->qdisc);
}

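/* Dump the current configuration back to userspace as a nested
 * TCA_OPTIONS attribute; rates that do not fit in 32 bits are reported
 * through the separate TCA_TBF_RATE64/TCA_TBF_PRATE64 attributes.
 */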
static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	struct nlattr *nest;
	struct tc_tbf_qopt opt;

	sch->qstats.backlog = q->qdisc->qstats.backlog;
	nest = nla_nest_start(skb, TCA_OPTIONS);
	if (nest == NULL)
		goto nla_put_failure;

	opt.limit = q->limit;
	psched_ratecfg_getrate(&opt.rate, &q->rate);
	if (q->peak_present)
		psched_ratecfg_getrate(&opt.peakrate, &q->peak);
	else
		memset(&opt.peakrate, 0, sizeof(opt.peakrate));
	opt.mtu = PSCHED_NS2TICKS(q->mtu);
	opt.buffer = PSCHED_NS2TICKS(q->buffer);
	if (nla_put(skb, TCA_TBF_PARMS, sizeof(opt), &opt))
		goto nla_put_failure;
	if (q->rate.rate_bytes_ps >= (1ULL << 32) &&
	    nla_put_u64(skb, TCA_TBF_RATE64, q->rate.rate_bytes_ps))
		goto nla_put_failure;
	if (q->peak_present &&
	    q->peak.rate_bytes_ps >= (1ULL << 32) &&
	    nla_put_u64(skb, TCA_TBF_PRATE64, q->peak.rate_bytes_ps))
		goto nla_put_failure;

	nla_nest_end(skb, nest);
	return skb->len;

nla_put_failure:
	nla_nest_cancel(skb, nest);
	return -1;
}

static int tbf_dump_class(struct Qdisc *sch, unsigned long cl,
			  struct sk_buff *skb, struct tcmsg *tcm)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	tcm->tcm_handle |= TC_H_MIN(1);
	tcm->tcm_info = q->qdisc->handle;

	return 0;
}

static int tbf_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
		     struct Qdisc **old)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	if (new == NULL)
		new = &noop_qdisc;

	sch_tree_lock(sch);
	*old = q->qdisc;
	q->qdisc = new;
	qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
	qdisc_reset(*old);
	sch_tree_unlock(sch);

	return 0;
}

static struct Qdisc *tbf_leaf(struct Qdisc *sch, unsigned long arg)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	return q->qdisc;
}

static unsigned long tbf_get(struct Qdisc *sch, u32 classid)
{
	return 1;
}

static void tbf_put(struct Qdisc *sch, unsigned long arg)
{
}

static void tbf_walk(struct Qdisc *sch, struct qdisc_walker *walker)
{
	if (!walker->stop) {
		if (walker->count >= walker->skip)
			if (walker->fn(sch, 1, walker) < 0) {
				walker->stop = 1;
				return;
			}
		walker->count++;
	}
}

static const struct Qdisc_class_ops tbf_class_ops = {
	.graft		=	tbf_graft,
	.leaf		=	tbf_leaf,
	.get		=	tbf_get,
	.put		=	tbf_put,
	.walk		=	tbf_walk,
	.dump		=	tbf_dump_class,
};

static struct Qdisc_ops tbf_qdisc_ops __read_mostly = {
	.next		=	NULL,
	.cl_ops		=	&tbf_class_ops,
	.id		=	"tbf",
	.priv_size	=	sizeof(struct tbf_sched_data),
	.enqueue	=	tbf_enqueue,
	.dequeue	=	tbf_dequeue,
	.peek		=	qdisc_peek_dequeued,
	.drop		=	tbf_drop,
	.init		=	tbf_init,
	.reset		=	tbf_reset,
	.destroy	=	tbf_destroy,
	.change		=	tbf_change,
	.dump		=	tbf_dump,
	.owner		=	THIS_MODULE,
};

static int __init tbf_module_init(void)
{
	return register_qdisc(&tbf_qdisc_ops);
}

static void __exit tbf_module_exit(void)
{
	unregister_qdisc(&tbf_qdisc_ops);
}
module_init(tbf_module_init)
module_exit(tbf_module_exit)
MODULE_LICENSE("GPL");