1 /*
2 * net/sched/sch_netem.c Network emulator
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License.
8 *
9 * Many of the algorithms and ideas for this came from
10 * NIST Net which is not copyrighted.
11 *
12 * Authors: Stephen Hemminger <shemminger@osdl.org>
13 * Catalin(ux aka Dino) BOIE <catab at umbrella dot ro>
14 */
15
16 #include <linux/mm.h>
17 #include <linux/module.h>
18 #include <linux/slab.h>
19 #include <linux/types.h>
20 #include <linux/kernel.h>
21 #include <linux/errno.h>
22 #include <linux/skbuff.h>
23 #include <linux/vmalloc.h>
24 #include <linux/rtnetlink.h>
25 #include <linux/reciprocal_div.h>
26 #include <linux/rbtree.h>
27
28 #include <net/netlink.h>
29 #include <net/pkt_sched.h>
30 #include <net/inet_ecn.h>
31
32 #define VERSION "1.3"
33
34 /* Network Emulation Queuing algorithm.
35 ====================================
36
37 Sources: [1] Mark Carson, Darrin Santay, "NIST Net - A Linux-based
38 Network Emulation Tool"
39 [2] Luigi Rizzo, DummyNet for FreeBSD
40
41 ----------------------------------------------------------------
42
43 This started out as a simple way to delay outgoing packets to
44 test TCP but has grown to include most of the functionality
45 of a full blown network emulator like NISTnet. It can delay
46 packets and add random jitter (and correlation). The random
47 distribution can be loaded from a table as well to provide
48 normal, Pareto, or experimental curves. Packet loss,
49 duplication, and reordering can also be emulated.
50
51 This qdisc does not do classification; that can be handled by
52 layering other disciplines. It does not need to do bandwidth
53 control either since that can be handled by using token
54 bucket or other rate control.
55
56 Correlated Loss Generator models
57
58 Added generation of correlated loss according to the 4-state Markov
59 chain of the GI model and the 2-state "Gilbert-Elliot" model.
60
61 References:
62 [1] NetemCLG Home http://netgroup.uniroma2.it/NetemCLG
63 [2] S. Salsano, F. Ludovici, A. Ordine, "Definition of a general
64 and intuitive loss model for packet networks and its implementation
65 in the Netem module in the Linux kernel", available in [1]
66
67 Authors: Stefano Salsano <stefano.salsano at uniroma2.it>
68 Fabio Ludovici <fabio.ludovici at yahoo.it>
69 */
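
/* Illustrative usage sketch, assuming the iproute2 "tc" front end and an
 * interface named eth0 (example values only): most of the features
 * described above can be exercised with something like
 *
 *	tc qdisc add dev eth0 root netem delay 100ms 10ms 25% \
 *		loss 0.3% duplicate 1% reorder 25% 50%
 *
 * where 100ms is the mean delay, 10ms the jitter, 25% the delay
 * correlation, and the remaining options map onto the loss, duplicate
 * and reorder fields of struct netem_sched_data below.
 */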
70
71 struct disttable {
72 u32 size;
73 s16 table[0];
74 };
75
76 struct netem_sched_data {
77 /* internal t(ime)fifo qdisc uses t_root and sch->limit */
78 struct rb_root t_root;
79
80 /* a linear queue; reduces rbtree rebalancing when jitter is low */
81 struct sk_buff *t_head;
82 struct sk_buff *t_tail;
83
84 /* optional qdisc for classful handling (NULL at netem init) */
85 struct Qdisc *qdisc;
86
87 struct qdisc_watchdog watchdog;
88
89 s64 latency;
90 s64 jitter;
91
92 u32 loss;
93 u32 ecn;
94 u32 limit;
95 u32 counter;
96 u32 gap;
97 u32 duplicate;
98 u32 reorder;
99 u32 corrupt;
100 u64 rate;
101 s32 packet_overhead;
102 u32 cell_size;
103 struct reciprocal_value cell_size_reciprocal;
104 s32 cell_overhead;
105
106 struct crndstate {
107 u32 last;
108 u32 rho;
109 } delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor;
110
111 struct disttable *delay_dist;
112
113 enum {
114 CLG_RANDOM,
115 CLG_4_STATES,
116 CLG_GILB_ELL,
117 } loss_model;
118
119 enum {
120 TX_IN_GAP_PERIOD = 1,
121 TX_IN_BURST_PERIOD,
122 LOST_IN_GAP_PERIOD,
123 LOST_IN_BURST_PERIOD,
124 } _4_state_model;
125
126 enum {
127 GOOD_STATE = 1,
128 BAD_STATE,
129 } GE_state_model;
130
131 /* Correlated Loss Generation models */
132 struct clgstate {
133 /* state of the Markov chain */
134 u8 state;
135
136 /* 4-states and Gilbert-Elliot models */
137 u32 a1; /* p13 for 4-states or p for GE */
138 u32 a2; /* p31 for 4-states or r for GE */
139 u32 a3; /* p32 for 4-states or h for GE */
140 u32 a4; /* p14 for 4-states or 1-k for GE */
141 u32 a5; /* p23 used only in 4-states */
142 } clg;
143
144 struct tc_netem_slot slot_config;
145 struct slotstate {
146 u64 slot_next;
147 s32 packets_left;
148 s32 bytes_left;
149 } slot;
150
151 struct disttable *slot_dist;
152 };
153
154 /* Time stamp put into socket buffer control block
155 * Only valid when skbs are in our internal t(ime)fifo queue.
156 *
157 * As skb->rbnode uses the same storage as skb->next, skb->prev and skb->tstamp,
158 * and skb->next & skb->prev are scratch space for a qdisc,
159 * we save skb->tstamp value in skb->cb[] before destroying it.
160 */
161 struct netem_skb_cb {
162 u64 time_to_send;
163 };
164
165 static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb)
166 {
167 /* we assume we can use skb next/prev/tstamp as storage for rb_node */
168 qdisc_cb_private_validate(skb, sizeof(struct netem_skb_cb));
169 return (struct netem_skb_cb *)qdisc_skb_cb(skb)->data;
170 }
171
172 /* init_crandom - initialize correlated random number generator
173 * Use entropy source for initial seed.
174 */
175 static void init_crandom(struct crndstate *state, unsigned long rho)
176 {
177 state->rho = rho;
178 state->last = prandom_u32();
179 }
180
181 /* get_crandom - correlated random number generator
182 * Next number depends on last value.
183 * rho is scaled to avoid floating point.
184 */
185 static u32 get_crandom(struct crndstate *state)
186 {
187 u64 value, rho;
188 unsigned long answer;
189
190 if (!state || state->rho == 0) /* no correlation */
191 return prandom_u32();
192
193 value = prandom_u32();
194 rho = (u64)state->rho + 1;
195 answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32;
196 state->last = answer;
197 return answer;
198 }
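
/* Worked example of the blend above, assuming rho was scaled by user
 * space so that ~0U means a correlation of 1.0 (as tc does): with
 * rho ~= 0.25 * 2^32, each output is roughly
 *
 *	answer = 0.75 * value + 0.25 * last
 *
 * i.e. 75% fresh entropy plus 25% of the previous output, which is how
 * the "correlation" knobs for delay/loss/duplicate/reorder behave.
 */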
199
200 /* loss_4state - 4-state model loss generator
201 * Generates losses according to the 4-state Markov chain adopted in
202 * the GI (General and Intuitive) loss model.
203 */
204 static bool loss_4state(struct netem_sched_data *q)
205 {
206 struct clgstate *clg = &q->clg;
207 u32 rnd = prandom_u32();
208
209 /*
210 * Makes a comparison between rnd and the transition
211 * probabilities outgoing from the current state, then decides the
212 * next state and if the next packet has to be transmitted or lost.
213 * The four states correspond to:
214 * TX_IN_GAP_PERIOD => successfully transmitted packets within a gap period
215 * LOST_IN_BURST_PERIOD => isolated losses within a gap period
216 * LOST_IN_GAP_PERIOD => lost packets within a burst period
217 * TX_IN_BURST_PERIOD => successfully transmitted packets within a burst period
218 */
219 switch (clg->state) {
220 case TX_IN_GAP_PERIOD:
221 if (rnd < clg->a4) {
222 clg->state = LOST_IN_BURST_PERIOD;
223 return true;
224 } else if (clg->a4 < rnd && rnd < clg->a1 + clg->a4) {
225 clg->state = LOST_IN_GAP_PERIOD;
226 return true;
227 } else if (clg->a1 + clg->a4 < rnd) {
228 clg->state = TX_IN_GAP_PERIOD;
229 }
230
231 break;
232 case TX_IN_BURST_PERIOD:
233 if (rnd < clg->a5) {
234 clg->state = LOST_IN_GAP_PERIOD;
235 return true;
236 } else {
237 clg->state = TX_IN_BURST_PERIOD;
238 }
239
240 break;
241 case LOST_IN_GAP_PERIOD:
242 if (rnd < clg->a3)
243 clg->state = TX_IN_BURST_PERIOD;
244 else if (clg->a3 < rnd && rnd < clg->a2 + clg->a3) {
245 clg->state = TX_IN_GAP_PERIOD;
246 } else if (clg->a2 + clg->a3 < rnd) {
247 clg->state = LOST_IN_GAP_PERIOD;
248 return true;
249 }
250 break;
251 case LOST_IN_BURST_PERIOD:
252 clg->state = TX_IN_GAP_PERIOD;
253 break;
254 }
255
256 return false;
257 }
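
/* Worked example of the comparisons above, assuming the gimodel
 * probabilities were scaled by user space so that ~0U means 100%:
 * starting in TX_IN_GAP_PERIOD with p14 = 1% and p13 = 5%, a uniform
 * rnd falls below a4 about 1% of the time (an isolated loss), lands
 * between a4 and a1 + a4 about 5% of the time (a loss burst begins),
 * and otherwise the chain stays in the gap period and the packet is
 * transmitted.
 */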
258
259 /* loss_gilb_ell - Gilbert-Elliot model loss generator
260 * Generates losses according to the Gilbert-Elliot loss model or
261 * its special cases (Gilbert or Simple Gilbert)
262 *
263 * Makes a comparison between random number and the transition
264 * probabilities outgoing from the current state, then decides the
265 * next state. A second random number is extracted and the comparison
266 * with the loss probability of the current state decides if the next
267 * packet will be transmitted or lost.
268 */
269 static bool loss_gilb_ell(struct netem_sched_data *q)
270 {
271 struct clgstate *clg = &q->clg;
272
273 switch (clg->state) {
274 case GOOD_STATE:
275 if (prandom_u32() < clg->a1)
276 clg->state = BAD_STATE;
277 if (prandom_u32() < clg->a4)
278 return true;
279 break;
280 case BAD_STATE:
281 if (prandom_u32() < clg->a2)
282 clg->state = GOOD_STATE;
283 if (prandom_u32() > clg->a3)
284 return true;
285 }
286
287 return false;
288 }
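
/* With the parameter mapping used by get_loss_clg() below
 * (a1 = p, a2 = r, a3 = h, a4 = 1-k, all scaled to u32 by user space),
 * the good state drops a packet with probability 1-k and moves to the
 * bad state with probability p, while the bad state transmits with
 * probability h and returns to the good state with probability r -
 * the classic Gilbert-Elliot formulation.
 */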
289
290 static bool loss_event(struct netem_sched_data *q)
291 {
292 switch (q->loss_model) {
293 case CLG_RANDOM:
294 /* Random packet drop 0 => none, ~0 => all */
295 return q->loss && q->loss >= get_crandom(&q->loss_cor);
296
297 case CLG_4_STATES:
298 /* 4-state loss model algorithm (used also for GI model)
299 * Extracts a value from the 4-state Markov loss generator and
300 * drops the packet if that value is 1, writing the event to the
301 * kernel logs if needed
302 */
303 return loss_4state(q);
304
305 case CLG_GILB_ELL:
306 /* Gilbert-Elliot loss model algorithm
307 * Extracts a value from the Gilbert-Elliot loss generator and
308 * drops the packet if that value is 1, writing the event to the
309 * kernel logs if needed
310 */
311 return loss_gilb_ell(q);
312 }
313
314 return false; /* not reached */
315 }
316
317
318 /* tabledist - return a pseudo-randomly distributed value with mean mu and
319 * std deviation sigma. Uses table lookup to approximate the desired
320 * distribution, and a uniformly-distributed pseudo-random source.
321 */
322 static s64 tabledist(s64 mu, s32 sigma,
323 struct crndstate *state,
324 const struct disttable *dist)
325 {
326 s64 x;
327 long t;
328 u32 rnd;
329
330 if (sigma == 0)
331 return mu;
332
333 rnd = get_crandom(state);
334
335 /* default uniform distribution */
336 if (dist == NULL)
337 return ((rnd % (2 * sigma)) + mu) - sigma;
338
339 t = dist->table[rnd % dist->size];
340 x = (sigma % NETEM_DIST_SCALE) * t;
341 if (x >= 0)
342 x += NETEM_DIST_SCALE/2;
343 else
344 x -= NETEM_DIST_SCALE/2;
345
346 return x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu;
347 }
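
/* Note on the arithmetic above: the table holds signed 16-bit samples
 * of (roughly) an inverse CDF in units of NETEM_DIST_SCALE per sigma.
 * sigma is split into quotient and remainder by NETEM_DIST_SCALE; for a
 * table entry t == NETEM_DIST_SCALE the two terms add back up to
 * exactly sigma, so the function returns mu + sigma, i.e. one standard
 * deviation above the mean.
 */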
348
349 static u64 packet_time_ns(u64 len, const struct netem_sched_data *q)
350 {
351 len += q->packet_overhead;
352
353 if (q->cell_size) {
354 u32 cells = reciprocal_divide(len, q->cell_size_reciprocal);
355
356 if (len > cells * q->cell_size) /* extra cell needed for remainder */
357 cells++;
358 len = cells * (q->cell_size + q->cell_overhead);
359 }
360
361 return div64_u64(len * NSEC_PER_SEC, q->rate);
362 }
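
/* Worked example, assuming ATM-like framing configured from user space
 * with cell_size = 48 and cell_overhead = 5: a 1500 byte packet with no
 * packet_overhead needs ceil(1500 / 48) = 32 cells, so len becomes
 * 32 * 53 = 1696 bytes, and at rate = 1,000,000 bytes/sec the
 * transmission time is 1696 * NSEC_PER_SEC / 1000000 = 1,696,000 ns.
 */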
363
364 static void tfifo_reset(struct Qdisc *sch)
365 {
366 struct netem_sched_data *q = qdisc_priv(sch);
367 struct rb_node *p = rb_first(&q->t_root);
368
369 while (p) {
370 struct sk_buff *skb = rb_to_skb(p);
371
372 p = rb_next(p);
373 rb_erase(&skb->rbnode, &q->t_root);
374 rtnl_kfree_skbs(skb, skb);
375 }
376
377 rtnl_kfree_skbs(q->t_head, q->t_tail);
378 q->t_head = NULL;
379 q->t_tail = NULL;
380 }
381
382 static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
383 {
384 struct netem_sched_data *q = qdisc_priv(sch);
385 u64 tnext = netem_skb_cb(nskb)->time_to_send;
386
387 if (!q->t_tail || tnext >= netem_skb_cb(q->t_tail)->time_to_send) {
388 if (q->t_tail)
389 q->t_tail->next = nskb;
390 else
391 q->t_head = nskb;
392 q->t_tail = nskb;
393 } else {
394 struct rb_node **p = &q->t_root.rb_node, *parent = NULL;
395
396 while (*p) {
397 struct sk_buff *skb;
398
399 parent = *p;
400 skb = rb_to_skb(parent);
401 if (tnext >= netem_skb_cb(skb)->time_to_send)
402 p = &parent->rb_right;
403 else
404 p = &parent->rb_left;
405 }
406 rb_link_node(&nskb->rbnode, parent, p);
407 rb_insert_color(&nskb->rbnode, &q->t_root);
408 }
409 sch->q.qlen++;
410 }
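
/* The common case (packets leaving in arrival order, e.g. a constant
 * delay with little or no jitter) appends to the cheap t_head/t_tail
 * linear list above; only packets whose time_to_send sorts before the
 * current tail fall back to the rbtree, which confines rebalancing cost
 * to the reordering case.
 */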
411
412 /* netem can't properly corrupt a megapacket (like we get from GSO), so
413 * when we statistically choose to corrupt one, we segment it, returning
414 * the first packet to be corrupted, and re-enqueue the remaining frames
415 */
416 static struct sk_buff *netem_segment(struct sk_buff *skb, struct Qdisc *sch,
417 struct sk_buff **to_free)
418 {
419 struct sk_buff *segs;
420 netdev_features_t features = netif_skb_features(skb);
421
422 segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
423
424 if (IS_ERR_OR_NULL(segs)) {
425 qdisc_drop(skb, sch, to_free);
426 return NULL;
427 }
428 consume_skb(skb);
429 return segs;
430 }
431
432 /*
433 * Insert one skb into qdisc.
434 * Note: parent depends on return value to account for queue length.
435 * NET_XMIT_DROP: queue length didn't change.
436 * NET_XMIT_SUCCESS: one skb was queued.
437 */
438 static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
439 struct sk_buff **to_free)
440 {
441 struct netem_sched_data *q = qdisc_priv(sch);
442 /* We don't fill cb now as skb_unshare() may invalidate it */
443 struct netem_skb_cb *cb;
444 struct sk_buff *skb2;
445 struct sk_buff *segs = NULL;
446 unsigned int len = 0, last_len, prev_len = qdisc_pkt_len(skb);
447 int nb = 0;
448 int count = 1;
449 int rc = NET_XMIT_SUCCESS;
450 int rc_drop = NET_XMIT_DROP;
451
452 /* Do not fool qdisc_drop_all() */
453 skb->prev = NULL;
454
455 /* Random duplication */
456 if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor))
457 ++count;
458
459 /* Drop packet? */
460 if (loss_event(q)) {
461 if (q->ecn && INET_ECN_set_ce(skb))
462 qdisc_qstats_drop(sch); /* mark packet */
463 else
464 --count;
465 }
466 if (count == 0) {
467 qdisc_qstats_drop(sch);
468 __qdisc_drop(skb, to_free);
469 return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
470 }
471
472 /* If a delay is expected, orphan the skb. (orphaning usually takes
473 * place at TX completion time, so _before_ the link transit delay)
474 */
475 if (q->latency || q->jitter || q->rate)
476 skb_orphan_partial(skb);
477
478 /*
479 * If we need to duplicate the packet, then re-insert it at the top of the
480 * qdisc tree, since the parent queuer expects that only one
481 * skb will be queued.
482 */
483 if (count > 1 && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) {
484 struct Qdisc *rootq = qdisc_root(sch);
485 u32 dupsave = q->duplicate; /* prevent duplicating a dup... */
486
487 q->duplicate = 0;
488 rootq->enqueue(skb2, rootq, to_free);
489 q->duplicate = dupsave;
490 rc_drop = NET_XMIT_SUCCESS;
491 }
492
493 /*
494 * Randomized packet corruption.
495 * Make a copy if needed since we are modifying it.
496 * If the packet is going to be hardware checksummed, then
497 * do the checksum now in software before we mangle it.
498 */
499 if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) {
500 if (skb_is_gso(skb)) {
501 segs = netem_segment(skb, sch, to_free);
502 if (!segs)
503 return rc_drop;
504 } else {
505 segs = skb;
506 }
507
508 skb = segs;
509 segs = segs->next;
510
511 skb = skb_unshare(skb, GFP_ATOMIC);
512 if (unlikely(!skb)) {
513 qdisc_qstats_drop(sch);
514 goto finish_segs;
515 }
516 if (skb->ip_summed == CHECKSUM_PARTIAL &&
517 skb_checksum_help(skb)) {
518 qdisc_drop(skb, sch, to_free);
519 goto finish_segs;
520 }
521
522 skb->data[prandom_u32() % skb_headlen(skb)] ^=
523 1<<(prandom_u32() % 8);
524 }
525
526 if (unlikely(sch->q.qlen >= sch->limit)) {
527 qdisc_drop_all(skb, sch, to_free);
528 return rc_drop;
529 }
530
531 qdisc_qstats_backlog_inc(sch, skb);
532
533 cb = netem_skb_cb(skb);
534 if (q->gap == 0 || /* not doing reordering */
535 q->counter < q->gap - 1 || /* inside last reordering gap */
536 q->reorder < get_crandom(&q->reorder_cor)) {
537 u64 now;
538 s64 delay;
539
540 delay = tabledist(q->latency, q->jitter,
541 &q->delay_cor, q->delay_dist);
542
543 now = ktime_get_ns();
544
545 if (q->rate) {
546 struct netem_skb_cb *last = NULL;
547
548 if (sch->q.tail)
549 last = netem_skb_cb(sch->q.tail);
550 if (q->t_root.rb_node) {
551 struct sk_buff *t_skb;
552 struct netem_skb_cb *t_last;
553
554 t_skb = skb_rb_last(&q->t_root);
555 t_last = netem_skb_cb(t_skb);
556 if (!last ||
557 t_last->time_to_send > last->time_to_send)
558 last = t_last;
559 }
560 if (q->t_tail) {
561 struct netem_skb_cb *t_last =
562 netem_skb_cb(q->t_tail);
563
564 if (!last ||
565 t_last->time_to_send > last->time_to_send)
566 last = t_last;
567 }
568
569 if (last) {
570 /*
571 * The last packet in the queue is the reference point (now);
572 * calculate the time bonus it gives us and subtract it
573 * from the delay.
574 */
575 delay -= last->time_to_send - now;
576 delay = max_t(s64, 0, delay);
577 now = last->time_to_send;
578 }
579
580 delay += packet_time_ns(qdisc_pkt_len(skb), q);
581 }
582
583 cb->time_to_send = now + delay;
584 ++q->counter;
585 tfifo_enqueue(skb, sch);
586 } else {
587 /*
588 * Do re-ordering by putting one out of N packets at the front
589 * of the queue.
590 */
591 cb->time_to_send = ktime_get_ns();
592 q->counter = 0;
593
594 __qdisc_enqueue_head(skb, &sch->q);
595 sch->qstats.requeues++;
596 }
597
598 finish_segs:
599 if (segs) {
600 while (segs) {
601 skb2 = segs->next;
602 skb_mark_not_on_list(segs);
603 qdisc_skb_cb(segs)->pkt_len = segs->len;
604 last_len = segs->len;
605 rc = qdisc_enqueue(segs, sch, to_free);
606 if (rc != NET_XMIT_SUCCESS) {
607 if (net_xmit_drop_count(rc))
608 qdisc_qstats_drop(sch);
609 } else {
610 nb++;
611 len += last_len;
612 }
613 segs = skb2;
614 }
615 sch->q.qlen += nb;
616 if (nb > 1)
617 qdisc_tree_reduce_backlog(sch, 1 - nb, prev_len - len);
618 }
619 return NET_XMIT_SUCCESS;
620 }
621
622 /* Delay the next round by scheduling a new future slot with a
623 * fresh budget of bytes and packets.
624 */
625
626 static void get_slot_next(struct netem_sched_data *q, u64 now)
627 {
628 s64 next_delay;
629
630 if (!q->slot_dist)
631 next_delay = q->slot_config.min_delay +
632 (prandom_u32() *
633 (q->slot_config.max_delay -
634 q->slot_config.min_delay) >> 32);
635 else
636 next_delay = tabledist(q->slot_config.dist_delay,
637 (s32)(q->slot_config.dist_jitter),
638 NULL, q->slot_dist);
639
640 q->slot.slot_next = now + next_delay;
641 q->slot.packets_left = q->slot_config.max_packets;
642 q->slot.bytes_left = q->slot_config.max_bytes;
643 }
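
/* Without a slot distribution table, the next slot start is drawn
 * uniformly from [min_delay, max_delay] by scaling a 32-bit random
 * value by the range and shifting down by 32.  For example, with
 * min_delay = 1ms and max_delay = 5ms configured via TCA_NETEM_SLOT
 * (illustrative values), slots open on average every ~3ms, each with a
 * fresh max_packets / max_bytes budget.
 */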
644
645 static struct sk_buff *netem_peek(struct netem_sched_data *q)
646 {
647 struct sk_buff *skb = skb_rb_first(&q->t_root);
648 u64 t1, t2;
649
650 if (!skb)
651 return q->t_head;
652 if (!q->t_head)
653 return skb;
654
655 t1 = netem_skb_cb(skb)->time_to_send;
656 t2 = netem_skb_cb(q->t_head)->time_to_send;
657 if (t1 < t2)
658 return skb;
659 return q->t_head;
660 }
661
662 static void netem_erase_head(struct netem_sched_data *q, struct sk_buff *skb)
663 {
664 if (skb == q->t_head) {
665 q->t_head = skb->next;
666 if (!q->t_head)
667 q->t_tail = NULL;
668 } else {
669 rb_erase(&skb->rbnode, &q->t_root);
670 }
671 }
672
673 static struct sk_buff *netem_dequeue(struct Qdisc *sch)
674 {
675 struct netem_sched_data *q = qdisc_priv(sch);
676 struct sk_buff *skb;
677
678 tfifo_dequeue:
679 skb = __qdisc_dequeue_head(&sch->q);
680 if (skb) {
681 qdisc_qstats_backlog_dec(sch, skb);
682 deliver:
683 qdisc_bstats_update(sch, skb);
684 return skb;
685 }
686 skb = netem_peek(q);
687 if (skb) {
688 u64 time_to_send;
689 u64 now = ktime_get_ns();
690
691 /* is there more time remaining? */
692 time_to_send = netem_skb_cb(skb)->time_to_send;
693 if (q->slot.slot_next && q->slot.slot_next < time_to_send)
694 get_slot_next(q, now);
695
696 if (time_to_send <= now && q->slot.slot_next <= now) {
697 netem_erase_head(q, skb);
698 sch->q.qlen--;
699 qdisc_qstats_backlog_dec(sch, skb);
700 skb->next = NULL;
701 skb->prev = NULL;
702 /* skb->dev shares skb->rbnode area,
703 * we need to restore its value.
704 */
705 skb->dev = qdisc_dev(sch);
706
707 if (q->slot.slot_next) {
708 q->slot.packets_left--;
709 q->slot.bytes_left -= qdisc_pkt_len(skb);
710 if (q->slot.packets_left <= 0 ||
711 q->slot.bytes_left <= 0)
712 get_slot_next(q, now);
713 }
714
715 if (q->qdisc) {
716 unsigned int pkt_len = qdisc_pkt_len(skb);
717 struct sk_buff *to_free = NULL;
718 int err;
719
720 err = qdisc_enqueue(skb, q->qdisc, &to_free);
721 kfree_skb_list(to_free);
722 if (err != NET_XMIT_SUCCESS &&
723 net_xmit_drop_count(err)) {
724 qdisc_qstats_drop(sch);
725 qdisc_tree_reduce_backlog(sch, 1,
726 pkt_len);
727 }
728 goto tfifo_dequeue;
729 }
730 goto deliver;
731 }
732
733 if (q->qdisc) {
734 skb = q->qdisc->ops->dequeue(q->qdisc);
735 if (skb)
736 goto deliver;
737 }
738
739 qdisc_watchdog_schedule_ns(&q->watchdog,
740 max(time_to_send,
741 q->slot.slot_next));
742 }
743
744 if (q->qdisc) {
745 skb = q->qdisc->ops->dequeue(q->qdisc);
746 if (skb)
747 goto deliver;
748 }
749 return NULL;
750 }
751
752 static void netem_reset(struct Qdisc *sch)
753 {
754 struct netem_sched_data *q = qdisc_priv(sch);
755
756 qdisc_reset_queue(sch);
757 tfifo_reset(sch);
758 if (q->qdisc)
759 qdisc_reset(q->qdisc);
760 qdisc_watchdog_cancel(&q->watchdog);
761 }
762
763 static void dist_free(struct disttable *d)
764 {
765 kvfree(d);
766 }
767
768 /*
769 * Distribution data is a variable-size payload containing
770 * signed 16-bit values.
771 */
772
773 static int get_dist_table(struct Qdisc *sch, struct disttable **tbl,
774 const struct nlattr *attr)
775 {
776 size_t n = nla_len(attr)/sizeof(__s16);
777 const __s16 *data = nla_data(attr);
778 spinlock_t *root_lock;
779 struct disttable *d;
780 int i;
781
782 if (n > NETEM_DIST_MAX)
783 return -EINVAL;
784
785 d = kvmalloc(sizeof(struct disttable) + n * sizeof(s16), GFP_KERNEL);
786 if (!d)
787 return -ENOMEM;
788
789 d->size = n;
790 for (i = 0; i < n; i++)
791 d->table[i] = data[i];
792
793 root_lock = qdisc_root_sleeping_lock(sch);
794
795 spin_lock_bh(root_lock);
796 swap(*tbl, d);
797 spin_unlock_bh(root_lock);
798
799 dist_free(d);
800 return 0;
801 }
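
/* The table consumed above is generated by user space; iproute2
 * typically ships normal, pareto, paretonormal and experimental
 * distributions plus a maketable helper.  Each of the at most
 * NETEM_DIST_MAX entries is an s16 in units of NETEM_DIST_SCALE per
 * sigma, which is how tabledist() interprets it.
 */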
802
803 static void get_slot(struct netem_sched_data *q, const struct nlattr *attr)
804 {
805 const struct tc_netem_slot *c = nla_data(attr);
806
807 q->slot_config = *c;
808 if (q->slot_config.max_packets == 0)
809 q->slot_config.max_packets = INT_MAX;
810 if (q->slot_config.max_bytes == 0)
811 q->slot_config.max_bytes = INT_MAX;
812 q->slot.packets_left = q->slot_config.max_packets;
813 q->slot.bytes_left = q->slot_config.max_bytes;
814 if (q->slot_config.min_delay | q->slot_config.max_delay |
815 q->slot_config.dist_jitter)
816 q->slot.slot_next = ktime_get_ns();
817 else
818 q->slot.slot_next = 0;
819 }
820
821 static void get_correlation(struct netem_sched_data *q, const struct nlattr *attr)
822 {
823 const struct tc_netem_corr *c = nla_data(attr);
824
825 init_crandom(&q->delay_cor, c->delay_corr);
826 init_crandom(&q->loss_cor, c->loss_corr);
827 init_crandom(&q->dup_cor, c->dup_corr);
828 }
829
830 static void get_reorder(struct netem_sched_data *q, const struct nlattr *attr)
831 {
832 const struct tc_netem_reorder *r = nla_data(attr);
833
834 q->reorder = r->probability;
835 init_crandom(&q->reorder_cor, r->correlation);
836 }
837
838 static void get_corrupt(struct netem_sched_data *q, const struct nlattr *attr)
839 {
840 const struct tc_netem_corrupt *r = nla_data(attr);
841
842 q->corrupt = r->probability;
843 init_crandom(&q->corrupt_cor, r->correlation);
844 }
845
846 static void get_rate(struct netem_sched_data *q, const struct nlattr *attr)
847 {
848 const struct tc_netem_rate *r = nla_data(attr);
849
850 q->rate = r->rate;
851 q->packet_overhead = r->packet_overhead;
852 q->cell_size = r->cell_size;
853 q->cell_overhead = r->cell_overhead;
854 if (q->cell_size)
855 q->cell_size_reciprocal = reciprocal_value(q->cell_size);
856 else
857 q->cell_size_reciprocal = (struct reciprocal_value) { 0 };
858 }
859
860 static int get_loss_clg(struct netem_sched_data *q, const struct nlattr *attr)
861 {
862 const struct nlattr *la;
863 int rem;
864
865 nla_for_each_nested(la, attr, rem) {
866 u16 type = nla_type(la);
867
868 switch (type) {
869 case NETEM_LOSS_GI: {
870 const struct tc_netem_gimodel *gi = nla_data(la);
871
872 if (nla_len(la) < sizeof(struct tc_netem_gimodel)) {
873 pr_info("netem: incorrect gi model size\n");
874 return -EINVAL;
875 }
876
877 q->loss_model = CLG_4_STATES;
878
879 q->clg.state = TX_IN_GAP_PERIOD;
880 q->clg.a1 = gi->p13;
881 q->clg.a2 = gi->p31;
882 q->clg.a3 = gi->p32;
883 q->clg.a4 = gi->p14;
884 q->clg.a5 = gi->p23;
885 break;
886 }
887
888 case NETEM_LOSS_GE: {
889 const struct tc_netem_gemodel *ge = nla_data(la);
890
891 if (nla_len(la) < sizeof(struct tc_netem_gemodel)) {
892 pr_info("netem: incorrect ge model size\n");
893 return -EINVAL;
894 }
895
896 q->loss_model = CLG_GILB_ELL;
897 q->clg.state = GOOD_STATE;
898 q->clg.a1 = ge->p;
899 q->clg.a2 = ge->r;
900 q->clg.a3 = ge->h;
901 q->clg.a4 = ge->k1;
902 break;
903 }
904
905 default:
906 pr_info("netem: unknown loss type %u\n", type);
907 return -EINVAL;
908 }
909 }
910
911 return 0;
912 }
913
914 static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = {
915 [TCA_NETEM_CORR] = { .len = sizeof(struct tc_netem_corr) },
916 [TCA_NETEM_REORDER] = { .len = sizeof(struct tc_netem_reorder) },
917 [TCA_NETEM_CORRUPT] = { .len = sizeof(struct tc_netem_corrupt) },
918 [TCA_NETEM_RATE] = { .len = sizeof(struct tc_netem_rate) },
919 [TCA_NETEM_LOSS] = { .type = NLA_NESTED },
920 [TCA_NETEM_ECN] = { .type = NLA_U32 },
921 [TCA_NETEM_RATE64] = { .type = NLA_U64 },
922 [TCA_NETEM_LATENCY64] = { .type = NLA_S64 },
923 [TCA_NETEM_JITTER64] = { .type = NLA_S64 },
924 [TCA_NETEM_SLOT] = { .len = sizeof(struct tc_netem_slot) },
925 };
926
927 static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla,
928 const struct nla_policy *policy, int len)
929 {
930 int nested_len = nla_len(nla) - NLA_ALIGN(len);
931
932 if (nested_len < 0) {
933 pr_info("netem: invalid attributes len %d\n", nested_len);
934 return -EINVAL;
935 }
936
937 if (nested_len >= nla_attr_size(0))
938 return nla_parse_deprecated(tb, maxtype,
939 nla_data(nla) + NLA_ALIGN(len),
940 nested_len, policy, NULL);
941
942 memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1));
943 return 0;
944 }
945
946 /* Parse netlink message to set options */
947 static int netem_change(struct Qdisc *sch, struct nlattr *opt,
948 struct netlink_ext_ack *extack)
949 {
950 struct netem_sched_data *q = qdisc_priv(sch);
951 struct nlattr *tb[TCA_NETEM_MAX + 1];
952 struct tc_netem_qopt *qopt;
953 struct clgstate old_clg;
954 int old_loss_model = CLG_RANDOM;
955 int ret;
956
957 if (opt == NULL)
958 return -EINVAL;
959
960 qopt = nla_data(opt);
961 ret = parse_attr(tb, TCA_NETEM_MAX, opt, netem_policy, sizeof(*qopt));
962 if (ret < 0)
963 return ret;
964
965 /* backup q->clg and q->loss_model */
966 old_clg = q->clg;
967 old_loss_model = q->loss_model;
968
969 if (tb[TCA_NETEM_LOSS]) {
970 ret = get_loss_clg(q, tb[TCA_NETEM_LOSS]);
971 if (ret) {
972 q->loss_model = old_loss_model;
973 return ret;
974 }
975 } else {
976 q->loss_model = CLG_RANDOM;
977 }
978
979 if (tb[TCA_NETEM_DELAY_DIST]) {
980 ret = get_dist_table(sch, &q->delay_dist,
981 tb[TCA_NETEM_DELAY_DIST]);
982 if (ret)
983 goto get_table_failure;
984 }
985
986 if (tb[TCA_NETEM_SLOT_DIST]) {
987 ret = get_dist_table(sch, &q->slot_dist,
988 tb[TCA_NETEM_SLOT_DIST]);
989 if (ret)
990 goto get_table_failure;
991 }
992
993 sch->limit = qopt->limit;
994
995 q->latency = PSCHED_TICKS2NS(qopt->latency);
996 q->jitter = PSCHED_TICKS2NS(qopt->jitter);
997 q->limit = qopt->limit;
998 q->gap = qopt->gap;
999 q->counter = 0;
1000 q->loss = qopt->loss;
1001 q->duplicate = qopt->duplicate;
1002
1003 /* For compatibility with earlier versions:
1004 * if gap is set, we need to assume 100% reorder probability.
1005 */
1006 if (q->gap)
1007 q->reorder = ~0;
1008
1009 if (tb[TCA_NETEM_CORR])
1010 get_correlation(q, tb[TCA_NETEM_CORR]);
1011
1012 if (tb[TCA_NETEM_REORDER])
1013 get_reorder(q, tb[TCA_NETEM_REORDER]);
1014
1015 if (tb[TCA_NETEM_CORRUPT])
1016 get_corrupt(q, tb[TCA_NETEM_CORRUPT]);
1017
1018 if (tb[TCA_NETEM_RATE])
1019 get_rate(q, tb[TCA_NETEM_RATE]);
1020
1021 if (tb[TCA_NETEM_RATE64])
1022 q->rate = max_t(u64, q->rate,
1023 nla_get_u64(tb[TCA_NETEM_RATE64]));
1024
1025 if (tb[TCA_NETEM_LATENCY64])
1026 q->latency = nla_get_s64(tb[TCA_NETEM_LATENCY64]);
1027
1028 if (tb[TCA_NETEM_JITTER64])
1029 q->jitter = nla_get_s64(tb[TCA_NETEM_JITTER64]);
1030
1031 if (tb[TCA_NETEM_ECN])
1032 q->ecn = nla_get_u32(tb[TCA_NETEM_ECN]);
1033
1034 if (tb[TCA_NETEM_SLOT])
1035 get_slot(q, tb[TCA_NETEM_SLOT]);
1036
1037 return ret;
1038
1039 get_table_failure:
1040 /* recover clg and loss_model, in case
1041 * q->clg and q->loss_model were modified
1042 * in get_loss_clg()
1043 */
1044 q->clg = old_clg;
1045 q->loss_model = old_loss_model;
1046 return ret;
1047 }
1048
1049 static int netem_init(struct Qdisc *sch, struct nlattr *opt,
1050 struct netlink_ext_ack *extack)
1051 {
1052 struct netem_sched_data *q = qdisc_priv(sch);
1053 int ret;
1054
1055 qdisc_watchdog_init(&q->watchdog, sch);
1056
1057 if (!opt)
1058 return -EINVAL;
1059
1060 q->loss_model = CLG_RANDOM;
1061 ret = netem_change(sch, opt, extack);
1062 if (ret)
1063 pr_info("netem: change failed\n");
1064 return ret;
1065 }
1066
1067 static void netem_destroy(struct Qdisc *sch)
1068 {
1069 struct netem_sched_data *q = qdisc_priv(sch);
1070
1071 qdisc_watchdog_cancel(&q->watchdog);
1072 if (q->qdisc)
1073 qdisc_put(q->qdisc);
1074 dist_free(q->delay_dist);
1075 dist_free(q->slot_dist);
1076 }
1077
1078 static int dump_loss_model(const struct netem_sched_data *q,
1079 struct sk_buff *skb)
1080 {
1081 struct nlattr *nest;
1082
1083 nest = nla_nest_start_noflag(skb, TCA_NETEM_LOSS);
1084 if (nest == NULL)
1085 goto nla_put_failure;
1086
1087 switch (q->loss_model) {
1088 case CLG_RANDOM:
1089 /* legacy loss model */
1090 nla_nest_cancel(skb, nest);
1091 return 0; /* no data */
1092
1093 case CLG_4_STATES: {
1094 struct tc_netem_gimodel gi = {
1095 .p13 = q->clg.a1,
1096 .p31 = q->clg.a2,
1097 .p32 = q->clg.a3,
1098 .p14 = q->clg.a4,
1099 .p23 = q->clg.a5,
1100 };
1101
1102 if (nla_put(skb, NETEM_LOSS_GI, sizeof(gi), &gi))
1103 goto nla_put_failure;
1104 break;
1105 }
1106 case CLG_GILB_ELL: {
1107 struct tc_netem_gemodel ge = {
1108 .p = q->clg.a1,
1109 .r = q->clg.a2,
1110 .h = q->clg.a3,
1111 .k1 = q->clg.a4,
1112 };
1113
1114 if (nla_put(skb, NETEM_LOSS_GE, sizeof(ge), &ge))
1115 goto nla_put_failure;
1116 break;
1117 }
1118 }
1119
1120 nla_nest_end(skb, nest);
1121 return 0;
1122
1123 nla_put_failure:
1124 nla_nest_cancel(skb, nest);
1125 return -1;
1126 }
1127
1128 static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
1129 {
1130 const struct netem_sched_data *q = qdisc_priv(sch);
1131 struct nlattr *nla = (struct nlattr *) skb_tail_pointer(skb);
1132 struct tc_netem_qopt qopt;
1133 struct tc_netem_corr cor;
1134 struct tc_netem_reorder reorder;
1135 struct tc_netem_corrupt corrupt;
1136 struct tc_netem_rate rate;
1137 struct tc_netem_slot slot;
1138
1139 qopt.latency = min_t(psched_tdiff_t, PSCHED_NS2TICKS(q->latency),
1140 UINT_MAX);
1141 qopt.jitter = min_t(psched_tdiff_t, PSCHED_NS2TICKS(q->jitter),
1142 UINT_MAX);
1143 qopt.limit = q->limit;
1144 qopt.loss = q->loss;
1145 qopt.gap = q->gap;
1146 qopt.duplicate = q->duplicate;
1147 if (nla_put(skb, TCA_OPTIONS, sizeof(qopt), &qopt))
1148 goto nla_put_failure;
1149
1150 if (nla_put(skb, TCA_NETEM_LATENCY64, sizeof(q->latency), &q->latency))
1151 goto nla_put_failure;
1152
1153 if (nla_put(skb, TCA_NETEM_JITTER64, sizeof(q->jitter), &q->jitter))
1154 goto nla_put_failure;
1155
1156 cor.delay_corr = q->delay_cor.rho;
1157 cor.loss_corr = q->loss_cor.rho;
1158 cor.dup_corr = q->dup_cor.rho;
1159 if (nla_put(skb, TCA_NETEM_CORR, sizeof(cor), &cor))
1160 goto nla_put_failure;
1161
1162 reorder.probability = q->reorder;
1163 reorder.correlation = q->reorder_cor.rho;
1164 if (nla_put(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder))
1165 goto nla_put_failure;
1166
1167 corrupt.probability = q->corrupt;
1168 corrupt.correlation = q->corrupt_cor.rho;
1169 if (nla_put(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt))
1170 goto nla_put_failure;
1171
1172 if (q->rate >= (1ULL << 32)) {
1173 if (nla_put_u64_64bit(skb, TCA_NETEM_RATE64, q->rate,
1174 TCA_NETEM_PAD))
1175 goto nla_put_failure;
1176 rate.rate = ~0U;
1177 } else {
1178 rate.rate = q->rate;
1179 }
1180 rate.packet_overhead = q->packet_overhead;
1181 rate.cell_size = q->cell_size;
1182 rate.cell_overhead = q->cell_overhead;
1183 if (nla_put(skb, TCA_NETEM_RATE, sizeof(rate), &rate))
1184 goto nla_put_failure;
1185
1186 if (q->ecn && nla_put_u32(skb, TCA_NETEM_ECN, q->ecn))
1187 goto nla_put_failure;
1188
1189 if (dump_loss_model(q, skb) != 0)
1190 goto nla_put_failure;
1191
1192 if (q->slot_config.min_delay | q->slot_config.max_delay |
1193 q->slot_config.dist_jitter) {
1194 slot = q->slot_config;
1195 if (slot.max_packets == INT_MAX)
1196 slot.max_packets = 0;
1197 if (slot.max_bytes == INT_MAX)
1198 slot.max_bytes = 0;
1199 if (nla_put(skb, TCA_NETEM_SLOT, sizeof(slot), &slot))
1200 goto nla_put_failure;
1201 }
1202
1203 return nla_nest_end(skb, nla);
1204
1205 nla_put_failure:
1206 nlmsg_trim(skb, nla);
1207 return -1;
1208 }
1209
1210 static int netem_dump_class(struct Qdisc *sch, unsigned long cl,
1211 struct sk_buff *skb, struct tcmsg *tcm)
1212 {
1213 struct netem_sched_data *q = qdisc_priv(sch);
1214
1215 if (cl != 1 || !q->qdisc) /* only one class */
1216 return -ENOENT;
1217
1218 tcm->tcm_handle |= TC_H_MIN(1);
1219 tcm->tcm_info = q->qdisc->handle;
1220
1221 return 0;
1222 }
1223
1224 static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1225 struct Qdisc **old, struct netlink_ext_ack *extack)
1226 {
1227 struct netem_sched_data *q = qdisc_priv(sch);
1228
1229 *old = qdisc_replace(sch, new, &q->qdisc);
1230 return 0;
1231 }
1232
1233 static struct Qdisc *netem_leaf(struct Qdisc *sch, unsigned long arg)
1234 {
1235 struct netem_sched_data *q = qdisc_priv(sch);
1236 return q->qdisc;
1237 }
1238
1239 static unsigned long netem_find(struct Qdisc *sch, u32 classid)
1240 {
1241 return 1;
1242 }
1243
1244 static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker)
1245 {
1246 if (!walker->stop) {
1247 if (walker->count >= walker->skip)
1248 if (walker->fn(sch, 1, walker) < 0) {
1249 walker->stop = 1;
1250 return;
1251 }
1252 walker->count++;
1253 }
1254 }
1255
1256 static const struct Qdisc_class_ops netem_class_ops = {
1257 .graft = netem_graft,
1258 .leaf = netem_leaf,
1259 .find = netem_find,
1260 .walk = netem_walk,
1261 .dump = netem_dump_class,
1262 };
1263
1264 static struct Qdisc_ops netem_qdisc_ops __read_mostly = {
1265 .id = "netem",
1266 .cl_ops = &netem_class_ops,
1267 .priv_size = sizeof(struct netem_sched_data),
1268 .enqueue = netem_enqueue,
1269 .dequeue = netem_dequeue,
1270 .peek = qdisc_peek_dequeued,
1271 .init = netem_init,
1272 .reset = netem_reset,
1273 .destroy = netem_destroy,
1274 .change = netem_change,
1275 .dump = netem_dump,
1276 .owner = THIS_MODULE,
1277 };
1278
1279
1280 static int __init netem_module_init(void)
1281 {
1282 pr_info("netem: version " VERSION "\n");
1283 return register_qdisc(&netem_qdisc_ops);
1284 }
1285 static void __exit netem_module_exit(void)
1286 {
1287 unregister_qdisc(&netem_qdisc_ops);
1288 }
1289 module_init(netem_module_init)
1290 module_exit(netem_module_exit)
1291 MODULE_LICENSE("GPL");