/*
 * net/sched/sch_sfq.c	Stochastic Fairness Queueing discipline.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/string.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/jhash.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
#include <net/red.h>


/* Stochastic Fairness Queuing algorithm.
   =======================================

   Source:
   Paul E. McKenney "Stochastic Fairness Queuing",
   IEEE INFOCOM'90 Proceedings, San Francisco, 1990.

   Paul E. McKenney "Stochastic Fairness Queuing",
   "Interworking: Research and Experience", v.2, 1991, p.113-131.


   See also:
   M. Shreedhar and George Varghese "Efficient Fair
   Queuing using Deficit Round Robin", Proc. SIGCOMM 95.


   This is not the thing that is usually called (W)FQ nowadays.
   It does not use any timestamp mechanism, but instead
   processes queues in round-robin order.

   ADVANTAGE:

   - It is very cheap. Both CPU and memory requirements are minimal.

   DRAWBACKS:

   - "Stochastic" -> It is not 100% fair.
     When hash collisions occur, several flows are considered as one.

   - "Round-robin" -> It introduces larger delays than virtual clock
     based schemes, and should not be used for isolating interactive
     traffic from non-interactive. It means that this scheduler
     should be used as leaf of CBQ or P3, which put interactive traffic
     to higher priority band.

   We still need true WFQ for top level CSZ, but using WFQ
   for the best effort traffic is absolutely pointless:
   SFQ is superior for this purpose.

   IMPLEMENTATION:
   This implementation limits:
   - maximal queue length per flow to 127 packets.
   - max mtu to 2^18-1;
   - max 65408 flows,
   - number of hash buckets to 65536.

   It is easy to increase these values, but not in flight.  */
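/* As a concrete example of the default sizing (set up in sfq_init() below):
 * 1024 hash buckets (divisor), 128 flow slots, a per-flow depth of 127
 * packets and an overall limit of 127 packets.  The compile-time maxima
 * quoted above bound what user space may request at configuration time.
 */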

#define SFQ_MAX_DEPTH		127 /* max number of packets per flow */
#define SFQ_DEFAULT_FLOWS	128
#define SFQ_MAX_FLOWS		(0x10000 - SFQ_MAX_DEPTH - 1) /* max number of flows */
#define SFQ_EMPTY_SLOT		0xffff
#define SFQ_DEFAULT_HASH_DIVISOR 1024

/* We use 16 bits to store allot, and want to handle packets up to 64K
 * Scale allot by 8 (1<<3) so that no overflow occurs.
 */
#define SFQ_ALLOT_SHIFT		3
#define SFQ_ALLOT_SIZE(X)	DIV_ROUND_UP(X, 1 << SFQ_ALLOT_SHIFT)
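/* For example, with a typical Ethernet quantum of 1514 bytes the scaled
 * per-round credit is SFQ_ALLOT_SIZE(1514) = DIV_ROUND_UP(1514, 8) = 190,
 * and even a 64KB packet scales to 8192, which comfortably fits in the
 * 16-bit signed 'allot' field below.
 */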

/* This type should contain at least SFQ_MAX_DEPTH + 1 + SFQ_MAX_FLOWS values */
typedef u16 sfq_index;

/*
 * We don't use pointers to save space.
 * Small indexes [0 ... SFQ_MAX_FLOWS - 1] are 'pointers' to slots[] array
 * while following values [SFQ_MAX_FLOWS ... SFQ_MAX_FLOWS + SFQ_MAX_DEPTH]
 * are 'pointers' to dep[] array
 */
struct sfq_head {
	sfq_index	next;
	sfq_index	prev;
};

struct sfq_slot {
	struct sk_buff	*skblist_next;
	struct sk_buff	*skblist_prev;
	sfq_index	qlen;		/* number of skbs in skblist */
	sfq_index	next;		/* next slot in sfq RR chain */
	struct sfq_head	dep;		/* anchor in dep[] chains */
	unsigned short	hash;		/* hash value (index in ht[]) */
	short		allot;		/* credit for this slot */

	unsigned int	backlog;
	struct red_vars vars;
};

struct sfq_sched_data {
/* frequently used fields */
	int		limit;		/* limit of total number of packets in this qdisc */
	unsigned int	divisor;	/* number of slots in hash table */
	u8		headdrop;
	u8		maxdepth;	/* limit of packets per flow */

	u32		perturbation;
	u8		cur_depth;	/* depth of longest slot */
	u8		flags;
	unsigned short	scaled_quantum;	/* SFQ_ALLOT_SIZE(quantum) */
	struct tcf_proto __rcu *filter_list;
	sfq_index	*ht;		/* Hash table ('divisor' slots) */
	struct sfq_slot	*slots;		/* Flows table ('maxflows' entries) */

	struct red_parms *red_parms;
	struct tc_sfqred_stats stats;
	struct sfq_slot *tail;		/* current slot in round */

	struct sfq_head	dep[SFQ_MAX_DEPTH + 1];
					/* Linked lists of slots, indexed by depth
					 * dep[0] : list of unused flows
					 * dep[1] : list of flows with 1 packet
					 * dep[X] : list of flows with X packets
					 */

	unsigned int	maxflows;	/* number of flows in flows array */
	int		perturb_period;
	unsigned int	quantum;	/* Allotment per round: MUST BE >= MTU */
	struct timer_list perturb_timer;
};

/*
 * sfq_head are either in a sfq_slot or in dep[] array
 */
static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index val)
{
	if (val < SFQ_MAX_FLOWS)
		return &q->slots[val].dep;
	return &q->dep[val - SFQ_MAX_FLOWS];
}
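/* For example, with SFQ_MAX_FLOWS = 65408: val = 5 resolves to
 * q->slots[5].dep (a flow slot), while val = 65411 resolves to q->dep[3],
 * the anchor of the list of flows currently holding 3 packets.
 */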

static unsigned int sfq_hash(const struct sfq_sched_data *q,
			     const struct sk_buff *skb)
{
	return skb_get_hash_perturb(skb, q->perturbation) & (q->divisor - 1);
}

static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
				 int *qerr)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct tcf_result res;
	struct tcf_proto *fl;
	int result;

	if (TC_H_MAJ(skb->priority) == sch->handle &&
	    TC_H_MIN(skb->priority) > 0 &&
	    TC_H_MIN(skb->priority) <= q->divisor)
		return TC_H_MIN(skb->priority);

	fl = rcu_dereference_bh(q->filter_list);
	if (!fl)
		return sfq_hash(q, skb) + 1;

	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
	result = tc_classify(skb, fl, &res, false);
	if (result >= 0) {
#ifdef CONFIG_NET_CLS_ACT
		switch (result) {
		case TC_ACT_STOLEN:
		case TC_ACT_QUEUED:
			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
		case TC_ACT_SHOT:
			return 0;
		}
#endif
		if (TC_H_MIN(res.classid) <= q->divisor)
			return TC_H_MIN(res.classid);
	}
	return 0;
}

/*
 * x : slot number [0 .. SFQ_MAX_FLOWS - 1]
 */
static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index p, n;
	struct sfq_slot *slot = &q->slots[x];
	int qlen = slot->qlen;

	p = qlen + SFQ_MAX_FLOWS;
	n = q->dep[qlen].next;

	slot->dep.next = n;
	slot->dep.prev = p;

	q->dep[qlen].next = x;		/* sfq_dep_head(q, p)->next = x */
	sfq_dep_head(q, n)->prev = x;
}
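/* After sfq_link(), slot x sits at the head of the dep[] list matching its
 * current queue length: a slot holding 3 packets is linked right after
 * q->dep[3].  This is how sfq_drop() finds the longest flow,
 * q->dep[q->cur_depth].next, in constant time.
 */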

#define sfq_unlink(q, x, n, p)			\
	do {					\
		n = q->slots[x].dep.next;	\
		p = q->slots[x].dep.prev;	\
		sfq_dep_head(q, p)->next = n;	\
		sfq_dep_head(q, n)->prev = p;	\
	} while (0)


static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index p, n;
	int d;

	sfq_unlink(q, x, n, p);

	d = q->slots[x].qlen--;
	if (n == p && q->cur_depth == d)
		q->cur_depth--;
	sfq_link(q, x);
}

static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index p, n;
	int d;

	sfq_unlink(q, x, n, p);

	d = ++q->slots[x].qlen;
	if (q->cur_depth < d)
		q->cur_depth = d;
	sfq_link(q, x);
}

/* helper functions : might be changed when/if skb use a standard list_head */

/* remove one skb from tail of slot queue */
static inline struct sk_buff *slot_dequeue_tail(struct sfq_slot *slot)
{
	struct sk_buff *skb = slot->skblist_prev;

	slot->skblist_prev = skb->prev;
	skb->prev->next = (struct sk_buff *)slot;
	skb->next = skb->prev = NULL;
	return skb;
}

/* remove one skb from head of slot queue */
static inline struct sk_buff *slot_dequeue_head(struct sfq_slot *slot)
{
	struct sk_buff *skb = slot->skblist_next;

	slot->skblist_next = skb->next;
	skb->next->prev = (struct sk_buff *)slot;
	skb->next = skb->prev = NULL;
	return skb;
}

static inline void slot_queue_init(struct sfq_slot *slot)
{
	memset(slot, 0, sizeof(*slot));
	slot->skblist_prev = slot->skblist_next = (struct sk_buff *)slot;
}

/* add skb to slot queue (tail add) */
static inline void slot_queue_add(struct sfq_slot *slot, struct sk_buff *skb)
{
	skb->prev = slot->skblist_prev;
	skb->next = (struct sk_buff *)slot;
	slot->skblist_prev->next = skb;
	slot->skblist_prev = skb;
}
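/* Note the trick above: the slot itself doubles as the list head, so an
 * empty queue is simply skblist_next == skblist_prev == (struct sk_buff *)slot
 * (as set up by slot_queue_init()), and the dequeue helpers need no NULL
 * checks; callers guard on slot->qlen instead.
 */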

static unsigned int sfq_drop(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	sfq_index x, d = q->cur_depth;
	struct sk_buff *skb;
	unsigned int len;
	struct sfq_slot *slot;

	/* Queue is full! Find the longest slot and drop tail packet from it */
	if (d > 1) {
		x = q->dep[d].next;
		slot = &q->slots[x];
drop:
		skb = q->headdrop ? slot_dequeue_head(slot) : slot_dequeue_tail(slot);
		len = qdisc_pkt_len(skb);
		slot->backlog -= len;
		sfq_dec(q, x);
		sch->q.qlen--;
		qdisc_qstats_drop(sch);
		qdisc_qstats_backlog_dec(sch, skb);
		kfree_skb(skb);
		return len;
	}

	if (d == 1) {
		/* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */
		x = q->tail->next;
		slot = &q->slots[x];
		q->tail->next = slot->next;
		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
		goto drop;
	}

	return 0;
}

/* Is ECN parameter configured */
static int sfq_prob_mark(const struct sfq_sched_data *q)
{
	return q->flags & TC_RED_ECN;
}

/* Should packets over max threshold just be marked */
static int sfq_hard_mark(const struct sfq_sched_data *q)
{
	return (q->flags & (TC_RED_ECN | TC_RED_HARDDROP)) == TC_RED_ECN;
}

static int sfq_headdrop(const struct sfq_sched_data *q)
{
	return q->headdrop;
}
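/* In other words: with TC_RED_ECN set, packets selected probabilistically by
 * RED are CE-marked instead of dropped, and packets over the max threshold
 * are also marked unless TC_RED_HARDDROP is set as well, in which case they
 * are dropped regardless.  See the RED_PROB_MARK/RED_HARD_MARK handling in
 * sfq_enqueue() below.
 */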

static int
sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned int hash, dropped;
	sfq_index x, qlen;
	struct sfq_slot *slot;
	int uninitialized_var(ret);
	struct sk_buff *head;
	int delta;

	hash = sfq_classify(skb, sch, &ret);
	if (hash == 0) {
		if (ret & __NET_XMIT_BYPASS)
			qdisc_qstats_drop(sch);
		kfree_skb(skb);
		return ret;
	}
	hash--;

	x = q->ht[hash];
	slot = &q->slots[x];
	if (x == SFQ_EMPTY_SLOT) {
		x = q->dep[0].next; /* get a free slot */
		if (x >= SFQ_MAX_FLOWS)
			return qdisc_drop(skb, sch, to_free);
		q->ht[hash] = x;
		slot = &q->slots[x];
		slot->hash = hash;
		slot->backlog = 0; /* should already be 0 anyway... */
		red_set_vars(&slot->vars);
		goto enqueue;
	}
	if (q->red_parms) {
		slot->vars.qavg = red_calc_qavg_no_idle_time(q->red_parms,
							&slot->vars,
							slot->backlog);
		switch (red_action(q->red_parms,
				   &slot->vars,
				   slot->vars.qavg)) {
		case RED_DONT_MARK:
			break;

		case RED_PROB_MARK:
			qdisc_qstats_overlimit(sch);
			if (sfq_prob_mark(q)) {
				/* We know we have at least one packet in queue */
				if (sfq_headdrop(q) &&
				    INET_ECN_set_ce(slot->skblist_next)) {
					q->stats.prob_mark_head++;
					break;
				}
				if (INET_ECN_set_ce(skb)) {
					q->stats.prob_mark++;
					break;
				}
			}
			q->stats.prob_drop++;
			goto congestion_drop;

		case RED_HARD_MARK:
			qdisc_qstats_overlimit(sch);
			if (sfq_hard_mark(q)) {
				/* We know we have at least one packet in queue */
				if (sfq_headdrop(q) &&
				    INET_ECN_set_ce(slot->skblist_next)) {
					q->stats.forced_mark_head++;
					break;
				}
				if (INET_ECN_set_ce(skb)) {
					q->stats.forced_mark++;
					break;
				}
			}
			q->stats.forced_drop++;
			goto congestion_drop;
		}
	}

	if (slot->qlen >= q->maxdepth) {
congestion_drop:
		if (!sfq_headdrop(q))
			return qdisc_drop(skb, sch, to_free);

		/* We know we have at least one packet in queue */
		head = slot_dequeue_head(slot);
		delta = qdisc_pkt_len(head) - qdisc_pkt_len(skb);
		sch->qstats.backlog -= delta;
		slot->backlog -= delta;
		qdisc_drop(head, sch, to_free);

		slot_queue_add(slot, skb);
		return NET_XMIT_CN;
	}

enqueue:
	qdisc_qstats_backlog_inc(sch, skb);
	slot->backlog += qdisc_pkt_len(skb);
	slot_queue_add(slot, skb);
	sfq_inc(q, x);
	if (slot->qlen == 1) {		/* The flow is new */
		if (q->tail == NULL) {	/* It is the first flow */
			slot->next = x;
		} else {
			slot->next = q->tail->next;
			q->tail->next = x;
		}
		/* We put this flow at the end of our flow list.
		 * This might sound unfair for a new flow to wait after old ones,
		 * but we could end up servicing new flows only, and freeze old ones.
		 */
		q->tail = slot;
		/* We could use a bigger initial quantum for new flows */
		slot->allot = q->scaled_quantum;
	}
	if (++sch->q.qlen <= q->limit)
		return NET_XMIT_SUCCESS;

	qlen = slot->qlen;
	dropped = sfq_drop(sch);
	/* Return Congestion Notification only if we dropped a packet
	 * from this flow.
	 */
	if (qlen != slot->qlen)
		return NET_XMIT_CN;

	/* As we dropped a packet, better let upper stack know this */
	qdisc_tree_reduce_backlog(sch, 1, dropped);
	return NET_XMIT_SUCCESS;
}

static struct sk_buff *
sfq_dequeue(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;
	sfq_index a, next_a;
	struct sfq_slot *slot;

	/* No active slots */
	if (q->tail == NULL)
		return NULL;

next_slot:
	a = q->tail->next;
	slot = &q->slots[a];
	if (slot->allot <= 0) {
		q->tail = slot;
		slot->allot += q->scaled_quantum;
		goto next_slot;
	}
	skb = slot_dequeue_head(slot);
	sfq_dec(q, a);
	qdisc_bstats_update(sch, skb);
	sch->q.qlen--;
	qdisc_qstats_backlog_dec(sch, skb);
	slot->backlog -= qdisc_pkt_len(skb);
	/* Is the slot empty? */
	if (slot->qlen == 0) {
		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
		next_a = slot->next;
		if (a == next_a) {
			q->tail = NULL; /* no more active slots */
			return skb;
		}
		q->tail->next = next_a;
	} else {
		slot->allot -= SFQ_ALLOT_SIZE(qdisc_pkt_len(skb));
	}
	return skb;
}
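/* The allot bookkeeping above is a scaled deficit round robin.  For example,
 * with a quantum of 1514 bytes (scaled_quantum = 190), dequeuing a full-size
 * 1514-byte packet costs the flow SFQ_ALLOT_SIZE(1514) = 190 credits; once a
 * flow's allot reaches zero or below it becomes the new tail and is topped up
 * by another scaled_quantum before the next round reaches it.
 */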

static void
sfq_reset(struct Qdisc *sch)
{
	struct sk_buff *skb;

	while ((skb = sfq_dequeue(sch)) != NULL)
		rtnl_kfree_skbs(skb, skb);
}

/*
 * When q->perturbation is changed, we rehash all queued skbs
 * to avoid OOO (Out Of Order) effects.
 * We don't use sfq_dequeue()/sfq_enqueue() because we don't want to change
 * counters.
 */
static void sfq_rehash(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;
	int i;
	struct sfq_slot *slot;
	struct sk_buff_head list;
	int dropped = 0;
	unsigned int drop_len = 0;

	__skb_queue_head_init(&list);

	for (i = 0; i < q->maxflows; i++) {
		slot = &q->slots[i];
		if (!slot->qlen)
			continue;
		while (slot->qlen) {
			skb = slot_dequeue_head(slot);
			sfq_dec(q, i);
			__skb_queue_tail(&list, skb);
		}
		slot->backlog = 0;
		red_set_vars(&slot->vars);
		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
	}
	q->tail = NULL;

	while ((skb = __skb_dequeue(&list)) != NULL) {
		unsigned int hash = sfq_hash(q, skb);
		sfq_index x = q->ht[hash];

		slot = &q->slots[x];
		if (x == SFQ_EMPTY_SLOT) {
			x = q->dep[0].next; /* get a free slot */
			if (x >= SFQ_MAX_FLOWS) {
drop:
				qdisc_qstats_backlog_dec(sch, skb);
				drop_len += qdisc_pkt_len(skb);
				kfree_skb(skb);
				dropped++;
				continue;
			}
			q->ht[hash] = x;
			slot = &q->slots[x];
			slot->hash = hash;
		}
		if (slot->qlen >= q->maxdepth)
			goto drop;
		slot_queue_add(slot, skb);
		if (q->red_parms)
			slot->vars.qavg = red_calc_qavg(q->red_parms,
							&slot->vars,
							slot->backlog);
		slot->backlog += qdisc_pkt_len(skb);
		sfq_inc(q, x);
		if (slot->qlen == 1) {		/* The flow is new */
			if (q->tail == NULL) {	/* It is the first flow */
				slot->next = x;
			} else {
				slot->next = q->tail->next;
				q->tail->next = x;
			}
			q->tail = slot;
			slot->allot = q->scaled_quantum;
		}
	}
	sch->q.qlen -= dropped;
	qdisc_tree_reduce_backlog(sch, dropped, drop_len);
}

static void sfq_perturbation(unsigned long arg)
{
	struct Qdisc *sch = (struct Qdisc *)arg;
	struct sfq_sched_data *q = qdisc_priv(sch);
	spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch));

	spin_lock(root_lock);
	q->perturbation = prandom_u32();
	if (!q->filter_list && q->tail)
		sfq_rehash(sch);
	spin_unlock(root_lock);

	if (q->perturb_period)
		mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
}

static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct tc_sfq_qopt *ctl = nla_data(opt);
	struct tc_sfq_qopt_v1 *ctl_v1 = NULL;
	unsigned int qlen, dropped = 0;
	struct red_parms *p = NULL;

	if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
		return -EINVAL;
	if (opt->nla_len >= nla_attr_size(sizeof(*ctl_v1)))
		ctl_v1 = nla_data(opt);
	if (ctl->divisor &&
	    (!is_power_of_2(ctl->divisor) || ctl->divisor > 65536))
		return -EINVAL;
	if (ctl_v1 && ctl_v1->qth_min) {
		p = kmalloc(sizeof(*p), GFP_KERNEL);
		if (!p)
			return -ENOMEM;
	}
	sch_tree_lock(sch);
	if (ctl->quantum) {
		q->quantum = ctl->quantum;
		q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
	}
	q->perturb_period = ctl->perturb_period * HZ;
	if (ctl->flows)
		q->maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS);
	if (ctl->divisor) {
		q->divisor = ctl->divisor;
		q->maxflows = min_t(u32, q->maxflows, q->divisor);
	}
	if (ctl_v1) {
		if (ctl_v1->depth)
			q->maxdepth = min_t(u32, ctl_v1->depth, SFQ_MAX_DEPTH);
		if (p) {
			swap(q->red_parms, p);
			red_set_parms(q->red_parms,
				      ctl_v1->qth_min, ctl_v1->qth_max,
				      ctl_v1->Wlog,
				      ctl_v1->Plog, ctl_v1->Scell_log,
				      NULL,
				      ctl_v1->max_P);
		}
		q->flags = ctl_v1->flags;
		q->headdrop = ctl_v1->headdrop;
	}
	if (ctl->limit) {
		q->limit = min_t(u32, ctl->limit, q->maxdepth * q->maxflows);
		q->maxflows = min_t(u32, q->maxflows, q->limit);
	}

	qlen = sch->q.qlen;
	while (sch->q.qlen > q->limit)
		dropped += sfq_drop(sch);
	qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, dropped);

	del_timer(&q->perturb_timer);
	if (q->perturb_period) {
		mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
		q->perturbation = prandom_u32();
	}
	sch_tree_unlock(sch);
	kfree(p);
	return 0;
}

static void *sfq_alloc(size_t sz)
{
	void *ptr = kmalloc(sz, GFP_KERNEL | __GFP_NOWARN);

	if (!ptr)
		ptr = vmalloc(sz);
	return ptr;
}

static void sfq_free(void *addr)
{
	kvfree(addr);
}

static void sfq_destroy(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);

	tcf_destroy_chain(&q->filter_list);
	q->perturb_period = 0;
	del_timer_sync(&q->perturb_timer);
	sfq_free(q->ht);
	sfq_free(q->slots);
	kfree(q->red_parms);
}

static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	int i;

	setup_deferrable_timer(&q->perturb_timer, sfq_perturbation,
			       (unsigned long)sch);

	for (i = 0; i < SFQ_MAX_DEPTH + 1; i++) {
		q->dep[i].next = i + SFQ_MAX_FLOWS;
		q->dep[i].prev = i + SFQ_MAX_FLOWS;
	}

	q->limit = SFQ_MAX_DEPTH;
	q->maxdepth = SFQ_MAX_DEPTH;
	q->cur_depth = 0;
	q->tail = NULL;
	q->divisor = SFQ_DEFAULT_HASH_DIVISOR;
	q->maxflows = SFQ_DEFAULT_FLOWS;
	q->quantum = psched_mtu(qdisc_dev(sch));
	q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
	q->perturb_period = 0;
	q->perturbation = prandom_u32();

	if (opt) {
		int err = sfq_change(sch, opt);
		if (err)
			return err;
	}

	q->ht = sfq_alloc(sizeof(q->ht[0]) * q->divisor);
	q->slots = sfq_alloc(sizeof(q->slots[0]) * q->maxflows);
	if (!q->ht || !q->slots) {
		/* Note: sfq_destroy() will be called by our caller */
		return -ENOMEM;
	}

	for (i = 0; i < q->divisor; i++)
		q->ht[i] = SFQ_EMPTY_SLOT;

	for (i = 0; i < q->maxflows; i++) {
		slot_queue_init(&q->slots[i]);
		sfq_link(q, i);
	}
	if (q->limit >= 1)
		sch->flags |= TCQ_F_CAN_BYPASS;
	else
		sch->flags &= ~TCQ_F_CAN_BYPASS;
	return 0;
}
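/* From user space, a qdisc backed by this code is typically configured with
 * iproute2, e.g. (assuming an eth0 interface; values are illustrative):
 *
 *	tc qdisc add dev eth0 root sfq perturb 10
 *	tc qdisc change dev eth0 root sfq limit 100 quantum 1514 headdrop
 *
 * which reaches sfq_init()/sfq_change() above via the netlink options
 * carried in struct tc_sfq_qopt / struct tc_sfq_qopt_v1.
 */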

static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned char *b = skb_tail_pointer(skb);
	struct tc_sfq_qopt_v1 opt;
	struct red_parms *p = q->red_parms;

	memset(&opt, 0, sizeof(opt));
	opt.v0.quantum = q->quantum;
	opt.v0.perturb_period = q->perturb_period / HZ;
	opt.v0.limit = q->limit;
	opt.v0.divisor = q->divisor;
	opt.v0.flows = q->maxflows;
	opt.depth = q->maxdepth;
	opt.headdrop = q->headdrop;

	if (p) {
		opt.qth_min = p->qth_min >> p->Wlog;
		opt.qth_max = p->qth_max >> p->Wlog;
		opt.Wlog = p->Wlog;
		opt.Plog = p->Plog;
		opt.Scell_log = p->Scell_log;
		opt.max_P = p->max_P;
	}
	memcpy(&opt.stats, &q->stats, sizeof(opt.stats));
	opt.flags = q->flags;

	if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
		goto nla_put_failure;

	return skb->len;

nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static struct Qdisc *sfq_leaf(struct Qdisc *sch, unsigned long arg)
{
	return NULL;
}

static unsigned long sfq_get(struct Qdisc *sch, u32 classid)
{
	return 0;
}

static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent,
			      u32 classid)
{
	/* we cannot bypass queue discipline anymore */
	sch->flags &= ~TCQ_F_CAN_BYPASS;
	return 0;
}

static void sfq_put(struct Qdisc *q, unsigned long cl)
{
}

static struct tcf_proto __rcu **sfq_find_tcf(struct Qdisc *sch,
					     unsigned long cl)
{
	struct sfq_sched_data *q = qdisc_priv(sch);

	if (cl)
		return NULL;
	return &q->filter_list;
}

static int sfq_dump_class(struct Qdisc *sch, unsigned long cl,
			  struct sk_buff *skb, struct tcmsg *tcm)
{
	tcm->tcm_handle |= TC_H_MIN(cl);
	return 0;
}

static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
				struct gnet_dump *d)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	sfq_index idx = q->ht[cl - 1];
	struct gnet_stats_queue qs = { 0 };
	struct tc_sfq_xstats xstats = { 0 };

	if (idx != SFQ_EMPTY_SLOT) {
		const struct sfq_slot *slot = &q->slots[idx];

		xstats.allot = slot->allot << SFQ_ALLOT_SHIFT;
		qs.qlen = slot->qlen;
		qs.backlog = slot->backlog;
	}
	if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
		return -1;
	return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
}

static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned int i;

	if (arg->stop)
		return;

	for (i = 0; i < q->divisor; i++) {
		if (q->ht[i] == SFQ_EMPTY_SLOT ||
		    arg->count < arg->skip) {
			arg->count++;
			continue;
		}
		if (arg->fn(sch, i + 1, arg) < 0) {
			arg->stop = 1;
			break;
		}
		arg->count++;
	}
}

static const struct Qdisc_class_ops sfq_class_ops = {
	.leaf		=	sfq_leaf,
	.get		=	sfq_get,
	.put		=	sfq_put,
	.tcf_chain	=	sfq_find_tcf,
	.bind_tcf	=	sfq_bind,
	.unbind_tcf	=	sfq_put,
	.dump		=	sfq_dump_class,
	.dump_stats	=	sfq_dump_class_stats,
	.walk		=	sfq_walk,
};

static struct Qdisc_ops sfq_qdisc_ops __read_mostly = {
	.cl_ops		=	&sfq_class_ops,
	.id		=	"sfq",
	.priv_size	=	sizeof(struct sfq_sched_data),
	.enqueue	=	sfq_enqueue,
	.dequeue	=	sfq_dequeue,
	.peek		=	qdisc_peek_dequeued,
	.init		=	sfq_init,
	.reset		=	sfq_reset,
	.destroy	=	sfq_destroy,
	.change		=	NULL,
	.dump		=	sfq_dump,
	.owner		=	THIS_MODULE,
};

static int __init sfq_module_init(void)
{
	return register_qdisc(&sfq_qdisc_ops);
}
static void __exit sfq_module_exit(void)
{
	unregister_qdisc(&sfq_qdisc_ops);
}
module_init(sfq_module_init)
module_exit(sfq_module_exit)
MODULE_LICENSE("GPL");