/*
 * net/sched/sch_fq.c Fair Queue Packet Scheduler (per flow pacing)
 *
 * Copyright (C) 2013 Eric Dumazet <edumazet@google.com>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 * Meant to be mostly used for locally generated traffic:
 * Fast classification depends on skb->sk being set before reaching us.
 * If not (router workload), we use rxhash as a fallback, with a 32-bit wide hash.
 * All packets belonging to a socket are considered as a 'flow'.
 *
 * Flows are dynamically allocated and stored in a hash table of RB trees.
 * They are also part of one Round Robin list ('new' or 'old' flows).
 *
 * Burst avoidance (aka pacing) capability:
 *
 * Transport (eg TCP) can set in sk->sk_pacing_rate a rate, enqueue a
 * bunch of packets, and this packet scheduler adds delay between
 * packets to respect rate limitation.
 *
 * enqueue() :
 *   - lookup one RB tree (out of 1024 or more) to find the flow.
 *     If the flow does not exist yet, create it and add it to the tree.
 *     Add skb to the per flow list of skb (fifo).
 *   - Use a special fifo for high prio packets
 *
 * dequeue() : serves flows in Round Robin
 * Note : When a flow becomes empty, we do not immediately remove it from
 * rb trees, for performance reasons (it's expected to send additional packets,
 * or the SLAB cache will reuse the socket for another flow)
 */
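
/*
 * Worked example (editorial note, not part of the original comment):
 * pacing is enforced in fq_dequeue() once a flow exhausts its credit,
 * by delaying the flow's next packet by len * NSEC_PER_SEC / rate.
 * For a 1500 byte packet and sk_pacing_rate = 125,000,000 bytes/s
 * (1 Gbit/s), that is 1500 * 10^9 / 125,000,000 = 12,000 ns = 12 us,
 * clamped to at most 125 ms for oversized delays.
 */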

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/string.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/hash.h>
#include <linux/prefetch.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/sock.h>
#include <net/tcp_states.h>

/*
 * Per flow structure, dynamically allocated
 */
struct fq_flow {
	struct sk_buff *head;		/* list of skbs for this flow : first skb */
	union {
		struct sk_buff *tail;	/* last skb in the list */
		unsigned long  age;	/* jiffies when flow was emptied, for gc */
	};
	struct rb_node fq_node;		/* anchor in fq_root[] trees */
	struct sock *sk;
	int qlen;			/* number of packets in flow queue */
	int credit;
	u32 socket_hash;		/* sk_hash */
	struct fq_flow *next;		/* next pointer in RR lists, or &detached */

	struct rb_node rate_node;	/* anchor in q->delayed tree */
	u64 time_next_packet;
};

struct fq_flow_head {
	struct fq_flow *first;
	struct fq_flow *last;
};

struct fq_sched_data {
	struct fq_flow_head new_flows;

	struct fq_flow_head old_flows;

	struct rb_root delayed;		/* for rate limited flows */
	u64 time_next_delayed_flow;

	struct fq_flow internal;	/* for non classified or high prio packets */
	u32 quantum;
	u32 initial_quantum;
	u32 flow_default_rate;		/* rate per flow : bytes per second */
	u32 flow_max_rate;		/* optional max rate per flow */
	u32 flow_plimit;		/* max packets per flow */
	struct rb_root *fq_root;
	u8 rate_enable;
	u8 fq_trees_log;

	u32 flows;
	u32 inactive_flows;
	u32 throttled_flows;

	u64 stat_gc_flows;
	u64 stat_internal_packets;
	u64 stat_tcp_retrans;
	u64 stat_throttled;
	u64 stat_flows_plimit;
	u64 stat_pkts_too_long;
	u64 stat_allocation_errors;
	struct qdisc_watchdog watchdog;
};

/* special value to mark a detached flow (not on old/new list) */
static struct fq_flow detached, throttled;
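
/*
 * Summary of the states encoded in f->next (editorial note, derived from
 * the code below):
 *   &detached  : flow is on no Round Robin list and may be garbage collected
 *   &throttled : flow sits in q->delayed, keyed by f->time_next_packet
 *   other/NULL : flow is linked on the new_flows or old_flows list
 */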

static void fq_flow_set_detached(struct fq_flow *f)
{
	f->next = &detached;
}

static bool fq_flow_is_detached(const struct fq_flow *f)
{
	return f->next == &detached;
}

static void fq_flow_set_throttled(struct fq_sched_data *q, struct fq_flow *f)
{
	struct rb_node **p = &q->delayed.rb_node, *parent = NULL;

	while (*p) {
		struct fq_flow *aux;

		parent = *p;
		aux = container_of(parent, struct fq_flow, rate_node);
		if (f->time_next_packet >= aux->time_next_packet)
			p = &parent->rb_right;
		else
			p = &parent->rb_left;
	}
	rb_link_node(&f->rate_node, parent, p);
	rb_insert_color(&f->rate_node, &q->delayed);
	q->throttled_flows++;
	q->stat_throttled++;

	f->next = &throttled;
	if (q->time_next_delayed_flow > f->time_next_packet)
		q->time_next_delayed_flow = f->time_next_packet;
}


static struct kmem_cache *fq_flow_cachep __read_mostly;

static void fq_flow_add_tail(struct fq_flow_head *head, struct fq_flow *flow)
{
	if (head->first)
		head->last->next = flow;
	else
		head->first = flow;
	head->last = flow;
	flow->next = NULL;
}

/* limit number of collected flows per round */
#define FQ_GC_MAX 8
#define FQ_GC_AGE (3*HZ)

static bool fq_gc_candidate(const struct fq_flow *f)
{
	return fq_flow_is_detached(f) &&
	       time_after(jiffies, f->age + FQ_GC_AGE);
}

static void fq_gc(struct fq_sched_data *q,
		  struct rb_root *root,
		  struct sock *sk)
{
	struct fq_flow *f, *tofree[FQ_GC_MAX];
	struct rb_node **p, *parent;
	int fcnt = 0;

	p = &root->rb_node;
	parent = NULL;
	while (*p) {
		parent = *p;

		f = container_of(parent, struct fq_flow, fq_node);
		if (f->sk == sk)
			break;

		if (fq_gc_candidate(f)) {
			tofree[fcnt++] = f;
			if (fcnt == FQ_GC_MAX)
				break;
		}

		if (f->sk > sk)
			p = &parent->rb_right;
		else
			p = &parent->rb_left;
	}

	q->flows -= fcnt;
	q->inactive_flows -= fcnt;
	q->stat_gc_flows += fcnt;
	while (fcnt) {
		struct fq_flow *f = tofree[--fcnt];

		rb_erase(&f->fq_node, root);
		kmem_cache_free(fq_flow_cachep, f);
	}
}

static const u8 prio2band[TC_PRIO_MAX + 1] = {
	1, 2, 2, 2, 1, 2, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1
};
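
/*
 * Editorial note: following the usual TC_PRIO_* numbering, the table above
 * maps TC_PRIO_INTERACTIVE (6) and TC_PRIO_CONTROL (7) to band 0, which
 * fq_classify() below serves from the unpaced q->internal flow; the other
 * priorities land in bands 1 or 2, both handled via normal flow lookup.
 */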

static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
{
	struct rb_node **p, *parent;
	struct sock *sk = skb->sk;
	struct rb_root *root;
	struct fq_flow *f;
	int band;

	/* warning: no starvation prevention... */
	band = prio2band[skb->priority & TC_PRIO_MAX];
	if (unlikely(band == 0))
		return &q->internal;

	if (unlikely(!sk)) {
		/* By forcing low order bit to 1, we make sure to not
		 * collide with a local flow (socket pointers are word aligned)
		 */
		sk = (struct sock *)(skb_get_rxhash(skb) | 1L);
	}
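	/* Editorial example with a made-up hash value: a forwarded packet
	 * whose rxhash is 0x2a5f13c6 gets the key 0x2a5f13c7; real struct
	 * sock pointers are at least word aligned, so such odd keys can
	 * never match a local socket's flow.
	 */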

	root = &q->fq_root[hash_32((u32)(long)sk, q->fq_trees_log)];

	if (q->flows >= (2U << q->fq_trees_log) &&
	    q->inactive_flows > q->flows/2)
		fq_gc(q, root, sk);

	p = &root->rb_node;
	parent = NULL;
	while (*p) {
		parent = *p;

		f = container_of(parent, struct fq_flow, fq_node);
		if (f->sk == sk) {
			/* socket might have been reallocated, so check
			 * if its sk_hash is the same.
			 * If not, we need to refill credit with
			 * initial quantum
			 */
			if (unlikely(skb->sk &&
				     f->socket_hash != sk->sk_hash)) {
				f->credit = q->initial_quantum;
				f->socket_hash = sk->sk_hash;
			}
			return f;
		}
		if (f->sk > sk)
			p = &parent->rb_right;
		else
			p = &parent->rb_left;
	}

	f = kmem_cache_zalloc(fq_flow_cachep, GFP_ATOMIC | __GFP_NOWARN);
	if (unlikely(!f)) {
		q->stat_allocation_errors++;
		return &q->internal;
	}
	fq_flow_set_detached(f);
	f->sk = sk;
	if (skb->sk)
		f->socket_hash = sk->sk_hash;
	f->credit = q->initial_quantum;

	rb_link_node(&f->fq_node, parent, p);
	rb_insert_color(&f->fq_node, root);

	q->flows++;
	q->inactive_flows++;
	return f;
}


/* remove one skb from head of flow queue */
static struct sk_buff *fq_dequeue_head(struct fq_flow *flow)
{
	struct sk_buff *skb = flow->head;

	if (skb) {
		flow->head = skb->next;
		skb->next = NULL;
		flow->qlen--;
	}
	return skb;
}

/* We might add detection of retransmits in the future.
 * For the time being, just return false.
 */
static bool skb_is_retransmit(struct sk_buff *skb)
{
	return false;
}

/* add skb to flow queue
 * flow queue is a linked list, kind of FIFO, except for TCP retransmits
 * We special case tcp retransmits to be transmitted before other packets.
 * We rely on the fact that TCP retransmits are unlikely, so we do not waste
 * a separate queue or a pointer.
 * head->  [retrans pkt 1]
 *         [retrans pkt 2]
 *         [ normal pkt 1]
 *         [ normal pkt 2]
 *         [ normal pkt 3]
 * tail->  [ normal pkt 4]
 */
static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb)
{
	struct sk_buff *prev, *head = flow->head;

	skb->next = NULL;
	if (!head) {
		flow->head = skb;
		flow->tail = skb;
		return;
	}
	if (likely(!skb_is_retransmit(skb))) {
		flow->tail->next = skb;
		flow->tail = skb;
		return;
	}

	/* This skb is a tcp retransmit,
	 * find the last retrans packet in the queue
	 */
	prev = NULL;
	while (skb_is_retransmit(head)) {
		prev = head;
		head = head->next;
		if (!head)
			break;
	}
	if (!prev) { /* no rtx packet in queue, become the new head */
		skb->next = flow->head;
		flow->head = skb;
	} else {
		if (prev == flow->tail)
			flow->tail = skb;
		else
			skb->next = prev->next;
		prev->next = skb;
	}
}

static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct fq_sched_data *q = qdisc_priv(sch);
	struct fq_flow *f;

	if (unlikely(sch->q.qlen >= sch->limit))
		return qdisc_drop(skb, sch);

	f = fq_classify(skb, q);
	if (unlikely(f->qlen >= q->flow_plimit && f != &q->internal)) {
		q->stat_flows_plimit++;
		return qdisc_drop(skb, sch);
	}

	f->qlen++;
	flow_queue_add(f, skb);
	if (skb_is_retransmit(skb))
		q->stat_tcp_retrans++;
	sch->qstats.backlog += qdisc_pkt_len(skb);
	if (fq_flow_is_detached(f)) {
		fq_flow_add_tail(&q->new_flows, f);
		if (q->quantum > f->credit)
			f->credit = q->quantum;
		q->inactive_flows--;
		qdisc_unthrottled(sch);
	}
	if (unlikely(f == &q->internal)) {
		q->stat_internal_packets++;
		qdisc_unthrottled(sch);
	}
	sch->q.qlen++;

	return NET_XMIT_SUCCESS;
}

static void fq_check_throttled(struct fq_sched_data *q, u64 now)
{
	struct rb_node *p;

	if (q->time_next_delayed_flow > now)
		return;

	q->time_next_delayed_flow = ~0ULL;
	while ((p = rb_first(&q->delayed)) != NULL) {
		struct fq_flow *f = container_of(p, struct fq_flow, rate_node);

		if (f->time_next_packet > now) {
			q->time_next_delayed_flow = f->time_next_packet;
			break;
		}
		rb_erase(p, &q->delayed);
		q->throttled_flows--;
		fq_flow_add_tail(&q->old_flows, f);
	}
}

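/*
 * Editorial summary of the Deficit Round Robin logic below: each flow
 * carries a byte credit; when the credit of the head flow is not positive,
 * it is refilled by q->quantum and the flow moves to the tail of old_flows.
 * With the default quantum of 2 * psched_mtu() (3028 bytes on standard
 * Ethernet), a flow sending 1514 byte packets dequeues two packets before
 * yielding its turn.
 */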
static struct sk_buff *fq_dequeue(struct Qdisc *sch)
{
	struct fq_sched_data *q = qdisc_priv(sch);
	u64 now = ktime_to_ns(ktime_get());
	struct fq_flow_head *head;
	struct sk_buff *skb;
	struct fq_flow *f;

	skb = fq_dequeue_head(&q->internal);
	if (skb)
		goto out;
	fq_check_throttled(q, now);
begin:
	head = &q->new_flows;
	if (!head->first) {
		head = &q->old_flows;
		if (!head->first) {
			if (q->time_next_delayed_flow != ~0ULL)
				qdisc_watchdog_schedule_ns(&q->watchdog,
							   q->time_next_delayed_flow);
			return NULL;
		}
	}
	f = head->first;

	if (f->credit <= 0) {
		f->credit += q->quantum;
		head->first = f->next;
		fq_flow_add_tail(&q->old_flows, f);
		goto begin;
	}

	if (unlikely(f->head && now < f->time_next_packet)) {
		head->first = f->next;
		fq_flow_set_throttled(q, f);
		goto begin;
	}

	skb = fq_dequeue_head(f);
	if (!skb) {
		head->first = f->next;
		/* force a pass through old_flows to prevent starvation */
		if ((head == &q->new_flows) && q->old_flows.first) {
			fq_flow_add_tail(&q->old_flows, f);
		} else {
			fq_flow_set_detached(f);
			f->age = jiffies;
			q->inactive_flows++;
		}
		goto begin;
	}
	prefetch(&skb->end);
	f->time_next_packet = now;
	f->credit -= qdisc_pkt_len(skb);

	if (f->credit <= 0 &&
	    q->rate_enable &&
	    skb->sk && skb->sk->sk_state != TCP_TIME_WAIT) {
		u32 rate = skb->sk->sk_pacing_rate ?: q->flow_default_rate;

		rate = min(rate, q->flow_max_rate);
		if (rate) {
			u64 len = (u64)qdisc_pkt_len(skb) * NSEC_PER_SEC;

			do_div(len, rate);
			/* Since socket rate can change later,
			 * clamp the delay to 125 ms.
			 * TODO: maybe segment the too big skb, as in commit
			 * e43ac79a4bc ("sch_tbf: segment too big GSO packets")
			 */
			if (unlikely(len > 125 * NSEC_PER_MSEC)) {
				len = 125 * NSEC_PER_MSEC;
				q->stat_pkts_too_long++;
			}

			f->time_next_packet = now + len;
		}
	}
out:
	sch->qstats.backlog -= qdisc_pkt_len(skb);
	qdisc_bstats_update(sch, skb);
	sch->q.qlen--;
	qdisc_unthrottled(sch);
	return skb;
}

static void fq_reset(struct Qdisc *sch)
{
	struct sk_buff *skb;

	while ((skb = fq_dequeue(sch)) != NULL)
		kfree_skb(skb);
}

static void fq_rehash(struct fq_sched_data *q,
		      struct rb_root *old_array, u32 old_log,
		      struct rb_root *new_array, u32 new_log)
{
	struct rb_node *op, **np, *parent;
	struct rb_root *oroot, *nroot;
	struct fq_flow *of, *nf;
	int fcnt = 0;
	u32 idx;

	for (idx = 0; idx < (1U << old_log); idx++) {
		oroot = &old_array[idx];
		while ((op = rb_first(oroot)) != NULL) {
			rb_erase(op, oroot);
			of = container_of(op, struct fq_flow, fq_node);
			if (fq_gc_candidate(of)) {
				fcnt++;
				kmem_cache_free(fq_flow_cachep, of);
				continue;
			}
			nroot = &new_array[hash_32((u32)(long)of->sk, new_log)];

			np = &nroot->rb_node;
			parent = NULL;
			while (*np) {
				parent = *np;

				nf = container_of(parent, struct fq_flow, fq_node);
				BUG_ON(nf->sk == of->sk);

				if (nf->sk > of->sk)
					np = &parent->rb_right;
				else
					np = &parent->rb_left;
			}

			rb_link_node(&of->fq_node, parent, np);
			rb_insert_color(&of->fq_node, nroot);
		}
	}
	q->flows -= fcnt;
	q->inactive_flows -= fcnt;
	q->stat_gc_flows += fcnt;
}

static int fq_resize(struct fq_sched_data *q, u32 log)
{
	struct rb_root *array;
	u32 idx;

	if (q->fq_root && log == q->fq_trees_log)
		return 0;

	array = kmalloc(sizeof(struct rb_root) << log, GFP_KERNEL);
	if (!array)
		return -ENOMEM;

	for (idx = 0; idx < (1U << log); idx++)
		array[idx] = RB_ROOT;

	if (q->fq_root) {
		fq_rehash(q, q->fq_root, q->fq_trees_log, array, log);
		kfree(q->fq_root);
	}
	q->fq_root = array;
	q->fq_trees_log = log;

	return 0;
}
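
/*
 * Editorial note: with the default fq_trees_log of ilog2(1024) = 10 set in
 * fq_init(), fq_root holds 1024 struct rb_root slots (one pointer each,
 * i.e. 8 KB on 64-bit). fq_change() below accepts TCA_FQ_BUCKETS_LOG
 * values from 1 to ilog2(256*1024) = 18, so up to 256K buckets.
 */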

static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = {
	[TCA_FQ_PLIMIT] = { .type = NLA_U32 },
	[TCA_FQ_FLOW_PLIMIT] = { .type = NLA_U32 },
	[TCA_FQ_QUANTUM] = { .type = NLA_U32 },
	[TCA_FQ_INITIAL_QUANTUM] = { .type = NLA_U32 },
	[TCA_FQ_RATE_ENABLE] = { .type = NLA_U32 },
	[TCA_FQ_FLOW_DEFAULT_RATE] = { .type = NLA_U32 },
	[TCA_FQ_FLOW_MAX_RATE] = { .type = NLA_U32 },
	[TCA_FQ_BUCKETS_LOG] = { .type = NLA_U32 },
};

static int fq_change(struct Qdisc *sch, struct nlattr *opt)
{
	struct fq_sched_data *q = qdisc_priv(sch);
	struct nlattr *tb[TCA_FQ_MAX + 1];
	int err, drop_count = 0;
	u32 fq_log;

	if (!opt)
		return -EINVAL;

	err = nla_parse_nested(tb, TCA_FQ_MAX, opt, fq_policy);
	if (err < 0)
		return err;

	sch_tree_lock(sch);

	fq_log = q->fq_trees_log;

	if (tb[TCA_FQ_BUCKETS_LOG]) {
		u32 nval = nla_get_u32(tb[TCA_FQ_BUCKETS_LOG]);

		if (nval >= 1 && nval <= ilog2(256*1024))
			fq_log = nval;
		else
			err = -EINVAL;
	}
	if (tb[TCA_FQ_PLIMIT])
		sch->limit = nla_get_u32(tb[TCA_FQ_PLIMIT]);

	if (tb[TCA_FQ_FLOW_PLIMIT])
		q->flow_plimit = nla_get_u32(tb[TCA_FQ_FLOW_PLIMIT]);

	if (tb[TCA_FQ_QUANTUM])
		q->quantum = nla_get_u32(tb[TCA_FQ_QUANTUM]);

	if (tb[TCA_FQ_INITIAL_QUANTUM])
		q->initial_quantum = nla_get_u32(tb[TCA_FQ_INITIAL_QUANTUM]);

	if (tb[TCA_FQ_FLOW_DEFAULT_RATE])
		q->flow_default_rate = nla_get_u32(tb[TCA_FQ_FLOW_DEFAULT_RATE]);

	if (tb[TCA_FQ_FLOW_MAX_RATE])
		q->flow_max_rate = nla_get_u32(tb[TCA_FQ_FLOW_MAX_RATE]);

	if (tb[TCA_FQ_RATE_ENABLE]) {
		u32 enable = nla_get_u32(tb[TCA_FQ_RATE_ENABLE]);

		if (enable <= 1)
			q->rate_enable = enable;
		else
			err = -EINVAL;
	}

	if (!err)
		err = fq_resize(q, fq_log);

	while (sch->q.qlen > sch->limit) {
		struct sk_buff *skb = fq_dequeue(sch);

		kfree_skb(skb);
		drop_count++;
	}
	qdisc_tree_decrease_qlen(sch, drop_count);

	sch_tree_unlock(sch);
	return err;
}

static void fq_destroy(struct Qdisc *sch)
{
	struct fq_sched_data *q = qdisc_priv(sch);
	struct rb_root *root;
	struct rb_node *p;
	unsigned int idx;

	if (q->fq_root) {
		for (idx = 0; idx < (1U << q->fq_trees_log); idx++) {
			root = &q->fq_root[idx];
			while ((p = rb_first(root)) != NULL) {
				rb_erase(p, root);
				kmem_cache_free(fq_flow_cachep,
						container_of(p, struct fq_flow, fq_node));
			}
		}
		kfree(q->fq_root);
	}
	qdisc_watchdog_cancel(&q->watchdog);
}

static int fq_init(struct Qdisc *sch, struct nlattr *opt)
{
	struct fq_sched_data *q = qdisc_priv(sch);
	int err;

	sch->limit = 10000;
	q->flow_plimit = 100;
	q->quantum = 2 * psched_mtu(qdisc_dev(sch));
	q->initial_quantum = 10 * psched_mtu(qdisc_dev(sch));
	q->flow_default_rate = 0;
	q->flow_max_rate = ~0U;
	q->rate_enable = 1;
	q->new_flows.first = NULL;
	q->old_flows.first = NULL;
	q->delayed = RB_ROOT;
	q->fq_root = NULL;
	q->fq_trees_log = ilog2(1024);
	qdisc_watchdog_init(&q->watchdog, sch);

	if (opt)
		err = fq_change(sch, opt);
	else
		err = fq_resize(q, q->fq_trees_log);

	return err;
}
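
/*
 * Editorial note on the defaults above, assuming a 1500 byte MTU Ethernet
 * device with a 14 byte link-layer header (psched_mtu() = 1514):
 *   quantum         = 2 * 1514 = 3028 bytes per scheduling round,
 *   initial_quantum = 10 * 1514 = 15140 bytes for a brand new flow,
 *   limit           = 10000 packets total, flow_plimit = 100 per flow,
 *   pacing enabled, no per-flow max rate (~0U means unlimited).
 */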

static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct fq_sched_data *q = qdisc_priv(sch);
	struct nlattr *opts;

	opts = nla_nest_start(skb, TCA_OPTIONS);
	if (opts == NULL)
		goto nla_put_failure;

	if (nla_put_u32(skb, TCA_FQ_PLIMIT, sch->limit) ||
	    nla_put_u32(skb, TCA_FQ_FLOW_PLIMIT, q->flow_plimit) ||
	    nla_put_u32(skb, TCA_FQ_QUANTUM, q->quantum) ||
	    nla_put_u32(skb, TCA_FQ_INITIAL_QUANTUM, q->initial_quantum) ||
	    nla_put_u32(skb, TCA_FQ_RATE_ENABLE, q->rate_enable) ||
	    nla_put_u32(skb, TCA_FQ_FLOW_DEFAULT_RATE, q->flow_default_rate) ||
	    nla_put_u32(skb, TCA_FQ_FLOW_MAX_RATE, q->flow_max_rate) ||
	    nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log))
		goto nla_put_failure;

	nla_nest_end(skb, opts);
	return skb->len;

nla_put_failure:
	return -1;
}

static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
{
	struct fq_sched_data *q = qdisc_priv(sch);
	u64 now = ktime_to_ns(ktime_get());
	struct tc_fq_qd_stats st = {
		.gc_flows = q->stat_gc_flows,
		.highprio_packets = q->stat_internal_packets,
		.tcp_retrans = q->stat_tcp_retrans,
		.throttled = q->stat_throttled,
		.flows_plimit = q->stat_flows_plimit,
		.pkts_too_long = q->stat_pkts_too_long,
		.allocation_errors = q->stat_allocation_errors,
		.flows = q->flows,
		.inactive_flows = q->inactive_flows,
		.throttled_flows = q->throttled_flows,
		.time_next_delayed_flow = q->time_next_delayed_flow - now,
	};

	return gnet_stats_copy_app(d, &st, sizeof(st));
}

static struct Qdisc_ops fq_qdisc_ops __read_mostly = {
	.id = "fq",
	.priv_size = sizeof(struct fq_sched_data),

	.enqueue = fq_enqueue,
	.dequeue = fq_dequeue,
	.peek = qdisc_peek_dequeued,
	.init = fq_init,
	.reset = fq_reset,
	.destroy = fq_destroy,
	.change = fq_change,
	.dump = fq_dump,
	.dump_stats = fq_dump_stats,
	.owner = THIS_MODULE,
};

static int __init fq_module_init(void)
{
	int ret;

	fq_flow_cachep = kmem_cache_create("fq_flow_cache",
					   sizeof(struct fq_flow),
					   0, 0, NULL);
	if (!fq_flow_cachep)
		return -ENOMEM;

	ret = register_qdisc(&fq_qdisc_ops);
	if (ret)
		kmem_cache_destroy(fq_flow_cachep);
	return ret;
}

static void __exit fq_module_exit(void)
{
	unregister_qdisc(&fq_qdisc_ops);
	kmem_cache_destroy(fq_flow_cachep);
}

module_init(fq_module_init)
module_exit(fq_module_exit)
MODULE_AUTHOR("Eric Dumazet");
MODULE_LICENSE("GPL");