/*
 * net/sched/sch_tbf.c  Token Bucket Filter queue.
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 *
 * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *              Dmitry Torokhov <dtor@mail.ru> - allow attaching inner qdiscs -
 *                                               original idea by Martin Devera
 *
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <net/netlink.h>
#include <net/sch_generic.h>
#include <net/pkt_sched.h>

/*      Simple Token Bucket Filter.
        =======================================

        SOURCE.
        -------

        None.

        Description.
        ------------

        A data flow obeys TBF with rate R and depth B, if for any
        time interval t_i...t_f the number of transmitted bits
        does not exceed B + R*(t_f-t_i).

        Packetized version of this definition:
        The sequence of packets of sizes s_i served at moments t_i
        obeys TBF, if for any i<=k:

        s_i+....+s_k <= B + R*(t_k - t_i)

        Algorithm.
        ----------

        Let N(t_i) be B/R initially and N(t) grow continuously with time as:

        N(t+delta) = min{B/R, N(t) + delta}

        If the first packet in queue has length S, it may be
        transmitted only at the time t_* when S/R <= N(t_*),
        and in this case N(t) jumps:

        N(t_* + 0) = N(t_* - 0) - S/R.


        In practice, QoS requires two TBFs to be applied to a data stream.
        One of them controls the steady-state burst size, while the other,
        with rate P (peak rate) and depth M (equal to the link MTU),
        limits bursts on a smaller time scale.

        It is easy to see that P > R and B > M.  If P is infinity, this
        double TBF is equivalent to a single one.

        When TBF works in reshaping mode, latency is estimated as:

        lat = max((L-B)/R, (L-M)/P)


        NOTES.
        ------

        If TBF throttles, it starts a watchdog timer, which will wake it up
        when it is ready to transmit.
        Note that the minimal timer resolution is 1/HZ.
        If no new packets arrive during this period,
        or if the device is not awakened by EOI for some previous packet,
        TBF can stop its activity for 1/HZ.


        This means that, with depth B, the maximal rate is

        R_crit = B*HZ

        E.g. for 10Mbit ethernet and HZ=100 the minimal allowed B is ~10Kbytes.

        Note that the peak rate TBF is much tougher: with MTU 1500
        P_crit = 150Kbytes/sec.  So, if you need greater peak
        rates, use an Alpha with HZ=1000 :-)

        With classful TBF, limit is kept only for backwards compatibility.
        It is passed to the default bfifo qdisc; if the inner qdisc is
        changed, the limit is no longer effective.
*/
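
/* An illustrative worked example of the latency estimate above (the
 * numbers are hypothetical, not taken from this file): take
 * R = 125000 bytes/s (1 Mbit/s), B = 10000 bytes, P = 1250000 bytes/s,
 * M = 1500 bytes and a backlog limit of L = 100000 bytes.  Then
 *
 *      lat = max((100000 - 10000)/125000, (100000 - 1500)/1250000)
 *          = max(0.72 s, 0.0788 s)
 *          = 0.72 s
 *
 * i.e. the long-term bucket, not the peak-rate bucket, dominates the
 * worst-case delay.
 */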

struct tbf_sched_data {
/* Parameters */
        u32             limit;          /* Maximal length of backlog: bytes */
        s64             buffer;         /* Token bucket depth/rate: MUST BE >= MTU/B */
        s64             mtu;
        u32             max_size;
        struct psched_ratecfg rate;
        struct psched_ratecfg peak;
        bool peak_present;

/* Variables */
        s64             tokens;         /* Current number of B tokens */
        s64             ptokens;        /* Current number of P tokens */
        s64             t_c;            /* Time check-point */
        struct Qdisc    *qdisc;         /* Inner qdisc, default - bfifo queue */
        struct qdisc_watchdog watchdog; /* Watchdog timer */
};

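/* For orientation only: the fields above correspond, roughly, to the
 * tc(8) parameters of this qdisc.  A hedged example invocation (device
 * name and figures are made up for illustration, not taken from this
 * file):
 *
 *      tc qdisc add dev eth0 root tbf rate 1mbit burst 10kb limit 15kb
 *
 * "rate" ends up in q->rate, "burst" in q->buffer, "limit" is passed to
 * the inner bfifo qdisc, and the optional "peakrate"/"mtu" pair fills
 * q->peak and q->mtu.
 */
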
/* GSO packet is too big, segment it so that tbf can transmit
 * each segment in time
 */
static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch)
{
        struct tbf_sched_data *q = qdisc_priv(sch);
        struct sk_buff *segs, *nskb;
        netdev_features_t features = netif_skb_features(skb);
        int ret, nb;

        segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);

        if (IS_ERR_OR_NULL(segs))
                return qdisc_reshape_fail(skb, sch);

        nb = 0;
        while (segs) {
                nskb = segs->next;
                segs->next = NULL;
                if (likely(segs->len <= q->max_size)) {
                        qdisc_skb_cb(segs)->pkt_len = segs->len;
                        ret = qdisc_enqueue(segs, q->qdisc);
                } else {
                        ret = qdisc_reshape_fail(skb, sch);
                }
                if (ret != NET_XMIT_SUCCESS) {
                        if (net_xmit_drop_count(ret))
                                sch->qstats.drops++;
                } else {
                        nb++;
                }
                segs = nskb;
        }
        sch->q.qlen += nb;
        if (nb > 1)
                qdisc_tree_decrease_qlen(sch, 1 - nb);
        consume_skb(skb);
        return nb > 0 ? NET_XMIT_SUCCESS : NET_XMIT_DROP;
}

static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
        struct tbf_sched_data *q = qdisc_priv(sch);
        int ret;

        if (qdisc_pkt_len(skb) > q->max_size) {
                if (skb_is_gso(skb))
                        return tbf_segment(skb, sch);
                return qdisc_reshape_fail(skb, sch);
        }
        ret = qdisc_enqueue(skb, q->qdisc);
        if (ret != NET_XMIT_SUCCESS) {
                if (net_xmit_drop_count(ret))
                        sch->qstats.drops++;
                return ret;
        }

        sch->q.qlen++;
        return NET_XMIT_SUCCESS;
}

static unsigned int tbf_drop(struct Qdisc *sch)
{
        struct tbf_sched_data *q = qdisc_priv(sch);
        unsigned int len = 0;

        if (q->qdisc->ops->drop && (len = q->qdisc->ops->drop(q->qdisc)) != 0) {
                sch->q.qlen--;
                sch->qstats.drops++;
        }
        return len;
}

static struct sk_buff *tbf_dequeue(struct Qdisc *sch)
{
        struct tbf_sched_data *q = qdisc_priv(sch);
        struct sk_buff *skb;

        skb = q->qdisc->ops->peek(q->qdisc);

        if (skb) {
                s64 now;
                s64 toks;
                s64 ptoks = 0;
                unsigned int len = qdisc_pkt_len(skb);

                now = ktime_to_ns(ktime_get());
                toks = min_t(s64, now - q->t_c, q->buffer);

                if (q->peak_present) {
                        ptoks = toks + q->ptokens;
                        if (ptoks > q->mtu)
                                ptoks = q->mtu;
                        ptoks -= (s64) psched_l2t_ns(&q->peak, len);
                }
                toks += q->tokens;
                if (toks > q->buffer)
                        toks = q->buffer;
                toks -= (s64) psched_l2t_ns(&q->rate, len);

                if ((toks|ptoks) >= 0) {
                        /* Both token counters are non-negative: transmit now */
                        skb = qdisc_dequeue_peeked(q->qdisc);
                        if (unlikely(!skb))
                                return NULL;

                        q->t_c = now;
                        q->tokens = toks;
                        q->ptokens = ptoks;
                        sch->q.qlen--;
                        qdisc_unthrottled(sch);
                        qdisc_bstats_update(sch, skb);
                        return skb;
                }

                qdisc_watchdog_schedule_ns(&q->watchdog,
                                           now + max_t(long, -toks, -ptoks));

                /* Maybe we have a shorter packet in the queue,
                   which can be sent now.  It sounds cool,
                   but this is wrong in principle.
                   We MUST NOT reorder packets under these circumstances.

                   Really, if we split the flow into independent
                   subflows, it would be a very good solution.
                   This is the main idea of all FQ algorithms
                   (cf. CSZ, HPFQ, HFSC)
                 */

                sch->qstats.overlimits++;
        }
        return NULL;
}

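/* A hedged numeric illustration of the token bookkeeping above (the
 * figures are invented for this example, not taken from the code):
 * with rate 125000 bytes/s every byte costs 8000 ns, so a 1500 byte
 * packet costs psched_l2t_ns(&q->rate, 1500) = 12,000,000 ns.  If
 * q->buffer is 80,000,000 ns (a 10000 byte bucket), q->tokens is
 * 5,000,000 ns and 10,000,000 ns have elapsed since q->t_c, then
 *
 *      toks = min(10,000,000, 80,000,000) + 5,000,000 - 12,000,000
 *           = 3,000,000 >= 0
 *
 * so the packet is sent and q->tokens becomes 3,000,000 ns; a negative
 * result would instead arm the watchdog for -toks nanoseconds.
 */
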
static void tbf_reset(struct Qdisc *sch)
{
        struct tbf_sched_data *q = qdisc_priv(sch);

        qdisc_reset(q->qdisc);
        sch->q.qlen = 0;
        q->t_c = ktime_to_ns(ktime_get());
        q->tokens = q->buffer;
        q->ptokens = q->mtu;
        qdisc_watchdog_cancel(&q->watchdog);
}

static const struct nla_policy tbf_policy[TCA_TBF_MAX + 1] = {
        [TCA_TBF_PARMS]         = { .len = sizeof(struct tc_tbf_qopt) },
        [TCA_TBF_RTAB]          = { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
        [TCA_TBF_PTAB]          = { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
        [TCA_TBF_RATE64]        = { .type = NLA_U64 },
        [TCA_TBF_PRATE64]       = { .type = NLA_U64 },
};

static int tbf_change(struct Qdisc *sch, struct nlattr *opt)
{
        int err;
        struct tbf_sched_data *q = qdisc_priv(sch);
        struct nlattr *tb[TCA_TBF_MAX + 1];
        struct tc_tbf_qopt *qopt;
        struct qdisc_rate_table *rtab = NULL;
        struct qdisc_rate_table *ptab = NULL;
        struct Qdisc *child = NULL;
        int max_size, n;
        u64 rate64 = 0, prate64 = 0;

        err = nla_parse_nested(tb, TCA_TBF_MAX, opt, tbf_policy);
        if (err < 0)
                return err;

        err = -EINVAL;
        if (tb[TCA_TBF_PARMS] == NULL)
                goto done;

        qopt = nla_data(tb[TCA_TBF_PARMS]);
        rtab = qdisc_get_rtab(&qopt->rate, tb[TCA_TBF_RTAB]);
        if (rtab == NULL)
                goto done;

        if (qopt->peakrate.rate) {
                if (qopt->peakrate.rate > qopt->rate.rate)
                        ptab = qdisc_get_rtab(&qopt->peakrate, tb[TCA_TBF_PTAB]);
                if (ptab == NULL)
                        goto done;
        }

        /* Largest packet size whose transmission time still fits in the bucket */
        for (n = 0; n < 256; n++)
                if (rtab->data[n] > qopt->buffer)
                        break;
        max_size = (n << qopt->rate.cell_log) - 1;
        if (ptab) {
                int size;

                for (n = 0; n < 256; n++)
                        if (ptab->data[n] > qopt->mtu)
                                break;
                size = (n << qopt->peakrate.cell_log) - 1;
                if (size < max_size)
                        max_size = size;
        }
        if (max_size < 0)
                goto done;

        if (q->qdisc != &noop_qdisc) {
                err = fifo_set_limit(q->qdisc, qopt->limit);
                if (err)
                        goto done;
        } else if (qopt->limit > 0) {
                child = fifo_create_dflt(sch, &bfifo_qdisc_ops, qopt->limit);
                if (IS_ERR(child)) {
                        err = PTR_ERR(child);
                        goto done;
                }
        }

        sch_tree_lock(sch);
        if (child) {
                qdisc_tree_decrease_qlen(q->qdisc, q->qdisc->q.qlen);
                qdisc_destroy(q->qdisc);
                q->qdisc = child;
        }
        q->limit = qopt->limit;
        q->mtu = PSCHED_TICKS2NS(qopt->mtu);
        q->max_size = max_size;
        q->buffer = PSCHED_TICKS2NS(qopt->buffer);
        q->tokens = q->buffer;
        q->ptokens = q->mtu;

        if (tb[TCA_TBF_RATE64])
                rate64 = nla_get_u64(tb[TCA_TBF_RATE64]);
        psched_ratecfg_precompute(&q->rate, &rtab->rate, rate64);
        if (ptab) {
                if (tb[TCA_TBF_PRATE64])
                        prate64 = nla_get_u64(tb[TCA_TBF_PRATE64]);
                psched_ratecfg_precompute(&q->peak, &ptab->rate, prate64);
                q->peak_present = true;
        } else {
                q->peak_present = false;
        }

        sch_tree_unlock(sch);
        err = 0;
done:
        if (rtab)
                qdisc_put_rtab(rtab);
        if (ptab)
                qdisc_put_rtab(ptab);
        return err;
}

static int tbf_init(struct Qdisc *sch, struct nlattr *opt)
{
        struct tbf_sched_data *q = qdisc_priv(sch);

        if (opt == NULL)
                return -EINVAL;

        q->t_c = ktime_to_ns(ktime_get());
        qdisc_watchdog_init(&q->watchdog, sch);
        q->qdisc = &noop_qdisc;

        return tbf_change(sch, opt);
}

static void tbf_destroy(struct Qdisc *sch)
{
        struct tbf_sched_data *q = qdisc_priv(sch);

        qdisc_watchdog_cancel(&q->watchdog);
        qdisc_destroy(q->qdisc);
}

static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb)
{
        struct tbf_sched_data *q = qdisc_priv(sch);
        struct nlattr *nest;
        struct tc_tbf_qopt opt;

        sch->qstats.backlog = q->qdisc->qstats.backlog;
        nest = nla_nest_start(skb, TCA_OPTIONS);
        if (nest == NULL)
                goto nla_put_failure;

        opt.limit = q->limit;
        psched_ratecfg_getrate(&opt.rate, &q->rate);
        if (q->peak_present)
                psched_ratecfg_getrate(&opt.peakrate, &q->peak);
        else
                memset(&opt.peakrate, 0, sizeof(opt.peakrate));
        opt.mtu = PSCHED_NS2TICKS(q->mtu);
        opt.buffer = PSCHED_NS2TICKS(q->buffer);
        if (nla_put(skb, TCA_TBF_PARMS, sizeof(opt), &opt))
                goto nla_put_failure;
        if (q->rate.rate_bytes_ps >= (1ULL << 32) &&
            nla_put_u64(skb, TCA_TBF_RATE64, q->rate.rate_bytes_ps))
                goto nla_put_failure;
        if (q->peak_present &&
            q->peak.rate_bytes_ps >= (1ULL << 32) &&
            nla_put_u64(skb, TCA_TBF_PRATE64, q->peak.rate_bytes_ps))
                goto nla_put_failure;

        nla_nest_end(skb, nest);
        return skb->len;

nla_put_failure:
        nla_nest_cancel(skb, nest);
        return -1;
}

static int tbf_dump_class(struct Qdisc *sch, unsigned long cl,
                          struct sk_buff *skb, struct tcmsg *tcm)
{
        struct tbf_sched_data *q = qdisc_priv(sch);

        tcm->tcm_handle |= TC_H_MIN(1);
        tcm->tcm_info = q->qdisc->handle;

        return 0;
}

static int tbf_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
                     struct Qdisc **old)
{
        struct tbf_sched_data *q = qdisc_priv(sch);

        if (new == NULL)
                new = &noop_qdisc;

        sch_tree_lock(sch);
        *old = q->qdisc;
        q->qdisc = new;
        qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
        qdisc_reset(*old);
        sch_tree_unlock(sch);

        return 0;
}

static struct Qdisc *tbf_leaf(struct Qdisc *sch, unsigned long arg)
{
        struct tbf_sched_data *q = qdisc_priv(sch);
        return q->qdisc;
}

static unsigned long tbf_get(struct Qdisc *sch, u32 classid)
{
        return 1;
}

static void tbf_put(struct Qdisc *sch, unsigned long arg)
{
}

static void tbf_walk(struct Qdisc *sch, struct qdisc_walker *walker)
{
        if (!walker->stop) {
                if (walker->count >= walker->skip)
                        if (walker->fn(sch, 1, walker) < 0) {
                                walker->stop = 1;
                                return;
                        }
                walker->count++;
        }
}

static const struct Qdisc_class_ops tbf_class_ops = {
        .graft          =       tbf_graft,
        .leaf           =       tbf_leaf,
        .get            =       tbf_get,
        .put            =       tbf_put,
        .walk           =       tbf_walk,
        .dump           =       tbf_dump_class,
};

static struct Qdisc_ops tbf_qdisc_ops __read_mostly = {
        .next           =       NULL,
        .cl_ops         =       &tbf_class_ops,
        .id             =       "tbf",
        .priv_size      =       sizeof(struct tbf_sched_data),
        .enqueue        =       tbf_enqueue,
        .dequeue        =       tbf_dequeue,
        .peek           =       qdisc_peek_dequeued,
        .drop           =       tbf_drop,
        .init           =       tbf_init,
        .reset          =       tbf_reset,
        .destroy        =       tbf_destroy,
        .change         =       tbf_change,
        .dump           =       tbf_dump,
        .owner          =       THIS_MODULE,
};

static int __init tbf_module_init(void)
{
        return register_qdisc(&tbf_qdisc_ops);
}

static void __exit tbf_module_exit(void)
{
        unregister_qdisc(&tbf_qdisc_ops);
}
module_init(tbf_module_init)
module_exit(tbf_module_exit)
MODULE_LICENSE("GPL");