/*
 * net/sched/sch_tbf.c	Token Bucket Filter queue.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *		Dmitry Torokhov <dtor@mail.ru> - allow attaching inner qdiscs -
 *						 original idea by Martin Devera
 *
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <net/netlink.h>
#include <net/sch_generic.h>
#include <net/pkt_sched.h>

/*	Simple Token Bucket Filter.
	=======================================

	SOURCE.
	-------

	None.

	Description.
	------------

	A data flow obeys TBF with rate R and depth B, if for any
	time interval t_i...t_f the number of transmitted bits
	does not exceed B + R*(t_f-t_i).

	Packetized version of this definition:
	The sequence of packets of sizes s_i served at moments t_i
	obeys TBF, if for any i<=k:

	s_i+....+s_k <= B + R*(t_k - t_i)

	Algorithm.
	----------

	Let N(t_i) be B/R initially, and let N(t) grow continuously with time as:

	N(t+delta) = min{B/R, N(t) + delta}

	If the first packet in the queue has length S, it may be
	transmitted only at the time t_* when S/R <= N(t_*),
	and in this case N(t) jumps:

	N(t_* + 0) = N(t_* - 0) - S/R.

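	Worked example (illustrative numbers, not from the original
	text): with R = 125000 bytes/sec (1 Mbit/s) and B = 12500 bytes,
	N is capped at B/R = 100 ms. A 1500-byte packet consumes
	1500/R = 12 ms of accumulated credit, so a full bucket admits
	roughly B/1500 ~= 8 such packets back to back before output is
	paced down to rate R.
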
	Actually, QoS requires two TBFs to be applied to a data stream.
	One of them controls the steady state burst size; the other one,
	with rate P (peak rate) and depth M (equal to the link MTU),
	limits bursts on a smaller time scale.

	It is easy to see that P > R and B > M. If P is infinite, this
	double TBF is equivalent to a single one.

	When TBF works in reshaping mode, latency is estimated as:

	lat = max ((L-B)/R, (L-M)/P)

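	Here L is the backlog limit in bytes (the "limit" parameter
	below). As an illustration (numbers not from the original text):
	with L = 10000 bytes, B = 5000 bytes, R = 125000 bytes/sec and
	no peak rate, a packet arriving to a full queue waits up to
	(L-B)/R = 40 ms before it may be sent.
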
	NOTES.
	------

	If TBF throttles, it starts a watchdog timer, which will wake it up
	when it is ready to transmit.
	Note that the minimal timer resolution is 1/HZ.
	If no new packets arrive during this period,
	or if the device is not awakened by an EOI for some previous packet,
	TBF can stop its activity for 1/HZ.


	This means that, with depth B, the maximal rate is

	R_crit = B*HZ

	F.e. for 10Mbit ethernet and HZ=100 the minimal allowed B is ~10Kbytes.

	Note that the peak rate TBF is much tougher: with MTU 1500
	P_crit = 150Kbytes/sec. So, if you need greater peak
	rates, use an architecture with HZ=1000 (e.g. Alpha) :-)

	With classful TBF, limit is kept only for backwards compatibility.
	It is passed to the default bfifo qdisc - if the inner qdisc is
	changed, the limit is no longer effective.
*/
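
/* For illustration, a typical userspace setup of this qdisc via
 * iproute2 (example parameters taken from tc-tbf(8), not from this
 * file):
 *
 *	tc qdisc add dev eth0 root tbf rate 0.5mbit \
 *		burst 5kb latency 70ms peakrate 1mbit minburst 1540
 *
 * "burst" corresponds to the buffer/tokens above, "latency" bounds the
 * queue (an alternative to "limit"), and "minburst" is the peak-rate
 * MTU.
 */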

struct tbf_sched_data {
/* Parameters */
	u32		limit;		/* Maximal length of backlog: bytes */
	s64		buffer;		/* Token bucket depth/rate: MUST BE >= MTU/B */
	s64		mtu;
	u32		max_size;
	struct psched_ratecfg rate;
	struct psched_ratecfg peak;
	bool peak_present;

/* Variables */
	s64	tokens;			/* Current number of B tokens */
	s64	ptokens;		/* Current number of P tokens */
	s64	t_c;			/* Time check-point */
	struct Qdisc	*qdisc;		/* Inner qdisc, default - bfifo queue */
	struct qdisc_watchdog watchdog;	/* Watchdog timer */
};
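
/* Note: buffer, mtu, tokens and ptokens above are kept in nanoseconds
 * (the time needed to send the corresponding bytes at rate/peak), so
 * the token arithmetic in tbf_dequeue() compares times, not byte
 * counts.
 */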

/* Time to Length, convert time in ns to length in bytes
 * to determine how many bytes can be sent in the given time.
 */
static u64 psched_ns_t2l(const struct psched_ratecfg *r,
			 u64 time_in_ns)
{
	/* The formula is :
	 * len = (time_in_ns * r->rate_bytes_ps) / NSEC_PER_SEC
	 */
	u64 len = time_in_ns * r->rate_bytes_ps;

	do_div(len, NSEC_PER_SEC);

	if (unlikely(r->linklayer == TC_LINKLAYER_ATM)) {
		/* ATM carries 48 payload bytes in each 53-byte cell */
		do_div(len, 53);
		len = len * 48;
	}

	if (len > r->overhead)
		len -= r->overhead;
	else
		len = 0;

	return len;
}
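
/* Example (illustrative numbers): with rate_bytes_ps = 125000
 * (1 Mbit/s) and time_in_ns = 10000000 (10 ms), len comes out at
 * 1250 bytes; on TC_LINKLAYER_ATM that is first rescaled by 48/53
 * to account for cell framing before the overhead is subtracted.
 */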

/*
 * Return length of individual segments of a gso packet,
 * including all headers (MAC, IP, TCP/UDP)
 */
static unsigned int skb_gso_mac_seglen(const struct sk_buff *skb)
{
	unsigned int hdr_len = skb_transport_header(skb) - skb_mac_header(skb);

	return hdr_len + skb_gso_transport_seglen(skb);
}

/* GSO packet is too big, segment it so that tbf can transmit
 * each segment in time
 */
static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	struct sk_buff *segs, *nskb;
	netdev_features_t features = netif_skb_features(skb);
	int ret, nb;

	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);

	if (IS_ERR_OR_NULL(segs))
		return qdisc_reshape_fail(skb, sch);

	nb = 0;
	while (segs) {
		nskb = segs->next;
		segs->next = NULL;
		qdisc_skb_cb(segs)->pkt_len = segs->len;
		ret = qdisc_enqueue(segs, q->qdisc);
		if (ret != NET_XMIT_SUCCESS) {
			if (net_xmit_drop_count(ret))
				sch->qstats.drops++;
		} else {
			nb++;
		}
		segs = nskb;
	}
	sch->q.qlen += nb;
	if (nb > 1)
		qdisc_tree_decrease_qlen(sch, 1 - nb);
	consume_skb(skb);
	return nb > 0 ? NET_XMIT_SUCCESS : NET_XMIT_DROP;
}

static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	int ret;

	if (qdisc_pkt_len(skb) > q->max_size) {
		if (skb_is_gso(skb) && skb_gso_mac_seglen(skb) <= q->max_size)
			return tbf_segment(skb, sch);
		return qdisc_reshape_fail(skb, sch);
	}
	ret = qdisc_enqueue(skb, q->qdisc);
	if (ret != NET_XMIT_SUCCESS) {
		if (net_xmit_drop_count(ret))
			sch->qstats.drops++;
		return ret;
	}

	sch->q.qlen++;
	return NET_XMIT_SUCCESS;
}

static unsigned int tbf_drop(struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	unsigned int len = 0;

	if (q->qdisc->ops->drop && (len = q->qdisc->ops->drop(q->qdisc)) != 0) {
		sch->q.qlen--;
		sch->qstats.drops++;
	}
	return len;
}

static struct sk_buff *tbf_dequeue(struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;

	skb = q->qdisc->ops->peek(q->qdisc);

	if (skb) {
		s64 now;
		s64 toks;
		s64 ptoks = 0;
		unsigned int len = qdisc_pkt_len(skb);

		now = ktime_to_ns(ktime_get());
		/* Credit accumulated since the last check-point,
		 * capped at the bucket depth (both in nanoseconds).
		 */
		toks = min_t(s64, now - q->t_c, q->buffer);

		if (q->peak_present) {
			ptoks = toks + q->ptokens;
			if (ptoks > q->mtu)
				ptoks = q->mtu;
			ptoks -= (s64) psched_l2t_ns(&q->peak, len);
		}
		toks += q->tokens;
		if (toks > q->buffer)
			toks = q->buffer;
		toks -= (s64) psched_l2t_ns(&q->rate, len);

		/* Send only if both buckets stay non-negative */
		if ((toks|ptoks) >= 0) {
			skb = qdisc_dequeue_peeked(q->qdisc);
			if (unlikely(!skb))
				return NULL;

			q->t_c = now;
			q->tokens = toks;
			q->ptokens = ptoks;
			sch->q.qlen--;
			qdisc_unthrottled(sch);
			qdisc_bstats_update(sch, skb);
			return skb;
		}

		qdisc_watchdog_schedule_ns(&q->watchdog,
					   now + max_t(long, -toks, -ptoks));

		/* Maybe we have a shorter packet in the queue,
		 * which can be sent now. It sounds cool, but this
		 * is wrong in principle. We MUST NOT reorder packets
		 * under these circumstances.
		 *
		 * Really, if we split the flow into independent
		 * subflows, it would be a very good solution.
		 * This is the main idea of all FQ algorithms
		 * (cf. CSZ, HPFQ, HFSC)
		 */

		sch->qstats.overlimits++;
	}
	return NULL;
}

static void tbf_reset(struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	qdisc_reset(q->qdisc);
	sch->q.qlen = 0;
	q->t_c = ktime_to_ns(ktime_get());
	q->tokens = q->buffer;
	q->ptokens = q->mtu;
	qdisc_watchdog_cancel(&q->watchdog);
}

static const struct nla_policy tbf_policy[TCA_TBF_MAX + 1] = {
	[TCA_TBF_PARMS]		= { .len = sizeof(struct tc_tbf_qopt) },
	[TCA_TBF_RTAB]		= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
	[TCA_TBF_PTAB]		= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
	[TCA_TBF_RATE64]	= { .type = NLA_U64 },
	[TCA_TBF_PRATE64]	= { .type = NLA_U64 },
	[TCA_TBF_BURST]		= { .type = NLA_U32 },
	[TCA_TBF_PBURST]	= { .type = NLA_U32 },
};

static int tbf_change(struct Qdisc *sch, struct nlattr *opt)
{
	int err;
	struct tbf_sched_data *q = qdisc_priv(sch);
	struct nlattr *tb[TCA_TBF_MAX + 1];
	struct tc_tbf_qopt *qopt;
	struct Qdisc *child = NULL;
	struct psched_ratecfg rate;
	struct psched_ratecfg peak;
	u64 max_size;
	s64 buffer, mtu;
	u64 rate64 = 0, prate64 = 0;

	err = nla_parse_nested(tb, TCA_TBF_MAX, opt, tbf_policy);
	if (err < 0)
		return err;

	err = -EINVAL;
	if (tb[TCA_TBF_PARMS] == NULL)
		goto done;

	qopt = nla_data(tb[TCA_TBF_PARMS]);
	/* If userspace is link-layer unaware, derive the link layer type
	 * from the rate table it supplied, then release the table again
	 * (only the detected linklayer is of interest here).
	 */
	if (qopt->rate.linklayer == TC_LINKLAYER_UNAWARE)
		qdisc_put_rtab(qdisc_get_rtab(&qopt->rate,
					      tb[TCA_TBF_RTAB]));

	if (qopt->peakrate.linklayer == TC_LINKLAYER_UNAWARE)
		qdisc_put_rtab(qdisc_get_rtab(&qopt->peakrate,
					      tb[TCA_TBF_PTAB]));

	if (q->qdisc != &noop_qdisc) {
		err = fifo_set_limit(q->qdisc, qopt->limit);
		if (err)
			goto done;
	} else if (qopt->limit > 0) {
		child = fifo_create_dflt(sch, &bfifo_qdisc_ops, qopt->limit);
		if (IS_ERR(child)) {
			err = PTR_ERR(child);
			goto done;
		}
	}

	buffer = min_t(u64, PSCHED_TICKS2NS(qopt->buffer), ~0U);
	mtu = min_t(u64, PSCHED_TICKS2NS(qopt->mtu), ~0U);

	if (tb[TCA_TBF_RATE64])
		rate64 = nla_get_u64(tb[TCA_TBF_RATE64]);
	psched_ratecfg_precompute(&rate, &qopt->rate, rate64);

	if (tb[TCA_TBF_BURST]) {
		max_size = nla_get_u32(tb[TCA_TBF_BURST]);
		buffer = psched_l2t_ns(&rate, max_size);
	} else {
		max_size = min_t(u64, psched_ns_t2l(&rate, buffer), ~0U);
	}

	if (qopt->peakrate.rate) {
		if (tb[TCA_TBF_PRATE64])
			prate64 = nla_get_u64(tb[TCA_TBF_PRATE64]);
		psched_ratecfg_precompute(&peak, &qopt->peakrate, prate64);
		if (peak.rate_bytes_ps <= rate.rate_bytes_ps) {
			pr_warn_ratelimited("sch_tbf: peakrate %llu is lower than or equal to rate %llu!\n",
					    peak.rate_bytes_ps, rate.rate_bytes_ps);
			err = -EINVAL;
			goto done;
		}

		if (tb[TCA_TBF_PBURST]) {
			u32 pburst = nla_get_u32(tb[TCA_TBF_PBURST]);
			max_size = min_t(u32, max_size, pburst);
			mtu = psched_l2t_ns(&peak, pburst);
		} else {
			max_size = min_t(u64, max_size, psched_ns_t2l(&peak, mtu));
		}
	}

	if (max_size < psched_mtu(qdisc_dev(sch)))
		pr_warn_ratelimited("sch_tbf: burst %llu is lower than device %s mtu (%u)!\n",
				    max_size, qdisc_dev(sch)->name,
				    psched_mtu(qdisc_dev(sch)));

	if (!max_size) {
		err = -EINVAL;
		goto done;
	}

	sch_tree_lock(sch);
	if (child) {
		qdisc_tree_decrease_qlen(q->qdisc, q->qdisc->q.qlen);
		qdisc_destroy(q->qdisc);
		q->qdisc = child;
	}
	q->limit = qopt->limit;
	if (tb[TCA_TBF_PBURST])
		q->mtu = mtu;
	else
		q->mtu = PSCHED_TICKS2NS(qopt->mtu);
	q->max_size = max_size;
	if (tb[TCA_TBF_BURST])
		q->buffer = buffer;
	else
		q->buffer = PSCHED_TICKS2NS(qopt->buffer);
	q->tokens = q->buffer;
	q->ptokens = q->mtu;

	memcpy(&q->rate, &rate, sizeof(struct psched_ratecfg));
	if (qopt->peakrate.rate) {
		memcpy(&q->peak, &peak, sizeof(struct psched_ratecfg));
		q->peak_present = true;
	} else {
		q->peak_present = false;
	}

	sch_tree_unlock(sch);
	err = 0;
done:
	return err;
}

static int tbf_init(struct Qdisc *sch, struct nlattr *opt)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	if (opt == NULL)
		return -EINVAL;

	q->t_c = ktime_to_ns(ktime_get());
	qdisc_watchdog_init(&q->watchdog, sch);
	q->qdisc = &noop_qdisc;

	return tbf_change(sch, opt);
}

static void tbf_destroy(struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	qdisc_watchdog_cancel(&q->watchdog);
	qdisc_destroy(q->qdisc);
}

static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	struct nlattr *nest;
	struct tc_tbf_qopt opt;

	sch->qstats.backlog = q->qdisc->qstats.backlog;
	nest = nla_nest_start(skb, TCA_OPTIONS);
	if (nest == NULL)
		goto nla_put_failure;

	opt.limit = q->limit;
	psched_ratecfg_getrate(&opt.rate, &q->rate);
	if (q->peak_present)
		psched_ratecfg_getrate(&opt.peakrate, &q->peak);
	else
		memset(&opt.peakrate, 0, sizeof(opt.peakrate));
	opt.mtu = PSCHED_NS2TICKS(q->mtu);
	opt.buffer = PSCHED_NS2TICKS(q->buffer);
	if (nla_put(skb, TCA_TBF_PARMS, sizeof(opt), &opt))
		goto nla_put_failure;
	if (q->rate.rate_bytes_ps >= (1ULL << 32) &&
	    nla_put_u64(skb, TCA_TBF_RATE64, q->rate.rate_bytes_ps))
		goto nla_put_failure;
	if (q->peak_present &&
	    q->peak.rate_bytes_ps >= (1ULL << 32) &&
	    nla_put_u64(skb, TCA_TBF_PRATE64, q->peak.rate_bytes_ps))
		goto nla_put_failure;

	nla_nest_end(skb, nest);
	return skb->len;

nla_put_failure:
	nla_nest_cancel(skb, nest);
	return -1;
}

static int tbf_dump_class(struct Qdisc *sch, unsigned long cl,
			  struct sk_buff *skb, struct tcmsg *tcm)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	tcm->tcm_handle |= TC_H_MIN(1);
	tcm->tcm_info = q->qdisc->handle;

	return 0;
}

static int tbf_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
		     struct Qdisc **old)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	if (new == NULL)
		new = &noop_qdisc;

	sch_tree_lock(sch);
	*old = q->qdisc;
	q->qdisc = new;
	qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
	qdisc_reset(*old);
	sch_tree_unlock(sch);

	return 0;
}

static struct Qdisc *tbf_leaf(struct Qdisc *sch, unsigned long arg)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	return q->qdisc;
}

static unsigned long tbf_get(struct Qdisc *sch, u32 classid)
{
	return 1;
}

static void tbf_put(struct Qdisc *sch, unsigned long arg)
{
}

static void tbf_walk(struct Qdisc *sch, struct qdisc_walker *walker)
{
	if (!walker->stop) {
		if (walker->count >= walker->skip)
			if (walker->fn(sch, 1, walker) < 0) {
				walker->stop = 1;
				return;
			}
		walker->count++;
	}
}

static const struct Qdisc_class_ops tbf_class_ops = {
	.graft		=	tbf_graft,
	.leaf		=	tbf_leaf,
	.get		=	tbf_get,
	.put		=	tbf_put,
	.walk		=	tbf_walk,
	.dump		=	tbf_dump_class,
};

static struct Qdisc_ops tbf_qdisc_ops __read_mostly = {
	.next		=	NULL,
	.cl_ops		=	&tbf_class_ops,
	.id		=	"tbf",
	.priv_size	=	sizeof(struct tbf_sched_data),
	.enqueue	=	tbf_enqueue,
	.dequeue	=	tbf_dequeue,
	.peek		=	qdisc_peek_dequeued,
	.drop		=	tbf_drop,
	.init		=	tbf_init,
	.reset		=	tbf_reset,
	.destroy	=	tbf_destroy,
	.change		=	tbf_change,
	.dump		=	tbf_dump,
	.owner		=	THIS_MODULE,
};

static int __init tbf_module_init(void)
{
	return register_qdisc(&tbf_qdisc_ops);
}

static void __exit tbf_module_exit(void)
{
	unregister_qdisc(&tbf_qdisc_ops);
}

module_init(tbf_module_init)
module_exit(tbf_module_exit)
MODULE_LICENSE("GPL");