openwrtv4/target/linux/generic/pending-4.4/032-fq_codel-add-batch-ability-to-fq_codel_drop.patch

From: Eric Dumazet <edumazet@google.com>
Date: Sun, 1 May 2016 16:47:26 -0700
Subject: [PATCH] fq_codel: add batch ability to fq_codel_drop()

In presence of inelastic flows and stress, we can call
fq_codel_drop() for every packet entering fq_codel qdisc.

fq_codel_drop() is quite expensive, as it does a linear scan
of 4 KB of memory to find a fat flow.
Once found, it drops the oldest packet of this flow.

Instead of dropping a single packet, try to drop 50% of the backlog
of this fat flow, with a configurable limit of 64 packets per round.

TCA_FQ_CODEL_DROP_BATCH_SIZE is the new attribute to make this
limit configurable.

With this strategy the 4 KB search is amortized to a single cache line
per drop [1], so fq_codel_drop() no longer appears at the top of kernel
profile in presence of few inelastic flows.

[1] Assuming a 64byte cache line, and 1024 buckets

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Dave Taht <dave.taht@gmail.com>
Cc: Jonathan Morton <chromatix99@gmail.com>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Acked-by: Dave Taht
Signed-off-by: David S. Miller <davem@davemloft.net>
---

--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -711,6 +711,7 @@ enum {
 	TCA_FQ_CODEL_FLOWS,
 	TCA_FQ_CODEL_QUANTUM,
 	TCA_FQ_CODEL_CE_THRESHOLD,
+	TCA_FQ_CODEL_DROP_BATCH_SIZE,
 	__TCA_FQ_CODEL_MAX
 };
 
--- a/net/sched/sch_fq_codel.c
+++ b/net/sched/sch_fq_codel.c
@@ -57,6 +57,7 @@ struct fq_codel_sched_data {
 	u32		flows_cnt;	/* number of flows */
 	u32		perturbation;	/* hash perturbation */
 	u32		quantum;	/* psched_mtu(qdisc_dev(sch)); */
+	u32		drop_batch_size;
 	struct codel_params cparams;
 	struct codel_stats cstats;
 	u32		drop_overlimit;
@@ -133,17 +134,20 @@ static inline void flow_queue_add(struct
 	skb->next = NULL;
 }
 
-static unsigned int fq_codel_drop(struct Qdisc *sch)
+static unsigned int fq_codel_drop(struct Qdisc *sch, unsigned int max_packets)
 {
 	struct fq_codel_sched_data *q = qdisc_priv(sch);
 	struct sk_buff *skb;
 	unsigned int maxbacklog = 0, idx = 0, i, len;
 	struct fq_codel_flow *flow;
+	unsigned int threshold;
 
-	/* Queue is full! Find the fat flow and drop packet from it.
+	/* Queue is full! Find the fat flow and drop packet(s) from it.
 	 * This might sound expensive, but with 1024 flows, we scan
 	 * 4KB of memory, and we dont need to handle a complex tree
 	 * in fast path (packet queue/enqueue) with many cache misses.
+	 * In stress mode, we'll drop up to max_packets packets from the flow,
+	 * amortizing this linear lookup to one cache line per drop.
 	 */
 	for (i = 0; i < q->flows_cnt; i++) {
 		if (q->backlogs[i] > maxbacklog) {
@@ -151,15 +155,24 @@ static unsigned int fq_codel_drop(struct
 			idx = i;
 		}
 	}
+
+	/* Our goal is to drop half of this fat flow backlog */
+	threshold = maxbacklog >> 1;
+
 	flow = &q->flows[idx];
-	skb = dequeue_head(flow);
-	len = qdisc_pkt_len(skb);
+	len = 0;
+	i = 0;
+	do {
+		skb = dequeue_head(flow);
+		len += qdisc_pkt_len(skb);
+		kfree_skb(skb);
+	} while (++i < max_packets && len < threshold);
+
+	flow->dropped += i;
 	q->backlogs[idx] -= len;
-	sch->q.qlen--;
-	qdisc_qstats_drop(sch);
-	qdisc_qstats_backlog_dec(sch, skb);
-	kfree_skb(skb);
-	flow->dropped++;
+	sch->qstats.drops += i;
+	sch->qstats.backlog -= len;
+	sch->q.qlen -= i;
 	return idx;
 }
 
@@ -168,14 +181,14 @@ static unsigned int fq_codel_qdisc_drop(
 	unsigned int prev_backlog;
 
 	prev_backlog = sch->qstats.backlog;
-	fq_codel_drop(sch);
+	fq_codel_drop(sch, 1U);
 	return prev_backlog - sch->qstats.backlog;
 }
 
 static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 {
 	struct fq_codel_sched_data *q = qdisc_priv(sch);
-	unsigned int idx, prev_backlog;
+	unsigned int idx, prev_backlog, prev_qlen;
 	struct fq_codel_flow *flow;
 	int uninitialized_var(ret);
 
@@ -204,16 +217,22 @@ static int fq_codel_enqueue(struct sk_bu
 		return NET_XMIT_SUCCESS;
 
 	prev_backlog = sch->qstats.backlog;
-	q->drop_overlimit++;
-	/* Return Congestion Notification only if we dropped a packet
-	 * from this flow.
+	prev_qlen = sch->q.qlen;
+
+	/* fq_codel_drop() is quite expensive, as it performs a linear search
+	 * in q->backlogs[] to find a fat flow. So instead of dropping a single
+	 * packet, drop half of its backlog, capped at q->drop_batch_size
+	 * packets (64 by default) to avoid adding too big a cpu spike here.
 	 */
-	if (fq_codel_drop(sch) == idx)
-		return NET_XMIT_CN;
+	ret = fq_codel_drop(sch, q->drop_batch_size);
+
+	q->drop_overlimit += prev_qlen - sch->q.qlen;
 
-	/* As we dropped a packet, better let upper stack know this */
-	qdisc_tree_reduce_backlog(sch, 1, prev_backlog - sch->qstats.backlog);
-	return NET_XMIT_SUCCESS;
+	/* As we dropped packet(s), better let upper stack know this */
+	qdisc_tree_reduce_backlog(sch, prev_qlen - sch->q.qlen,
+				  prev_backlog - sch->qstats.backlog);
+
+	return ret == idx ? NET_XMIT_CN : NET_XMIT_SUCCESS;
 }
 
 /* This is the specific function called from codel_dequeue()
@@ -323,6 +342,7 @@ static const struct nla_policy fq_codel_
 	[TCA_FQ_CODEL_FLOWS]	= { .type = NLA_U32 },
 	[TCA_FQ_CODEL_QUANTUM]	= { .type = NLA_U32 },
 	[TCA_FQ_CODEL_CE_THRESHOLD] = { .type = NLA_U32 },
+	[TCA_FQ_CODEL_DROP_BATCH_SIZE] = { .type = NLA_U32 },
 };
 
 static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt)
@@ -374,6 +394,9 @@ static int fq_codel_change(struct Qdisc
 	if (tb[TCA_FQ_CODEL_QUANTUM])
 		q->quantum = max(256U, nla_get_u32(tb[TCA_FQ_CODEL_QUANTUM]));
 
+	if (tb[TCA_FQ_CODEL_DROP_BATCH_SIZE])
+		q->drop_batch_size = max(1U, nla_get_u32(tb[TCA_FQ_CODEL_DROP_BATCH_SIZE]));
+
 	while (sch->q.qlen > sch->limit) {
 		struct sk_buff *skb = fq_codel_dequeue(sch);
 
@@ -419,6 +442,7 @@ static int fq_codel_init(struct Qdisc *s
 
 	sch->limit = 10*1024;
 	q->flows_cnt = 1024;
+	q->drop_batch_size = 64;
 	q->quantum = psched_mtu(qdisc_dev(sch));
 	q->perturbation = prandom_u32();
 	INIT_LIST_HEAD(&q->new_flows);
@@ -476,6 +500,8 @@ static int fq_codel_dump(struct Qdisc *s
 			q->cparams.ecn) ||
 	    nla_put_u32(skb, TCA_FQ_CODEL_QUANTUM,
 			q->quantum) ||
+	    nla_put_u32(skb, TCA_FQ_CODEL_DROP_BATCH_SIZE,
+			q->drop_batch_size) ||
 	    nla_put_u32(skb, TCA_FQ_CODEL_FLOWS,
 			q->flows_cnt))
 		goto nla_put_failure;
kernel: backport patches improving fq_codel drop behavior Signed-off-by: Felix Fietkau <nbd@nbd.name> 2016-05-12 19:04:50 +00:00			`From: Eric Dumazet <edumazet@google.com>`
			`Date: Sun, 1 May 2016 16:47:26 -0700`
			`Subject: [PATCH] fq_codel: add batch ability to fq_codel_drop()`

			`In presence of inelastic flows and stress, we can call`
			`fq_codel_drop() for every packet entering fq_codel qdisc.`

			`fq_codel_drop() is quite expensive, as it does a linear scan`
			`of 4 KB of memory to find a fat flow.`
			`Once found, it drops the oldest packet of this flow.`

			`Instead of dropping a single packet, try to drop 50% of the backlog`
			`of this fat flow, with a configurable limit of 64 packets per round.`

			`TCA_FQ_CODEL_DROP_BATCH_SIZE is the new attribute to make this`
			`limit configurable.`

			`With this strategy the 4 KB search is amortized to a single cache line`
			`per drop [1], so fq_codel_drop() no longer appears at the top of kernel`
			`profile in presence of few inelastic flows.`

			`[1] Assuming a 64byte cache line, and 1024 buckets`

			`Signed-off-by: Eric Dumazet <edumazet@google.com>`
			`Reported-by: Dave Taht <dave.taht@gmail.com>`
			`Cc: Jonathan Morton <chromatix99@gmail.com>`
			`Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>`
			`Acked-by: Dave Taht`
			`Signed-off-by: David S. Miller <davem@davemloft.net>`
			`---`

			`--- a/include/uapi/linux/pkt_sched.h`
			`+++ b/include/uapi/linux/pkt_sched.h`
			`@@ -711,6 +711,7 @@ enum {`
			`TCA_FQ_CODEL_FLOWS,`
			`TCA_FQ_CODEL_QUANTUM,`
			`TCA_FQ_CODEL_CE_THRESHOLD,`
			`+ TCA_FQ_CODEL_DROP_BATCH_SIZE,`
			`__TCA_FQ_CODEL_MAX`
			`};`

			`--- a/net/sched/sch_fq_codel.c`
			`+++ b/net/sched/sch_fq_codel.c`
			`@@ -57,6 +57,7 @@ struct fq_codel_sched_data {`
			`u32 flows_cnt; /* number of flows */`
			`u32 perturbation; /* hash perturbation */`
			`u32 quantum; /* psched_mtu(qdisc_dev(sch)); */`
			`+ u32 drop_batch_size;`
			`struct codel_params cparams;`
			`struct codel_stats cstats;`
			`u32 drop_overlimit;`
			`@@ -133,17 +134,20 @@ static inline void flow_queue_add(struct`
			`skb->next = NULL;`
			`}`

			`-static unsigned int fq_codel_drop(struct Qdisc *sch)`
			`+static unsigned int fq_codel_drop(struct Qdisc *sch, unsigned int max_packets)`
			`{`
			`struct fq_codel_sched_data *q = qdisc_priv(sch);`
			`struct sk_buff *skb;`
			`unsigned int maxbacklog = 0, idx = 0, i, len;`
			`struct fq_codel_flow *flow;`
			`+ unsigned int threshold;`

			`- /* Queue is full! Find the fat flow and drop packet from it.`
			`+ /* Queue is full! Find the fat flow and drop packet(s) from it.`
			`* This might sound expensive, but with 1024 flows, we scan`
			`* 4KB of memory, and we dont need to handle a complex tree`
			`* in fast path (packet queue/enqueue) with many cache misses.`
			`+ * In stress mode, we'll try to drop 64 packets from the flow,`
			`+ * amortizing this linear lookup to one cache line per drop.`
			`*/`
			`for (i = 0; i < q->flows_cnt; i++) {`
			`if (q->backlogs[i] > maxbacklog) {`
			`@@ -151,15 +155,24 @@ static unsigned int fq_codel_drop(struct`
			`idx = i;`
			`}`
			`}`
			`+`
			`+ /* Our goal is to drop half of this fat flow backlog */`
			`+ threshold = maxbacklog >> 1;`
			`+`
			`flow = &q->flows[idx];`
			`- skb = dequeue_head(flow);`
			`- len = qdisc_pkt_len(skb);`
			`+ len = 0;`
			`+ i = 0;`
			`+ do {`
			`+ skb = dequeue_head(flow);`
			`+ len += qdisc_pkt_len(skb);`
			`+ kfree_skb(skb);`
			`+ } while (++i < max_packets && len < threshold);`
			`+`
			`+ flow->dropped += i;`
			`q->backlogs[idx] -= len;`
			`- sch->q.qlen--;`
			`- qdisc_qstats_drop(sch);`
			`- qdisc_qstats_backlog_dec(sch, skb);`
			`- kfree_skb(skb);`
			`- flow->dropped++;`
			`+ sch->qstats.drops += i;`
			`+ sch->qstats.backlog -= len;`
			`+ sch->q.qlen -= i;`
			`return idx;`
			`}`

			`@@ -168,14 +181,14 @@ static unsigned int fq_codel_qdisc_drop(`
			`unsigned int prev_backlog;`

			`prev_backlog = sch->qstats.backlog;`
			`- fq_codel_drop(sch);`
			`+ fq_codel_drop(sch, 1U);`
			`return prev_backlog - sch->qstats.backlog;`
			`}`

			`static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch)`
			`{`
			`struct fq_codel_sched_data *q = qdisc_priv(sch);`
			`- unsigned int idx, prev_backlog;`
			`+ unsigned int idx, prev_backlog, prev_qlen;`
			`struct fq_codel_flow *flow;`
			`int uninitialized_var(ret);`

			`@@ -204,16 +217,22 @@ static int fq_codel_enqueue(struct sk_bu`
			`return NET_XMIT_SUCCESS;`

			`prev_backlog = sch->qstats.backlog;`
			`- q->drop_overlimit++;`
			`- /* Return Congestion Notification only if we dropped a packet`
			`- * from this flow.`
			`+ prev_qlen = sch->q.qlen;`
			`+`
			`+ /* fq_codel_drop() is quite expensive, as it performs a linear search`
			`+ * in q->backlogs[] to find a fat flow.`
			`+ * So instead of dropping a single packet, drop half of its backlog`
			`+ * with a 64 packets limit to not add a too big cpu spike here.`
			`*/`
			`- if (fq_codel_drop(sch) == idx)`
			`- return NET_XMIT_CN;`
			`+ ret = fq_codel_drop(sch, q->drop_batch_size);`
kernel: update kernel 4.4 to version 4.4.11 Signed-off-by: Álvaro Fernández Rojas <noltari@gmail.com> 2016-05-19 17:32:46 +00:00			`+`
			`+ q->drop_overlimit += prev_qlen - sch->q.qlen;`
kernel: backport patches improving fq_codel drop behavior Signed-off-by: Felix Fietkau <nbd@nbd.name> 2016-05-12 19:04:50 +00:00
			`- /* As we dropped a packet, better let upper stack know this */`
			`- qdisc_tree_reduce_backlog(sch, 1, prev_backlog - sch->qstats.backlog);`
			`- return NET_XMIT_SUCCESS;`
			`+ /* As we dropped packet(s), better let upper stack know this */`
			`+ qdisc_tree_reduce_backlog(sch, prev_qlen - sch->q.qlen,`
			`+ prev_backlog - sch->qstats.backlog);`
			`+`
			`+ return ret == idx ? NET_XMIT_CN : NET_XMIT_SUCCESS;`
			`}`

			`/* This is the specific function called from codel_dequeue()`
			`@@ -323,6 +342,7 @@ static const struct nla_policy fq_codel_`
			`[TCA_FQ_CODEL_FLOWS] = { .type = NLA_U32 },`
			`[TCA_FQ_CODEL_QUANTUM] = { .type = NLA_U32 },`
			`[TCA_FQ_CODEL_CE_THRESHOLD] = { .type = NLA_U32 },`
			`+ [TCA_FQ_CODEL_DROP_BATCH_SIZE] = { .type = NLA_U32 },`
			`};`

			`static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt)`
			`@@ -374,6 +394,9 @@ static int fq_codel_change(struct Qdisc`
			`if (tb[TCA_FQ_CODEL_QUANTUM])`
			`q->quantum = max(256U, nla_get_u32(tb[TCA_FQ_CODEL_QUANTUM]));`

			`+ if (tb[TCA_FQ_CODEL_DROP_BATCH_SIZE])`
			`+ q->drop_batch_size = min(1U, nla_get_u32(tb[TCA_FQ_CODEL_DROP_BATCH_SIZE]));`
			`+`
			`while (sch->q.qlen > sch->limit) {`
			`struct sk_buff *skb = fq_codel_dequeue(sch);`

			`@@ -419,6 +442,7 @@ static int fq_codel_init(struct Qdisc *s`

			`sch->limit = 10*1024;`
			`q->flows_cnt = 1024;`
			`+ q->drop_batch_size = 64;`
			`q->quantum = psched_mtu(qdisc_dev(sch));`
			`q->perturbation = prandom_u32();`
			`INIT_LIST_HEAD(&q->new_flows);`
			`@@ -476,6 +500,8 @@ static int fq_codel_dump(struct Qdisc *s`
			`q->cparams.ecn) ||`
			`nla_put_u32(skb, TCA_FQ_CODEL_QUANTUM,`
			`q->quantum) ||`
			`+ nla_put_u32(skb, TCA_FQ_CODEL_DROP_BATCH_SIZE,`
			`+ q->drop_batch_size) ||`
			`nla_put_u32(skb, TCA_FQ_CODEL_FLOWS,`
			`q->flows_cnt))`
			`goto nla_put_failure;`