net: sched: do not acquire qdisc spinlock in qdisc/class stats dump
Large tc dumps (tc -s {qdisc|class} sh dev ethX) done by the Google BwE host agent [1] are problematic at scale: for each qdisc/class found in the dump, we currently lock the root qdisc spinlock in order to get stats. Sampling stats every 5 seconds from thousands of HTB classes is a challenge when the root qdisc spinlock is under high pressure. Not only do the dumps take time, they also slow down the fast path (queue/dequeue packets) by 10% to 20% in some cases.

An audit of existing qdiscs showed that sch_fq_codel is the only qdisc that might need the qdisc lock, in fq_codel_dump_stats() and fq_codel_dump_class_stats().

In v2 of this patch, I now use the Qdisc running seqcount to provide consistent reads of packets/bytes counters, regardless of 32/64 bit arches. I also changed rate estimators to use the same infrastructure so that they no longer need to lock the root qdisc lock.

[1] http://static.googleusercontent.com/media/research.google.com/en//pubs/archive/43838.pdf

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Cong Wang <xiyou.wangcong@gmail.com>
Cc: Jamal Hadi Salim <jhs@mojatatu.com>
Cc: John Fastabend <john.fastabend@gmail.com>
Cc: Kevin Athey <kda@google.com>
Cc: Xiaotian Pei <xiaotian@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:

committed by
David S. Miller

parent
f9eb8aea2a
commit
edb09eb17e
@@ -287,7 +287,7 @@ err2:
|
||||
if (est) {
|
||||
err = gen_new_estimator(&p->tcfc_bstats, p->cpu_bstats,
|
||||
&p->tcfc_rate_est,
|
||||
&p->tcfc_lock, est);
|
||||
&p->tcfc_lock, NULL, est);
|
||||
if (err) {
|
||||
free_percpu(p->cpu_qstats);
|
||||
goto err2;
|
||||
@@ -671,7 +671,7 @@ int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *a,
|
||||
if (err < 0)
|
||||
goto errout;
|
||||
|
||||
if (gnet_stats_copy_basic(&d, p->cpu_bstats, &p->tcfc_bstats) < 0 ||
|
||||
if (gnet_stats_copy_basic(NULL, &d, p->cpu_bstats, &p->tcfc_bstats) < 0 ||
|
||||
gnet_stats_copy_rate_est(&d, &p->tcfc_bstats,
|
||||
&p->tcfc_rate_est) < 0 ||
|
||||
gnet_stats_copy_queue(&d, p->cpu_qstats,
|
||||
|
@@ -185,7 +185,8 @@ override:
|
||||
if (est) {
|
||||
err = gen_replace_estimator(&police->tcf_bstats, NULL,
|
||||
&police->tcf_rate_est,
|
||||
&police->tcf_lock, est);
|
||||
&police->tcf_lock,
|
||||
NULL, est);
|
||||
if (err)
|
||||
goto failure_unlock;
|
||||
} else if (tb[TCA_POLICE_AVRATE] &&
|
||||
|
@@ -982,7 +982,7 @@ qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
|
||||
rcu_assign_pointer(sch->stab, stab);
|
||||
}
|
||||
if (tca[TCA_RATE]) {
|
||||
spinlock_t *root_lock;
|
||||
seqcount_t *running;
|
||||
|
||||
err = -EOPNOTSUPP;
|
||||
if (sch->flags & TCQ_F_MQROOT)
|
||||
@@ -991,14 +991,15 @@ qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
|
||||
if ((sch->parent != TC_H_ROOT) &&
|
||||
!(sch->flags & TCQ_F_INGRESS) &&
|
||||
(!p || !(p->flags & TCQ_F_MQROOT)))
|
||||
root_lock = qdisc_root_sleeping_lock(sch);
|
||||
running = qdisc_root_sleeping_running(sch);
|
||||
else
|
||||
root_lock = qdisc_lock(sch);
|
||||
running = &sch->running;
|
||||
|
||||
err = gen_new_estimator(&sch->bstats,
|
||||
sch->cpu_bstats,
|
||||
&sch->rate_est,
|
||||
root_lock,
|
||||
NULL,
|
||||
running,
|
||||
tca[TCA_RATE]);
|
||||
if (err)
|
||||
goto err_out4;
|
||||
@@ -1061,7 +1062,8 @@ static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
|
||||
gen_replace_estimator(&sch->bstats,
|
||||
sch->cpu_bstats,
|
||||
&sch->rate_est,
|
||||
qdisc_root_sleeping_lock(sch),
|
||||
NULL,
|
||||
qdisc_root_sleeping_running(sch),
|
||||
tca[TCA_RATE]);
|
||||
}
|
||||
out:
|
||||
@@ -1369,8 +1371,7 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
|
||||
goto nla_put_failure;
|
||||
|
||||
if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
|
||||
qdisc_root_sleeping_lock(q), &d,
|
||||
TCA_PAD) < 0)
|
||||
NULL, &d, TCA_PAD) < 0)
|
||||
goto nla_put_failure;
|
||||
|
||||
if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
|
||||
@@ -1381,7 +1382,8 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
|
||||
cpu_qstats = q->cpu_qstats;
|
||||
}
|
||||
|
||||
if (gnet_stats_copy_basic(&d, cpu_bstats, &q->bstats) < 0 ||
|
||||
if (gnet_stats_copy_basic(qdisc_root_sleeping_running(q),
|
||||
&d, cpu_bstats, &q->bstats) < 0 ||
|
||||
gnet_stats_copy_rate_est(&d, &q->bstats, &q->rate_est) < 0 ||
|
||||
gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
|
||||
goto nla_put_failure;
|
||||
@@ -1684,8 +1686,7 @@ static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
|
||||
goto nla_put_failure;
|
||||
|
||||
if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
|
||||
qdisc_root_sleeping_lock(q), &d,
|
||||
TCA_PAD) < 0)
|
||||
NULL, &d, TCA_PAD) < 0)
|
||||
goto nla_put_failure;
|
||||
|
||||
if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
|
||||
|
@@ -637,7 +637,8 @@ atm_tc_dump_class_stats(struct Qdisc *sch, unsigned long arg,
|
||||
{
|
||||
struct atm_flow_data *flow = (struct atm_flow_data *)arg;
|
||||
|
||||
if (gnet_stats_copy_basic(d, NULL, &flow->bstats) < 0 ||
|
||||
if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
|
||||
d, NULL, &flow->bstats) < 0 ||
|
||||
gnet_stats_copy_queue(d, NULL, &flow->qstats, flow->q->q.qlen) < 0)
|
||||
return -1;
|
||||
|
||||
|
@@ -1600,7 +1600,8 @@ cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
|
||||
if (cl->undertime != PSCHED_PASTPERFECT)
|
||||
cl->xstats.undertime = cl->undertime - q->now;
|
||||
|
||||
if (gnet_stats_copy_basic(d, NULL, &cl->bstats) < 0 ||
|
||||
if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
|
||||
d, NULL, &cl->bstats) < 0 ||
|
||||
gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 ||
|
||||
gnet_stats_copy_queue(d, NULL, &cl->qstats, cl->q->q.qlen) < 0)
|
||||
return -1;
|
||||
@@ -1755,7 +1756,8 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t
|
||||
if (tca[TCA_RATE]) {
|
||||
err = gen_replace_estimator(&cl->bstats, NULL,
|
||||
&cl->rate_est,
|
||||
qdisc_root_sleeping_lock(sch),
|
||||
NULL,
|
||||
qdisc_root_sleeping_running(sch),
|
||||
tca[TCA_RATE]);
|
||||
if (err) {
|
||||
qdisc_put_rtab(rtab);
|
||||
@@ -1848,7 +1850,8 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t
|
||||
|
||||
if (tca[TCA_RATE]) {
|
||||
err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est,
|
||||
qdisc_root_sleeping_lock(sch),
|
||||
NULL,
|
||||
qdisc_root_sleeping_running(sch),
|
||||
tca[TCA_RATE]);
|
||||
if (err) {
|
||||
kfree(cl);
|
||||
|
@@ -91,7 +91,8 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
|
||||
if (tca[TCA_RATE]) {
|
||||
err = gen_replace_estimator(&cl->bstats, NULL,
|
||||
&cl->rate_est,
|
||||
qdisc_root_sleeping_lock(sch),
|
||||
NULL,
|
||||
qdisc_root_sleeping_running(sch),
|
||||
tca[TCA_RATE]);
|
||||
if (err)
|
||||
return err;
|
||||
@@ -119,7 +120,8 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
|
||||
|
||||
if (tca[TCA_RATE]) {
|
||||
err = gen_replace_estimator(&cl->bstats, NULL, &cl->rate_est,
|
||||
qdisc_root_sleeping_lock(sch),
|
||||
NULL,
|
||||
qdisc_root_sleeping_running(sch),
|
||||
tca[TCA_RATE]);
|
||||
if (err) {
|
||||
qdisc_destroy(cl->qdisc);
|
||||
@@ -279,7 +281,8 @@ static int drr_dump_class_stats(struct Qdisc *sch, unsigned long arg,
|
||||
if (qlen)
|
||||
xstats.deficit = cl->deficit;
|
||||
|
||||
if (gnet_stats_copy_basic(d, NULL, &cl->bstats) < 0 ||
|
||||
if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
|
||||
d, NULL, &cl->bstats) < 0 ||
|
||||
gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 ||
|
||||
gnet_stats_copy_queue(d, NULL, &cl->qdisc->qstats, qlen) < 0)
|
||||
return -1;
|
||||
|
@@ -566,11 +566,13 @@ static int fq_codel_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
|
||||
st.qdisc_stats.memory_usage = q->memory_usage;
|
||||
st.qdisc_stats.drop_overmemory = q->drop_overmemory;
|
||||
|
||||
sch_tree_lock(sch);
|
||||
list_for_each(pos, &q->new_flows)
|
||||
st.qdisc_stats.new_flows_len++;
|
||||
|
||||
list_for_each(pos, &q->old_flows)
|
||||
st.qdisc_stats.old_flows_len++;
|
||||
sch_tree_unlock(sch);
|
||||
|
||||
return gnet_stats_copy_app(d, &st, sizeof(st));
|
||||
}
|
||||
@@ -624,7 +626,7 @@ static int fq_codel_dump_class_stats(struct Qdisc *sch, unsigned long cl,
|
||||
|
||||
if (idx < q->flows_cnt) {
|
||||
const struct fq_codel_flow *flow = &q->flows[idx];
|
||||
const struct sk_buff *skb = flow->head;
|
||||
const struct sk_buff *skb;
|
||||
|
||||
memset(&xstats, 0, sizeof(xstats));
|
||||
xstats.type = TCA_FQ_CODEL_XSTATS_CLASS;
|
||||
@@ -642,9 +644,14 @@ static int fq_codel_dump_class_stats(struct Qdisc *sch, unsigned long cl,
|
||||
codel_time_to_us(delta) :
|
||||
-codel_time_to_us(-delta);
|
||||
}
|
||||
while (skb) {
|
||||
qs.qlen++;
|
||||
skb = skb->next;
|
||||
if (flow->head) {
|
||||
sch_tree_lock(sch);
|
||||
skb = flow->head;
|
||||
while (skb) {
|
||||
qs.qlen++;
|
||||
skb = skb->next;
|
||||
}
|
||||
sch_tree_unlock(sch);
|
||||
}
|
||||
qs.backlog = q->backlogs[idx];
|
||||
qs.drops = flow->dropped;
|
||||
|
@@ -1015,11 +1015,10 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
|
||||
cur_time = psched_get_time();
|
||||
|
||||
if (tca[TCA_RATE]) {
|
||||
spinlock_t *lock = qdisc_root_sleeping_lock(sch);
|
||||
|
||||
err = gen_replace_estimator(&cl->bstats, NULL,
|
||||
&cl->rate_est,
|
||||
lock,
|
||||
NULL,
|
||||
qdisc_root_sleeping_running(sch),
|
||||
tca[TCA_RATE]);
|
||||
if (err)
|
||||
return err;
|
||||
@@ -1068,7 +1067,8 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
|
||||
|
||||
if (tca[TCA_RATE]) {
|
||||
err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est,
|
||||
qdisc_root_sleeping_lock(sch),
|
||||
NULL,
|
||||
qdisc_root_sleeping_running(sch),
|
||||
tca[TCA_RATE]);
|
||||
if (err) {
|
||||
kfree(cl);
|
||||
@@ -1373,7 +1373,7 @@ hfsc_dump_class_stats(struct Qdisc *sch, unsigned long arg,
|
||||
xstats.work = cl->cl_total;
|
||||
xstats.rtwork = cl->cl_cumul;
|
||||
|
||||
if (gnet_stats_copy_basic(d, NULL, &cl->bstats) < 0 ||
|
||||
if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), d, NULL, &cl->bstats) < 0 ||
|
||||
gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 ||
|
||||
gnet_stats_copy_queue(d, NULL, &cl->qstats, cl->qdisc->q.qlen) < 0)
|
||||
return -1;
|
||||
|
@@ -1141,7 +1141,8 @@ htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d)
|
||||
cl->xstats.tokens = PSCHED_NS2TICKS(cl->tokens);
|
||||
cl->xstats.ctokens = PSCHED_NS2TICKS(cl->ctokens);
|
||||
|
||||
if (gnet_stats_copy_basic(d, NULL, &cl->bstats) < 0 ||
|
||||
if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
|
||||
d, NULL, &cl->bstats) < 0 ||
|
||||
gnet_stats_copy_rate_est(d, NULL, &cl->rate_est) < 0 ||
|
||||
gnet_stats_copy_queue(d, NULL, &cl->qstats, qlen) < 0)
|
||||
return -1;
|
||||
@@ -1395,7 +1396,8 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
|
||||
if (htb_rate_est || tca[TCA_RATE]) {
|
||||
err = gen_new_estimator(&cl->bstats, NULL,
|
||||
&cl->rate_est,
|
||||
qdisc_root_sleeping_lock(sch),
|
||||
NULL,
|
||||
qdisc_root_sleeping_running(sch),
|
||||
tca[TCA_RATE] ? : &est.nla);
|
||||
if (err) {
|
||||
kfree(cl);
|
||||
@@ -1457,11 +1459,10 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
|
||||
parent->children++;
|
||||
} else {
|
||||
if (tca[TCA_RATE]) {
|
||||
spinlock_t *lock = qdisc_root_sleeping_lock(sch);
|
||||
|
||||
err = gen_replace_estimator(&cl->bstats, NULL,
|
||||
&cl->rate_est,
|
||||
lock,
|
||||
NULL,
|
||||
qdisc_root_sleeping_running(sch),
|
||||
tca[TCA_RATE]);
|
||||
if (err)
|
||||
return err;
|
||||
|
@@ -199,7 +199,7 @@ static int mq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
|
||||
struct netdev_queue *dev_queue = mq_queue_get(sch, cl);
|
||||
|
||||
sch = dev_queue->qdisc_sleeping;
|
||||
if (gnet_stats_copy_basic(d, NULL, &sch->bstats) < 0 ||
|
||||
if (gnet_stats_copy_basic(&sch->running, d, NULL, &sch->bstats) < 0 ||
|
||||
gnet_stats_copy_queue(d, NULL, &sch->qstats, sch->q.qlen) < 0)
|
||||
return -1;
|
||||
return 0;
|
||||
|
@@ -342,7 +342,8 @@ static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl,
|
||||
* hold here is the lock on dev_queue->qdisc_sleeping
|
||||
* also acquired below.
|
||||
*/
|
||||
spin_unlock_bh(d->lock);
|
||||
if (d->lock)
|
||||
spin_unlock_bh(d->lock);
|
||||
|
||||
for (i = tc.offset; i < tc.offset + tc.count; i++) {
|
||||
struct netdev_queue *q = netdev_get_tx_queue(dev, i);
|
||||
@@ -359,15 +360,17 @@ static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl,
|
||||
spin_unlock_bh(qdisc_lock(qdisc));
|
||||
}
|
||||
/* Reclaim root sleeping lock before completing stats */
|
||||
spin_lock_bh(d->lock);
|
||||
if (gnet_stats_copy_basic(d, NULL, &bstats) < 0 ||
|
||||
if (d->lock)
|
||||
spin_lock_bh(d->lock);
|
||||
if (gnet_stats_copy_basic(NULL, d, NULL, &bstats) < 0 ||
|
||||
gnet_stats_copy_queue(d, NULL, &qstats, qlen) < 0)
|
||||
return -1;
|
||||
} else {
|
||||
struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl);
|
||||
|
||||
sch = dev_queue->qdisc_sleeping;
|
||||
if (gnet_stats_copy_basic(d, NULL, &sch->bstats) < 0 ||
|
||||
if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
|
||||
d, NULL, &sch->bstats) < 0 ||
|
||||
gnet_stats_copy_queue(d, NULL,
|
||||
&sch->qstats, sch->q.qlen) < 0)
|
||||
return -1;
|
||||
|
@@ -356,7 +356,8 @@ static int multiq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
|
||||
struct Qdisc *cl_q;
|
||||
|
||||
cl_q = q->queues[cl - 1];
|
||||
if (gnet_stats_copy_basic(d, NULL, &cl_q->bstats) < 0 ||
|
||||
if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
|
||||
d, NULL, &cl_q->bstats) < 0 ||
|
||||
gnet_stats_copy_queue(d, NULL, &cl_q->qstats, cl_q->q.qlen) < 0)
|
||||
return -1;
|
||||
|
||||
|
@@ -319,7 +319,8 @@ static int prio_dump_class_stats(struct Qdisc *sch, unsigned long cl,
|
||||
struct Qdisc *cl_q;
|
||||
|
||||
cl_q = q->queues[cl - 1];
|
||||
if (gnet_stats_copy_basic(d, NULL, &cl_q->bstats) < 0 ||
|
||||
if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
|
||||
d, NULL, &cl_q->bstats) < 0 ||
|
||||
gnet_stats_copy_queue(d, NULL, &cl_q->qstats, cl_q->q.qlen) < 0)
|
||||
return -1;
|
||||
|
||||
|
@@ -460,7 +460,8 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
|
||||
if (tca[TCA_RATE]) {
|
||||
err = gen_replace_estimator(&cl->bstats, NULL,
|
||||
&cl->rate_est,
|
||||
qdisc_root_sleeping_lock(sch),
|
||||
NULL,
|
||||
qdisc_root_sleeping_running(sch),
|
||||
tca[TCA_RATE]);
|
||||
if (err)
|
||||
return err;
|
||||
@@ -486,7 +487,8 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
|
||||
if (tca[TCA_RATE]) {
|
||||
err = gen_new_estimator(&cl->bstats, NULL,
|
||||
&cl->rate_est,
|
||||
qdisc_root_sleeping_lock(sch),
|
||||
NULL,
|
||||
qdisc_root_sleeping_running(sch),
|
||||
tca[TCA_RATE]);
|
||||
if (err)
|
||||
goto destroy_class;
|
||||
@@ -663,7 +665,8 @@ static int qfq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
|
||||
xstats.weight = cl->agg->class_weight;
|
||||
xstats.lmax = cl->agg->lmax;
|
||||
|
||||
if (gnet_stats_copy_basic(d, NULL, &cl->bstats) < 0 ||
|
||||
if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
|
||||
d, NULL, &cl->bstats) < 0 ||
|
||||
gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 ||
|
||||
gnet_stats_copy_queue(d, NULL,
|
||||
&cl->qdisc->qstats, cl->qdisc->q.qlen) < 0)
|
||||
|
Reference in New Issue
Block a user