From 506e931f92defdc60c1dc4aa2ff4a19a5dcd8618 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Wed, 7 May 2014 10:26:44 -0600
Subject: [PATCH] blk-mq: add basic round-robin of what CPU to queue workqueue
 work on

Right now we just pick the first CPU in the mask, but that can
easily overload that one. Add some basic batching and round-robin
all the entries in the mask instead.

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c         | 45 +++++++++++++++++++++++++++++-------------
 include/linux/blk-mq.h |  4 ++++
 2 files changed, 35 insertions(+), 14 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 0d379830a27..2410e0cb7ae 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -670,6 +670,30 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
 	}
 }
 
+/*
+ * It'd be great if the workqueue API had a way to pass
+ * in a mask and had some smarts for more clever placement.
+ * For now we just round-robin here, switching for every
+ * BLK_MQ_CPU_WORK_BATCH queued items.
+ */
+static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
+{
+	int cpu = hctx->next_cpu;
+
+	if (--hctx->next_cpu_batch <= 0) {
+		int next_cpu;
+
+		next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask);
+		if (next_cpu >= nr_cpu_ids)
+			next_cpu = cpumask_first(hctx->cpumask);
+
+		hctx->next_cpu = next_cpu;
+		hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
+	}
+
+	return cpu;
+}
+
 void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
 {
 	if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
@@ -682,13 +706,7 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
 	else {
 		unsigned int cpu;
 
-		/*
-		 * It'd be great if the workqueue API had a way to pass
-		 * in a mask and had some smarts for more clever placement
-		 * than the first CPU. Or we could round-robin here. For now,
-		 * just queue on the first CPU.
-		 */
-		cpu = cpumask_first(hctx->cpumask);
+		cpu = blk_mq_hctx_next_cpu(hctx);
 		kblockd_schedule_delayed_work_on(cpu, &hctx->run_work, 0);
 	}
 }
@@ -795,13 +813,7 @@ void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
 	else {
 		unsigned int cpu;
 
-		/*
-		 * It'd be great if the workqueue API had a way to pass
-		 * in a mask and had some smarts for more clever placement
-		 * than the first CPU. Or we could round-robin here. For now,
-		 * just queue on the first CPU.
-		 */
-		cpu = cpumask_first(hctx->cpumask);
+		cpu = blk_mq_hctx_next_cpu(hctx);
 		kblockd_schedule_delayed_work_on(cpu, &hctx->delay_work, tmo);
 	}
 }
@@ -1378,6 +1390,11 @@ static void blk_mq_map_swqueue(struct request_queue *q)
 		ctx->index_hw = hctx->nr_ctx;
 		hctx->ctxs[hctx->nr_ctx++] = ctx;
 	}
+
+	queue_for_each_hw_ctx(q, hctx, i) {
+		hctx->next_cpu = cpumask_first(hctx->cpumask);
+		hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
+	}
 }
 
 struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 3b561d651a0..5bd677e2dcb 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -21,6 +21,8 @@ struct blk_mq_hw_ctx {
 	struct delayed_work	run_work;
 	struct delayed_work	delay_work;
 	cpumask_var_t		cpumask;
+	int			next_cpu;
+	int			next_cpu_batch;
 
 	unsigned long		flags;		/* BLK_MQ_F_* flags */
 
@@ -126,6 +128,8 @@ enum {
 	BLK_MQ_S_STOPPED	= 0,
 
 	BLK_MQ_MAX_DEPTH	= 2048,
+
+	BLK_MQ_CPU_WORK_BATCH	= 8,
 };
 
 struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *);
-- 
2.46.0