From 4daeb940da70a827097033454ea72934ea046320 Mon Sep 17 00:00:00 2001
From: wanghonghao
Date: Thu, 11 Apr 2024 11:56:25 +0800
Subject: [PATCH] io_u: make rate-submit in inline mode even

In `inline` submit mode, a thread that reaches its rate limit waits
until all previously submitted IOs have completed. This can exceed the
intended wait time and results in batched, uneven submissions.

Instead, reap completion events inside usec_sleep() while waiting, and
no longer call io_u_quiesce() before sleeping in rate_ddir(), so that
fixed-rate submission is as smooth as in `offload` mode.

Signed-off-by: wanghonghao
---
 backend.c     | 12 ++++++------
 io_u.c        | 22 +++++++---------------
 io_u.h        |  2 +-
 rate-submit.c |  4 ++--
 time.c        | 21 +++++++++++++++++++++
 5 files changed, 37 insertions(+), 24 deletions(-)

diff --git a/backend.c b/backend.c
index fb7dc68a92..34d5335c1e 100644
--- a/backend.c
+++ b/backend.c
@@ -233,7 +233,7 @@ static void cleanup_pending_aio(struct thread_data *td)
 	/*
 	 * get immediately available events, if any
 	 */
-	r = io_u_queued_complete(td, 0);
+	r = io_u_queued_complete(td, 0, NULL);
 
 	/*
 	 * now cancel remaining active events
@@ -252,7 +252,7 @@ static void cleanup_pending_aio(struct thread_data *td)
 	}
 
 	if (td->cur_depth)
-		r = io_u_queued_complete(td, td->cur_depth);
+		r = io_u_queued_complete(td, td->cur_depth, NULL);
 }
 
 /*
@@ -281,7 +281,7 @@ static bool fio_io_sync(struct thread_data *td, struct fio_file *f)
 	switch (ret) {
 	case FIO_Q_QUEUED:
 		td_io_commit(td);
-		if (io_u_queued_complete(td, 1) < 0)
+		if (io_u_queued_complete(td, 1, NULL) < 0)
 			return true;
 		break;
 	case FIO_Q_COMPLETED:
@@ -433,7 +433,7 @@ static int wait_for_completions(struct thread_data *td, struct timespec *time)
 		fio_gettime(time, NULL);
 
 	do {
-		ret = io_u_queued_complete(td, min_evts);
+		ret = io_u_queued_complete(td, min_evts, NULL);
 		if (ret < 0)
 			break;
 	} while (full && (td->cur_depth > td->o.iodepth_low));
@@ -753,7 +753,7 @@ static void do_verify(struct thread_data *td, uint64_t verify_bytes)
 		min_events = td->cur_depth;
 
 		if (min_events)
-			ret = io_u_queued_complete(td, min_events);
+			ret = io_u_queued_complete(td, min_events, NULL);
 	} else
 		cleanup_pending_aio(td);
 
@@ -1175,7 +1175,7 @@ static void do_io(struct thread_data *td, uint64_t *bytes_done)
 			i = td->cur_depth;
 
 		if (i) {
-			ret = io_u_queued_complete(td, i);
+			ret = io_u_queued_complete(td, i, NULL);
 			if (td->o.fill_device &&
 			    (td->error == ENOSPC || td->error == EDQUOT))
 				td->error = 0;
diff --git a/io_u.c b/io_u.c
index 09e5f15a8b..541a709fbc 100644
--- a/io_u.c
+++ b/io_u.c
@@ -653,7 +653,7 @@ int io_u_quiesce(struct thread_data *td)
 	td_io_commit(td);
 
 	while (td->io_u_in_flight) {
-		ret = io_u_queued_complete(td, 1);
+		ret = io_u_queued_complete(td, 1, NULL);
 		if (ret > 0)
 			completed += ret;
 		else if (ret < 0)
@@ -709,24 +709,17 @@ static enum fio_ddir rate_ddir(struct thread_data *td, enum fio_ddir ddir)
 	} else
 		usec = td->rate_next_io_time[ddir] - now;
 
-	if (td->o.io_submit_mode == IO_MODE_INLINE)
-		io_u_quiesce(td);
-
 	if (td->o.timeout && ((usec + now) > td->o.timeout)) {
 		/*
 		 * check if the usec is capable of taking negative values
 		 */
-		if (now > td->o.timeout) {
-			ddir = DDIR_TIMEOUT;
-			return ddir;
+		if (now < td->o.timeout) {
+			usec_sleep(td, td->o.timeout - now);
 		}
-		usec = td->o.timeout - now;
-	}
-	usec_sleep(td, usec);
-
-	now = utime_since_now(&td->epoch);
-	if ((td->o.timeout && (now > td->o.timeout)) || td->terminate)
 		ddir = DDIR_TIMEOUT;
+	} else {
+		usec_sleep(td, usec);
+	}
 
 	return ddir;
 }
@@ -2251,10 +2244,9 @@ int io_u_sync_complete(struct thread_data *td, struct io_u *io_u)
 /*
  * Called to complete min_events number of io for the async engines.
  */
-int io_u_queued_complete(struct thread_data *td, int min_evts)
+int io_u_queued_complete(struct thread_data *td, int min_evts, struct timespec *tvp)
 {
 	struct io_completion_data icd;
-	struct timespec *tvp = NULL;
 	int ret;
 	struct timespec ts = { .tv_sec = 0, .tv_nsec = 0, };
 
diff --git a/io_u.h b/io_u.h
index ab93d50f96..3f49198a01 100644
--- a/io_u.h
+++ b/io_u.h
@@ -155,7 +155,7 @@ extern void put_io_u(struct thread_data *, struct io_u *);
 extern void clear_io_u(struct thread_data *, struct io_u *);
 extern void requeue_io_u(struct thread_data *, struct io_u **);
 extern int __must_check io_u_sync_complete(struct thread_data *, struct io_u *);
-extern int __must_check io_u_queued_complete(struct thread_data *, int);
+extern int __must_check io_u_queued_complete(struct thread_data *, int, struct timespec *);
 extern void io_u_queued(struct thread_data *, struct io_u *);
 extern int io_u_quiesce(struct thread_data *);
 extern void io_u_log_error(struct thread_data *, struct io_u *);
diff --git a/rate-submit.c b/rate-submit.c
index 92be3df75e..74500d6e8c 100644
--- a/rate-submit.c
+++ b/rate-submit.c
@@ -81,7 +81,7 @@ static int io_workqueue_fn(struct submit_worker *sw,
 		ret = td_io_queue(td, io_u);
 		if (ret != FIO_Q_BUSY)
 			break;
-		ret = io_u_queued_complete(td, 1);
+		ret = io_u_queued_complete(td, 1, NULL);
 		if (ret > 0)
 			td->cur_depth -= ret;
 		else if (ret < 0)
@@ -103,7 +103,7 @@ static int io_workqueue_fn(struct submit_worker *sw,
 		else
 			min_evts = 0;
 
-		ret = io_u_queued_complete(td, min_evts);
+		ret = io_u_queued_complete(td, min_evts, NULL);
 		if (ret > 0)
 			td->cur_depth -= ret;
 	}
diff --git a/time.c b/time.c
index 7f85c8de3b..bb82e43b4b 100644
--- a/time.c
+++ b/time.c
@@ -55,6 +55,27 @@ uint64_t usec_sleep(struct thread_data *td, unsigned long usec)
 	struct timespec tv;
 	uint64_t t = 0;
 
+	if (td->o.io_submit_mode == IO_MODE_INLINE) {
+		struct timespec ts;
+		int err = 0;
+
+		fio_gettime(&tv, NULL);
+		if (td->io_u_queued || td->cur_depth)
+			td_io_commit(td);
+
+		while ((t = utime_since_now(&tv)) < usec &&
+		       td->io_u_in_flight && err == 0) {
+			ts.tv_sec = (usec - t) / 1000000;
+			ts.tv_nsec = (usec - t) % 1000000 * 1000;
+			err = io_u_queued_complete(td, 1, &ts);
+		}
+
+		if (td->flags & TD_F_REGROW_LOGS)
+			regrow_logs(td);
+
+		usec = t < usec ? usec - t : 0;
+	}
+
 	do {
 		unsigned long ts = usec;
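
For reference, a minimal job file that exercises the path this patch
changes: a rate-capped job with io_submit_mode=inline and a queue depth
greater than one. The file name and limits below are hypothetical,
chosen only to illustrate the scenario. Before this patch, rate_ddir()
quiesced the whole queue before each rate sleep, so IOs were issued in
bursts; with it, completions are reaped inside usec_sleep() and
submissions stay evenly paced.

  [rate-inline]
  ; async engine, so IOs can remain in flight across the rate sleep
  ioengine=libaio
  direct=1
  ; hypothetical test file and rate cap, for illustration only
  filename=/tmp/fio.rate.test
  size=1g
  rw=randwrite
  bs=4k
  ; depth > 1 so completions can overlap with rate-limit waits
  iodepth=32
  rate_iops=1000
  io_submit_mode=inline
  time_based=1
  runtime=30

Logging per-interval IOPS (for example with write_iops_log= and a small
log_avg_msec) before and after the patch should show submissions
smoothing out to match the pacing already achieved with
io_submit_mode=offload.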