From 81adb591f4c389c63a5bce1320af4d12fb83a155 Mon Sep 17 00:00:00 2001
From: Andrew Gooding <andy@aerospike.com>
Date: Wed, 10 Feb 2021 11:04:02 -0800
Subject: [PATCH 01/10] Comment tweak (trigger build).

---
 cf/include/log.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cf/include/log.h b/cf/include/log.h
index d868e7c9..e2522974 100644
--- a/cf/include/log.h
+++ b/cf/include/log.h
@@ -1,7 +1,7 @@
 /*
  * log.h
  *
- * Copyright (C) 2019 Aerospike, Inc.
+ * Copyright (C) 2019-2021 Aerospike, Inc.
  *
  * Portions may be licensed to Aerospike, Inc. under one or more contributor
  * license agreements.

From ef0fc63d863111ee8d6019efa00d4b73af845872 Mon Sep 17 00:00:00 2001
From: Andrew Gooding <andy@aerospike.com>
Date: Mon, 22 Feb 2021 11:04:23 -0800
Subject: [PATCH 02/10] AER-6396 - increment 'dup_res_ask' stat before
 callback, which may change destination count for replication.

---
 as/src/transaction/duplicate_resolve.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/as/src/transaction/duplicate_resolve.c b/as/src/transaction/duplicate_resolve.c
index 3e28b906..6a882a3d 100644
--- a/as/src/transaction/duplicate_resolve.c
+++ b/as/src/transaction/duplicate_resolve.c
@@ -393,14 +393,14 @@ dup_res_handle_ack(cf_node node, msg* m)
 
 	dup_res_translate_result_code(rw);
 
+	cf_atomic64_add(&rw->rsv.ns->n_dup_res_ask, rw->n_dest_nodes);
+
 	bool delete_from_hash = rw->dup_res_cb(rw);
 
 	rw->dup_res_complete = true;
 
 	cf_mutex_unlock(&rw->lock);
 
-	cf_atomic64_add(&rw->rsv.ns->n_dup_res_ask, rw->n_dest_nodes);
-
 	if (delete_from_hash) {
 		rw_request_hash_delete(&hkey, rw);
 	}

From 410d8e135ad9c6603ec51c0efe9d7e9a864b3b75 Mon Sep 17 00:00:00 2001
From: Andrew Gooding <andy@aerospike.com>
Date: Tue, 23 Feb 2021 13:38:43 -0800
Subject: [PATCH 03/10] AER-6397 - when assigning AP partition versions, don't
 put identical subsets with no full parent in the same family.

---
 as/src/fabric/partition_balance.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/as/src/fabric/partition_balance.c b/as/src/fabric/partition_balance.c
index 683a8ed2..a6266a53 100644
--- a/as/src/fabric/partition_balance.c
+++ b/as/src/fabric/partition_balance.c
@@ -1350,8 +1350,11 @@ find_family(const as_partition_version* self_version, uint32_t n_families,
 		const as_partition_version family_versions[])
 {
 	for (uint32_t n = 0; n < n_families; n++) {
-		if (is_family_same(self_version, &family_versions[n])) {
-			return n;
+		const as_partition_version* version_n = &family_versions[n];
+
+		if (is_family_same(self_version, version_n)) {
+			// Identical subsets with no full parent can't share family.
+			return version_n->subset == 0 ? n : VERSION_FAMILY_UNIQUE;
 		}
 	}
 

From ff60d580d1aaabb8af8c1fe420e250eae41c5fdc Mon Sep 17 00:00:00 2001
From: Andrew Gooding <andy@aerospike.com>
Date: Wed, 3 Mar 2021 10:00:47 -0800
Subject: [PATCH 04/10] AER-6402 - skip inactive sindex correctly when
 iterating through sindexes to adjust a list sindex.

---
 as/src/base/secondary_index.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/as/src/base/secondary_index.c b/as/src/base/secondary_index.c
index 0bd6c6f2..aa8026f1 100644
--- a/as/src/base/secondary_index.c
+++ b/as/src/base/secondary_index.c
@@ -3619,7 +3619,6 @@ as_sindex_sbins_list_diff_populate(as_sindex_bin *sbins, as_namespace *ns, const
 		as_sindex *si = &ns->sindex[simatch];
 
 		if (! as_sindex_isactive(si)) {
-			ele = ele->next;
 			continue;
 		}
 

From 77dd1cc84774a0cbd41082e5f47181485cbd1a49 Mon Sep 17 00:00:00 2001
From: Andrew Gooding <andy@aerospike.com>
Date: Wed, 3 Mar 2021 22:16:58 -0800
Subject: [PATCH 05/10] AER-6403 - before swapping as_msg_op lut, make sure it
 is within the client message allocation.

---
 as/src/base/batch.c       | 7 +++++++
 as/src/base/transaction.c | 6 ++++++
 2 files changed, 13 insertions(+)

diff --git a/as/src/base/batch.c b/as/src/base/batch.c
index 209cb60d..ff749fc3 100644
--- a/as/src/base/batch.c
+++ b/as/src/base/batch.c
@@ -1024,7 +1024,14 @@ as_batch_queue_task(as_transaction* btr)
 					if (data + sizeof(as_msg_op) > limit) {
 						goto TranEnd;
 					}
+
 					op = (as_msg_op*)data;
+
+					// Swap can touch metadata bytes beyond as_msg_op struct.
+					if (as_msg_op_get_value_p(op) > limit) {
+						goto TranEnd;
+					}
+
 					as_msg_swap_op(op);
 					op = as_msg_op_get_next(op);
 					data = (uint8_t*)op;
diff --git a/as/src/base/transaction.c b/as/src/base/transaction.c
index 590df6f4..e3561d61 100644
--- a/as/src/base/transaction.c
+++ b/as/src/base/transaction.c
@@ -276,6 +276,12 @@ as_transaction_prepare(as_transaction *tr, bool swap)
 		as_msg_op* op = (as_msg_op*)p_read;
 
 		if (swap) {
+			// Swap can touch metadata bytes beyond as_msg_op struct.
+			if (as_msg_op_get_value_p(op) > p_end) {
+				cf_warning(AS_PROTO, "bad as_msg_op");
+				return false;
+			}
+
 			as_msg_swap_op(op);
 		}
 

From 393d538bbfc557744d98a3c0ee4a3e0e3af4b8ec Mon Sep 17 00:00:00 2001
From: Andrew Gooding <andy@aerospike.com>
Date: Tue, 9 Mar 2021 13:53:52 -0800
Subject: [PATCH 06/10] AER-6404 - for single-bin data-in-memory, don't copy
 uninitialized or undefined stack bin when generating a tombstone record.

---
 as/src/base/record.c     | 7 ++++++-
 as/src/storage/drv_ssd.c | 8 +++++++-
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/as/src/base/record.c b/as/src/base/record.c
index 84bb1984..31d39d00 100644
--- a/as/src/base/record.c
+++ b/as/src/base/record.c
@@ -520,7 +520,12 @@ record_apply_dim_single_bin(as_remote_record *rr, as_storage_rd *rd)
 	as_bin_destroy_all(rd->bins, rd->n_bins);
 
 	// Move the new bin into the index.
-	as_single_bin_copy(rd->bins, &new_bin);
+	if (n_new_bins == 1) {
+		as_single_bin_copy(rd->bins, &new_bin);
+	}
+	else {
+		as_bin_set_empty(rd->bins);
+	}
 
 	rd->n_bins = n_new_bins;
 	rd->bins = &new_bin;
diff --git a/as/src/storage/drv_ssd.c b/as/src/storage/drv_ssd.c
index 8ac05b2b..6da5830b 100644
--- a/as/src/storage/drv_ssd.c
+++ b/as/src/storage/drv_ssd.c
@@ -2452,7 +2452,13 @@ ssd_cold_start_add_record(drv_ssds* ssds, drv_ssd* ssd,
 
 		if (ns->single_bin) {
 			as_bin_destroy_all(old_bins, n_old_bins);
-			as_single_bin_copy(as_index_get_single_bin(r), rd.bins);
+
+			if (rd.n_bins == 1) {
+				as_single_bin_copy(as_index_get_single_bin(r), rd.bins);
+			}
+			else {
+				as_bin_set_empty(as_index_get_single_bin(r));
+			}
 		}
 		else {
 			// Success - adjust sindex, looking at old and new bins.

From e55ef7a7fbd940f6d92e0a19612d86d4ef9a0b59 Mon Sep 17 00:00:00 2001
From: Andrew Gooding <andy@aerospike.com>
Date: Tue, 9 Mar 2021 15:15:37 -0800
Subject: [PATCH 07/10] OPS-4486 - dump as much of the corrupt map header (and
 map) as possible to log before asserting.

---
 as/src/base/particle_map.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/as/src/base/particle_map.c b/as/src/base/particle_map.c
index 450258f0..9afe84ad 100644
--- a/as/src/base/particle_map.c
+++ b/as/src/base/particle_map.c
@@ -685,6 +685,12 @@ map_wire_size(const as_particle *p)
 	packed_map map;
 
 	if (! packed_map_init_from_particle(&map, p, false)) {
+		as_bin b = {
+				.particle = (as_particle *)p
+		};
+
+		as_bin_state_set_from_type(&b, AS_PARTICLE_TYPE_MAP);
+		cdt_bin_print(&b, "map");
 		cf_crash(AS_PARTICLE, "map_wire_size() invalid packed map");
 	}
 
@@ -958,10 +964,11 @@ map_flat_size(const as_particle *p)
 	packed_map map;
 
 	if (! packed_map_init_from_particle(&map, p, false)) {
-		const as_bin b = {
+		as_bin b = {
 				.particle = (as_particle *)p
 		};
 
+		as_bin_state_set_from_type(&b, AS_PARTICLE_TYPE_MAP);
 		cdt_bin_print(&b, "map");
 		cf_crash(AS_PARTICLE, "map_flat_size() invalid packed map");
 	}

From 6aebfa0544a3c6e1d2612e6693f0a405429db974 Mon Sep 17 00:00:00 2001
From: Kazuhiro HIWADA <kazuhiro.hiwada@kioxia.com>
Date: Fri, 22 Jul 2022 21:49:53 +0900
Subject: [PATCH 08/10] add submodules

---
 .gitmodules     | 8 +++++++-
 modules/lthread | 1 +
 modules/spdk    | 1 +
 3 files changed, 9 insertions(+), 1 deletion(-)
 create mode 160000 modules/lthread
 create mode 160000 modules/spdk

diff --git a/.gitmodules b/.gitmodules
index 502d027a..330f46d3 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -7,7 +7,7 @@
 	url = https://github.com/aerospike/aerospike-mod-lua.git
 [submodule "modules/jansson"]
 	path = modules/jansson
-	url = https://github.com/aerospike/jansson.git
+	url = http://github.com/akheron/jansson
 	ignore = dirty
 [submodule "modules/luajit"]
 	path = modules/luajit
@@ -22,3 +22,9 @@
 	path = modules/jemalloc
 	url = https://github.com/aerospike/jemalloc.git
 	ignore = dirty
+[submodule "modules/spdk"]
+	path = modules/spdk
+	url = http://github.com/spdk/spdk
+[submodule "modules/lthread"]
+	path = modules/lthread
+	url = http://github.com/mita/lthread
diff --git a/modules/lthread b/modules/lthread
new file mode 160000
index 00000000..0386ae3a
--- /dev/null
+++ b/modules/lthread
@@ -0,0 +1 @@
+Subproject commit 0386ae3a8cdd8543ce16eecf42214b8f2f17b191
diff --git a/modules/spdk b/modules/spdk
new file mode 160000
index 00000000..24605155
--- /dev/null
+++ b/modules/spdk
@@ -0,0 +1 @@
+Subproject commit 24605155098bb2652cbe5dbc1301da8b4e62ca69

From a5a9b0e88adb608e069345904073a7e6f8f0cbad Mon Sep 17 00:00:00 2001
From: Kazuhiro HIWADA <kazuhiro.hiwada@kioxia.com>
Date: Fri, 22 Jul 2022 21:51:37 +0900
Subject: [PATCH 09/10] update submodule version

---
 .gitmodules     | 6 +++---
 modules/jansson | 2 +-
 modules/spdk    | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/.gitmodules b/.gitmodules
index 330f46d3..fbd06ed1 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -7,7 +7,7 @@
 	url = https://github.com/aerospike/aerospike-mod-lua.git
 [submodule "modules/jansson"]
 	path = modules/jansson
-	url = http://github.com/akheron/jansson
+	url = https://github.com/akheron/jansson
 	ignore = dirty
 [submodule "modules/luajit"]
 	path = modules/luajit
@@ -24,7 +24,7 @@
 	ignore = dirty
 [submodule "modules/spdk"]
 	path = modules/spdk
-	url = http://github.com/spdk/spdk
+	url = https://github.com/spdk/spdk
 [submodule "modules/lthread"]
 	path = modules/lthread
-	url = http://github.com/mita/lthread
+	url = https://github.com/mita/lthread
diff --git a/modules/jansson b/modules/jansson
index e9ebfa7e..684e18c9 160000
--- a/modules/jansson
+++ b/modules/jansson
@@ -1 +1 @@
-Subproject commit e9ebfa7e77a6bee77df44e096b100e7131044059
+Subproject commit 684e18c927e89615c2d501737e90018f4930d6c5
diff --git a/modules/spdk b/modules/spdk
index 24605155..1f0dd58a 160000
--- a/modules/spdk
+++ b/modules/spdk
@@ -1 +1 @@
-Subproject commit 24605155098bb2652cbe5dbc1301da8b4e62ca69
+Subproject commit 1f0dd58a43b5bc8118b123eca1b07781b052293d

From f1e10fe544068888d6c61d83bacec16d0b689975 Mon Sep 17 00:00:00 2001
From: Kazuhiro HIWADA <kazuhiro.hiwada@kioxia.com>
Date: Fri, 22 Jul 2022 22:18:14 +0900
Subject: [PATCH 10/10] add the aerospike-spdk patch and README

---
 README-spdk.md       |   59 ++
 aerospike-spdk.patch | 2358 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 2417 insertions(+)
 create mode 100755 README-spdk.md
 create mode 100755 aerospike-spdk.patch

diff --git a/README-spdk.md b/README-spdk.md
new file mode 100755
index 00000000..3421bafc
--- /dev/null
+++ b/README-spdk.md
@@ -0,0 +1,59 @@
+# How to build Aerospike with SPDK patch
+
+The build process has been tested on Ubuntu 20.04
+
+1. Install dependencies to build SPDK
+
+```
+git submodule update --init --recursive
+sudo ./modules/spdk/scripts/pkgdep.sh
+```
+
+2. Apply Aerospike SPDK patch
+
+```
+patch -p1 < aerospike-spdk.patch
+```
+
+3. Build Aerospike with SPDK
+
+```
+make USE_SPDK=1 USE_LTHREAD=1
+```
+
+# How to set up Aerospike with SPDK
+
+1. Create an SPDK config file for your NVMe device
+
+```
+sudo env HUGEMEM=20480 modules/spdk/scripts/setup.sh
+sudo mkdir -p /usr/local/etc/spdk/
+modules/spdk/scripts/gen_nvme.sh  --json-with-subsystems | sudo tee /usr/local/etc/spdk/aerospike.conf
+```
+
+2. Update the Aerospike config file
+
+The developer configuration file, `aerospike_dev.conf`, contains basic settings.
+You may want to change some parameters in the file.
+
+For example, the `service-lcores` parameter specifies the list of CPU cores on which the lightweight threads run.
+The `device` parameter specifies a storage device such as the SPDK block device (bdev).
+
+# How to start Aerospike with SPDK
+
+1. Set up SPDK
+
+This script needs to be run only once after each system boot.
+
+```
+sudo env HUGEMEM=20480 modules/spdk/scripts/setup.sh
+```
+
+2. Start the Aerospike server
+
+```
+sudo su
+make init
+ulimit -n 200000
+make start
+```
diff --git a/aerospike-spdk.patch b/aerospike-spdk.patch
new file mode 100755
index 00000000..bb511015
--- /dev/null
+++ b/aerospike-spdk.patch
@@ -0,0 +1,2358 @@
+diff --git a/Makefile b/Makefile
+index 1493584..91f3833 100644
+--- a/Makefile
++++ b/Makefile
+@@ -49,9 +49,15 @@ lib: aslibs
+ 	$(MAKE) -C as $@ STATIC_LIB=1
+ 
+ .PHONY: aslibs
+-aslibs: targetdirs version $(JANSSON)/Makefile $(JEMALLOC)/Makefile $(LUAJIT)/src/luaconf.h
++aslibs: targetdirs version $(if $(filter 1,$(USE_SPDK)),$(SPDK)/include/spdk/config.h) $(JANSSON)/Makefile $(JEMALLOC)/Makefile $(LUAJIT)/src/luaconf.h
+ ifeq ($(USE_LUAJIT),1)
+ 	$(MAKE) -C $(LUAJIT) Q= TARGET_SONAME=libluajit.so CCDEBUG=-g
++endif
++ifeq ($(USE_SPDK),1)
++	$(MAKE) -C $(SPDK) CONFIG_RTE_LIBRTE_TIMER=y
++endif
++ifeq ($(USE_LTHREAD),1)
++	$(MAKE) -C $(LTHREAD) DPDK=$(SPDK)/dpdk/build EXTRA_CFLAGS=-DLTHREAD_MAX_STACK_SIZE=1048576
+ endif
+ 	$(MAKE) -C $(JEMALLOC)
+ 	$(MAKE) -C $(JANSSON)
+@@ -96,6 +102,10 @@ cleanbasic:
+ .PHONY: cleanmodules
+ cleanmodules:
+ 	$(MAKE) -C $(COMMON) clean
++	if [ -e "$(SPDK)/include/spdk/config.h" ]; then \
++		$(MAKE) -C $(SPDK) clean; \
++	fi
++	$(MAKE) -C $(LTHREAD) DPDK=$(SPDK)/dpdk/build clean
+ 	if [ -e "$(JANSSON)/Makefile" ]; then \
+ 		$(MAKE) -C $(JANSSON) clean; \
+ 		$(MAKE) -C $(JANSSON) distclean; \
+@@ -126,6 +136,8 @@ GIT_CLEAN = git clean -fdx
+ .PHONY: cleangit
+ cleangit:
+ 	cd $(COMMON); $(GIT_CLEAN)
++	cd $(SPDK); $(GIT_CLEAN)
++	cd $(LTHREAD); $(GIT_CLEAN)
+ 	cd $(JANSSON); $(GIT_CLEAN)
+ 	cd $(JEMALLOC); $(GIT_CLEAN)
+ 	cd $(LUAJIT); $(GIT_CLEAN)
+@@ -146,6 +158,9 @@ $(VERSION_OBJ):	$(VERSION_SRC)
+ .PHONY: version
+ version:	$(VERSION_OBJ)
+ 
++$(SPDK)/include/spdk/config.h:
++	cd $(SPDK) && ./configure --without-isal
++
+ $(JANSSON)/configure:
+ 	cd $(JANSSON) && autoreconf -i
+ 
+diff --git a/as/etc/aerospike_dev.conf b/as/etc/aerospike_dev.conf
+index a1a6608..b5aacaa 100644
+--- a/as/etc/aerospike_dev.conf
++++ b/as/etc/aerospike_dev.conf
+@@ -7,10 +7,19 @@ service {
+ 	# The number of concurrent connections to the database is limited by
+ 	# proto-fd-max, and by the system's maximum number of open file descriptors.
+ 	# See "man limits.conf" for how to set the system's "nofile" limit.
+-	proto-fd-max 1024
++	proto-fd-max 200000
+ 
+ 	work-directory run/work
+ 	pidfile run/asd.pid
++
++	spdk-json-conf /usr/local/etc/spdk/aerospike.conf
++
++	service-threads 360
++	service-lcores [0-17]
++#	service-threads 160
++#	service-lcores 0-7
++
++	microsecond-histograms true
+ }
+ 
+ mod-lua {
+@@ -57,22 +66,44 @@ network {
+ 
+ namespace test {
+ 	replication-factor 2
+-	memory-size 4G
++	memory-size 100G
+ 
+ 	storage-engine memory
+ }
+ 
+ namespace bar {
+ 	replication-factor 2
+-	memory-size 4G
++	memory-size 100G
+ 
+-	storage-engine memory
++#	storage-engine memory
+ 
+ 	# To use file storage backing, comment out the line above and use the
+ 	# following lines instead.
+-#	storage-engine device {
++	storage-engine device {
+ #		file /opt/aerospike/data/bar.dat
+ #		filesize 16G
+ #		data-in-memory true # Store data in memory in addition to file.
+-#	}
++		cold-start-empty true
++
++		# RAM disk
++#		device /dev/ram0
++
++		# NVMe
++#		device /dev/nvme0n1
++
++		# SPDK bdev
++		device-backend spdk-bdev
++#		device Raid0
++		device Nvme0n1
++		device Nvme1n1
++		filesize 447G
++		enable-benchmarks-storage true
++		write-block-size 1M
++#		post-write-queue 8192
++		allow-batch-inline true
++		max-write-cache 4G
++		flush-max-ms 0
++		defrag-lwm-pct 50
++		defrag-sleep 0
++	}
+ }
+diff --git a/as/include/base/cfg.h b/as/include/base/cfg.h
+index c11190f..476d89a 100644
+--- a/as/include/base/cfg.h
++++ b/as/include/base/cfg.h
+@@ -60,7 +60,7 @@ struct as_namespace_s;
+ //
+ 
+ #ifndef AS_NAMESPACE_SZ
+-#define AS_NAMESPACE_SZ 2
++#define AS_NAMESPACE_SZ 32
+ #endif
+ 
+ #define NO_NS_IX AS_NAMESPACE_SZ
+@@ -138,6 +138,7 @@ typedef struct as_config_s {
+ 	uint32_t		scan_max_done; // maximum number of finished scans kept for monitoring
+ 	uint32_t		n_scan_threads_limit;
+ 	uint32_t		n_service_threads;
++	char*			service_lcores;
+ 	uint32_t		sindex_builder_threads; // secondary index builder thread pool size
+ 	uint32_t		sindex_gc_max_rate; // Max sindex entries processed per second for gc
+ 	uint32_t		sindex_gc_period; // same as nsup_period for sindex gc
+@@ -212,6 +213,8 @@ typedef struct as_config_s {
+ 	as_sec_config	sec_cfg;
+ 	as_xdr_config	xdr_cfg; // TODO - Forcing cfg.h to include xdr.h. Consider *.
+ 
++	char*			spdk_json_conf;
++
+ 	uint32_t		n_tls_specs;
+ 	cf_tls_spec		tls_specs[MAX_TLS_SPECS];
+ 
+diff --git a/as/include/base/datamodel.h b/as/include/base/datamodel.h
+index 305568a..69811dc 100644
+--- a/as/include/base/datamodel.h
++++ b/as/include/base/datamodel.h
+@@ -796,6 +796,9 @@ struct as_namespace_s {
+ 	uint32_t		storage_write_block_size;
+ 	bool			storage_data_in_memory;
+ 
++	char*			storage_device_backend;
++	bool			storage_recycle_fds;
++	bool			storage_allow_batch_inline;
+ 	bool			storage_cache_replica_writes;
+ 	bool			storage_cold_start_empty;
+ 	bool			storage_commit_to_device; // relevant only for enterprise edition
+diff --git a/as/include/base/service.h b/as/include/base/service.h
+index 3bc086c..08caa9c 100644
+--- a/as/include/base/service.h
++++ b/as/include/base/service.h
+@@ -29,6 +29,7 @@
+ #include <stdbool.h>
+ #include <stdint.h>
+ 
++#include "aerospike/as_monitor.h"
+ #include "citrusleaf/cf_digest.h"
+ 
+ #include "socket.h"
+@@ -89,3 +90,8 @@ as_service_enqueue_internal(struct as_transaction_s* tr)
+ {
+ 	as_service_enqueue_internal_raw(tr, NULL, 0, false);
+ }
++
++#ifdef USE_LTHREAD
++void as_service_run_threads(void);
++extern as_monitor as_service_run_monitor;
++#endif
+diff --git a/as/include/storage/drv_ssd.h b/as/include/storage/drv_ssd.h
+index fa66978..09e32c7 100644
+--- a/as/include/storage/drv_ssd.h
++++ b/as/include/storage/drv_ssd.h
+@@ -56,6 +56,7 @@ struct as_index_s;
+ struct as_namespace_s;
+ struct as_storage_rd_s;
+ struct drv_ssd_s;
++struct ssd_ops;
+ 
+ 
+ //==========================================================
+@@ -190,8 +191,30 @@ typedef struct drv_ssd_s {
+ 	histogram		*hist_large_block_read;
+ 	histogram		*hist_write;
+ 	histogram		*hist_shadow_write;
++
++	const struct ssd_ops	*ops;
++	void			*priv;
+ } drv_ssd;
+ 
++typedef union {
++	int fd;
++	void *handle;
++} ssd_fd_t;
++
++struct ssd_ops {
++	const char *name;
++	void (*init)(void);
++	void (*shutdown)(void);
++	bool (*init_device)(as_namespace *ns, drv_ssd *ssd, bool is_shadow);
++	void (*finish_device)(drv_ssd *ssd);
++	void *(*dma_alloc)(size_t sz);
++	void (*dma_free)(void *ptr);
++	ssd_fd_t (*open)(drv_ssd *ssd, bool is_shadow, int flags);
++	void (*close)(ssd_fd_t ssd_fd);
++	bool (*pread)(ssd_fd_t ssd_fd, void *buf, size_t size, off_t offset, bool bounce);
++	bool (*pwrite)(ssd_fd_t ssd_fd, void *buf, size_t size, off_t offset, bool bounce);
++};
++
+ 
+ //------------------------------------------------
+ // Per-namespace storage information.
+@@ -262,9 +285,6 @@ void ssd_cold_start_init_repl_state(struct as_namespace_s *ns, struct as_index_s
+ void ssd_cold_start_init_xdr_state(const struct as_flat_record_s* flat, struct as_index_s* r);
+ 
+ // Miscellaneous.
+-int ssd_fd_get(drv_ssd *ssd);
+-int ssd_shadow_fd_get(drv_ssd *ssd);
+-void ssd_fd_put(drv_ssd *ssd, int fd);
+ void ssd_header_init_cfg(const struct as_namespace_s *ns, drv_ssd* ssd, drv_header *header);
+ void ssd_header_validate_cfg(const struct as_namespace_s *ns, drv_ssd* ssd, const drv_header *header);
+ void ssd_flush_final_cfg(struct as_namespace_s *ns);
+diff --git a/as/include/storage/storage.h b/as/include/storage/storage.h
+index af1e0c7..51e3f66 100644
+--- a/as/include/storage/storage.h
++++ b/as/include/storage/storage.h
+@@ -68,8 +68,8 @@ typedef enum {
+ 
+ // Artificial limit on write-block-size, in case we ever move to an
+ // SSD_HEADER_SIZE that's too big to be a write-block size limit.
+-// MAX_WRITE_BLOCK_SIZE must be power of 2 and <= SSD_HEADER_SIZE.
+-#define MAX_WRITE_BLOCK_SIZE (8 * 1024 * 1024)
++// MAX_WRITE_BLOCK_SIZE must be power of 2
++#define MAX_WRITE_BLOCK_SIZE (128 * 1024 * 1024)
+ 
+ // Artificial limit on write-block-size, must be power of 2 and >= RBLOCK_SIZE.
+ #define MIN_WRITE_BLOCK_SIZE (1024 * 1)
+diff --git a/as/src/Makefile b/as/src/Makefile
+index 751c622..cdbf688 100644
+--- a/as/src/Makefile
++++ b/as/src/Makefile
+@@ -17,6 +17,18 @@ SYSTEMTAP_PROBES_H = $(GEN_DIR)/probes.h
+ SYSTEMTAP_PROBES_O = $(OBJECT_DIR)/probes.o
+ endif
+ 
++USE_SPDK = 0
++
++ifeq ($(USE_SPDK),1)
++CFLAGS +=	-DUSE_SPDK
++endif
++
++USE_LTHREAD = 0
++
++ifeq ($(USE_LTHREAD),1)
++CFLAGS +=	-DUSE_LTHREAD
++endif
++
+ ifeq ($(USE_EE),1)
+   include $(EEREPO)/as/make_in/Makefile.vars
+ endif
+@@ -133,6 +145,16 @@ else
+   endif
+ endif
+ 
++ifeq ($(USE_SPDK),1)
++  INCLUDES += -I$(SPDK)/include
++endif
++
++ifeq ($(USE_LTHREAD),1)
++  INCLUDES += -I$(SPDK)/dpdk/build/include
++  INCLUDES += -I$(LTHREAD)/
++  INCLUDES += -I$(LTHREAD)/arch/x86
++endif
++
+ AS_LIBRARIES += $(LIBRARY_DIR)/libcf.a
+ AS_LIBRARIES += $(LIBRARY_DIR)/libai.a
+ AS_LIBRARIES += $(COMMON)/target/$(PLATFORM)/lib/libaerospike-common.a
+@@ -171,6 +193,28 @@ else
+   LIBRARIES += -L$(JANSSON)/src/.libs -ljansson
+ endif
+ 
++ifeq ($(USE_SPDK),1)
++  SPDK_ROOT_DIR := $(SPDK)
++  OS = Linux
++  include $(SPDK_ROOT_DIR)/mk/config.mk
++  include $(CONFIG_ENV)/env.mk
++  include $(SPDK_ROOT_DIR)/mk/spdk.modules.mk
++  SPDK_LIB_LIST = $(filter-out sock_vpp,$(ALL_MODULES_LIST))
++  SPDK_LIB_LIST += bdev_ftl ftl bdev_aio bdev_virtio virtio
++  SPDK_LIB_LIST += thread util bdev conf accel rpc jsonrpc json log sock trace notify
++  SPDK_LIB_LIST += event event_bdev event_accel event_vmd
++  define spdk_lib_list_to_static_libs
++  $(1:%=$(SPDK_ROOT_DIR)/build/lib/libspdk_%.a)
++  endef
++  include $(SPDK_ROOT_DIR)/mk/spdk.app_vars.mk
++  LIBRARIES += -Wl,--whole-archive $(ENV_LIBS) $(SPDK)/dpdk/build/lib/librte_timer.a -Wl,--no-whole-archive
++  LIBRARIES += $(SPDK_LIB_LINKER_ARGS) $(ENV_LINKER_ARGS) -ldl -lrt -luuid -lcrypto -laio
++endif
++
++ifeq ($(USE_LTHREAD),1)
++  LIBRARIES += -L$(LTHREAD) -llthread
++endif
++
+ LIBRARIES += -L$(S2) -ls2 -ls2cellid -lgoogle-strings -lgoogle-base \
+ 			-lgoogle-util-coding -lgoogle-util-math \
+ 			$(shell curl-config --libs) -lstdc++
+diff --git a/as/src/base/batch.c b/as/src/base/batch.c
+index ff749fc..f6b0113 100644
+--- a/as/src/base/batch.c
++++ b/as/src/base/batch.c
+@@ -1059,7 +1059,7 @@ as_batch_queue_task(as_transaction* btr)
+ 		}
+ 
+ 		// Submit transaction.
+-		if (info != 0 && ns->storage_data_in_memory) {
++		if (info != 0 && (ns->storage_data_in_memory || ns->storage_allow_batch_inline)) {
+ 			as_tsvc_process_transaction(&tr);
+ 		}
+ 		else {
+diff --git a/as/src/base/cfg.c b/as/src/base/cfg.c
+index d4e579f..3cda0a5 100644
+--- a/as/src/base/cfg.c
++++ b/as/src/base/cfg.c
+@@ -293,9 +293,11 @@ typedef enum {
+ 	CASE_SERVICE_SCAN_MAX_DONE,
+ 	CASE_SERVICE_SCAN_THREADS_LIMIT,
+ 	CASE_SERVICE_SERVICE_THREADS,
++	CASE_SERVICE_SERVICE_LCORES,
+ 	CASE_SERVICE_SINDEX_BUILDER_THREADS,
+ 	CASE_SERVICE_SINDEX_GC_MAX_RATE,
+ 	CASE_SERVICE_SINDEX_GC_PERIOD,
++	CASE_SERVICE_SPDK_JSON_CONF,
+ 	CASE_SERVICE_STAY_QUIESCED,
+ 	CASE_SERVICE_TICKER_INTERVAL,
+ 	CASE_SERVICE_TRANSACTION_MAX_MS,
+@@ -569,6 +571,9 @@ typedef enum {
+ 	CASE_NAMESPACE_STORAGE_DEVICE_WRITE_BLOCK_SIZE,
+ 	CASE_NAMESPACE_STORAGE_DEVICE_DATA_IN_MEMORY,
+ 	// Normally hidden:
++	CASE_NAMESPACE_STORAGE_DEVICE_BACKEND,
++	CASE_NAMESPACE_STORAGE_DEVICE_RECYCLE_FDS,
++	CASE_NAMESPACE_STORAGE_DEVICE_ALLOW_BATCH_INLINE,
+ 	CASE_NAMESPACE_STORAGE_DEVICE_CACHE_REPLICA_WRITES,
+ 	CASE_NAMESPACE_STORAGE_DEVICE_COLD_START_EMPTY,
+ 	CASE_NAMESPACE_STORAGE_DEVICE_COMMIT_TO_DEVICE,
+@@ -801,9 +806,11 @@ const cfg_opt SERVICE_OPTS[] = {
+ 		{ "scan-max-done",					CASE_SERVICE_SCAN_MAX_DONE },
+ 		{ "scan-threads-limit",				CASE_SERVICE_SCAN_THREADS_LIMIT },
+ 		{ "service-threads",				CASE_SERVICE_SERVICE_THREADS },
++		{ "service-lcores",				CASE_SERVICE_SERVICE_LCORES },
+ 		{ "sindex-builder-threads",			CASE_SERVICE_SINDEX_BUILDER_THREADS },
+ 		{ "sindex-gc-max-rate",				CASE_SERVICE_SINDEX_GC_MAX_RATE },
+ 		{ "sindex-gc-period",				CASE_SERVICE_SINDEX_GC_PERIOD },
++		{ "spdk-json-conf",				CASE_SERVICE_SPDK_JSON_CONF },
+ 		{ "stay-quiesced",					CASE_SERVICE_STAY_QUIESCED },
+ 		{ "ticker-interval",				CASE_SERVICE_TICKER_INTERVAL },
+ 		{ "transaction-max-ms",				CASE_SERVICE_TRANSACTION_MAX_MS },
+@@ -1089,6 +1096,9 @@ const cfg_opt NAMESPACE_STORAGE_DEVICE_OPTS[] = {
+ 		{ "scheduler-mode",					CASE_NAMESPACE_STORAGE_DEVICE_SCHEDULER_MODE },
+ 		{ "write-block-size",				CASE_NAMESPACE_STORAGE_DEVICE_WRITE_BLOCK_SIZE },
+ 		{ "data-in-memory",					CASE_NAMESPACE_STORAGE_DEVICE_DATA_IN_MEMORY },
++		{ "device-backend",				CASE_NAMESPACE_STORAGE_DEVICE_BACKEND },
++		{ "recycle-fds",				CASE_NAMESPACE_STORAGE_DEVICE_RECYCLE_FDS },
++		{ "allow-batch-inline",				CASE_NAMESPACE_STORAGE_DEVICE_ALLOW_BATCH_INLINE },
+ 		{ "cache-replica-writes",			CASE_NAMESPACE_STORAGE_DEVICE_CACHE_REPLICA_WRITES },
+ 		{ "cold-start-empty",				CASE_NAMESPACE_STORAGE_DEVICE_COLD_START_EMPTY },
+ 		{ "commit-to-device",				CASE_NAMESPACE_STORAGE_DEVICE_COMMIT_TO_DEVICE },
+@@ -2322,6 +2332,9 @@ as_config_init(const char* config_file)
+ 			case CASE_SERVICE_SERVICE_THREADS:
+ 				c->n_service_threads = cfg_u32(&line, 1, MAX_SERVICE_THREADS);
+ 				break;
++			case CASE_SERVICE_SERVICE_LCORES:
++				c->service_lcores = cfg_strdup_no_checks(&line);
++				break;
+ 			case CASE_SERVICE_SINDEX_BUILDER_THREADS:
+ 				c->sindex_builder_threads = cfg_u32(&line, 1, MAX_SINDEX_BUILDER_THREADS);
+ 				break;
+@@ -2335,6 +2348,9 @@ as_config_init(const char* config_file)
+ 				cfg_enterprise_only(&line);
+ 				c->stay_quiesced = cfg_bool(&line);
+ 				break;
++			case CASE_SERVICE_SPDK_JSON_CONF:
++				c->spdk_json_conf = cfg_strdup_no_checks(&line);
++				break;
+ 			case CASE_SERVICE_TICKER_INTERVAL:
+ 				c->ticker_interval = cfg_u32_no_checks(&line);
+ 				break;
+@@ -3301,6 +3317,15 @@ as_config_init(const char* config_file)
+ 			case CASE_NAMESPACE_STORAGE_DEVICE_DATA_IN_MEMORY:
+ 				ns->storage_data_in_memory = cfg_bool(&line);
+ 				break;
++			case CASE_NAMESPACE_STORAGE_DEVICE_BACKEND:
++				ns->storage_device_backend = cfg_strdup_no_checks(&line);
++				break;
++			case CASE_NAMESPACE_STORAGE_DEVICE_RECYCLE_FDS:
++				ns->storage_recycle_fds = cfg_bool(&line);
++				break;
++			case CASE_NAMESPACE_STORAGE_DEVICE_ALLOW_BATCH_INLINE:
++				ns->storage_allow_batch_inline = cfg_bool(&line);
++				break;
+ 			case CASE_NAMESPACE_STORAGE_DEVICE_CACHE_REPLICA_WRITES:
+ 				ns->storage_cache_replica_writes = cfg_bool(&line);
+ 				break;
+diff --git a/as/src/base/namespace.c b/as/src/base/namespace.c
+index 804242f..e663dda 100644
+--- a/as/src/base/namespace.c
++++ b/as/src/base/namespace.c
+@@ -142,6 +142,8 @@ as_namespace_create(char *name)
+ 	// Note - default true is consistent with AS_STORAGE_ENGINE_MEMORY, but
+ 	// cfg.c will set default false for AS_STORAGE_ENGINE_SSD.
+ 
++	ns->storage_device_backend = "posix";
++	ns->storage_recycle_fds = true;
+ 	ns->storage_scheduler_mode = NULL; // null indicates default is to not change scheduler mode
+ 	ns->storage_write_block_size = 1024 * 1024;
+ 	ns->storage_defrag_lwm_pct = 50; // defrag if occupancy of block is < 50%
+diff --git a/as/src/base/service.c b/as/src/base/service.c
+index e195058..0769db6 100644
+--- a/as/src/base/service.c
++++ b/as/src/base/service.c
+@@ -36,6 +36,16 @@
+ #include <sys/time.h>
+ #include <unistd.h>
+ #include <zlib.h>
++#ifdef USE_LTHREAD
++#include <rte_common.h>
++#include <rte_lcore.h>
++#include <rte_per_lcore.h>
++#include <rte_errno.h>
++#include <rte_timer.h>
++
++#include "lthread_api.h"
++#include "lthread_diag_api.h"
++#endif
+ 
+ #include "aerospike/as_atomic.h"
+ #include "citrusleaf/alloc.h"
+@@ -154,6 +164,162 @@ rearm(as_file_handle* fd_h, uint32_t events)
+ 			events | EPOLLONESHOT | EPOLLRDHUP, fd_h);
+ }
+ 
++#ifdef USE_LTHREAD
++
++static void
++create_service_lthread(struct lthread **lt, uint32_t sid)
++{
++	thread_ctx* ctx = cf_malloc(sizeof(thread_ctx));
++
++	cf_detail(AS_SERVICE, "starting sid %u ctx %p", sid, ctx);
++
++	if (as_config_is_cpu_pinned()) {
++		ctx->i_cpu = (cf_topo_cpu_index)(sid % cf_topo_count_cpus());
++	}
++
++	ctx->lock = &g_thread_locks[sid];
++	cf_poll_create(&ctx->poll);
++	cf_epoll_queue_init(&ctx->trans_q, AS_TRANSACTION_HEAD_SIZE, 64);
++
++	lthread_create(lt, -1, run_service, ctx);
++
++	cf_mutex_lock(&g_thread_locks[sid]);
++
++	g_thread_ctxs[sid] = ctx;
++
++	cf_mutex_unlock(&g_thread_locks[sid]);
++}
++
++struct init_data {
++	uint32_t sid;
++	uint32_t lcores;
++};
++
++static void *initial_lthread(void *args)
++{
++	struct init_data *data = args;
++	uint32_t n_threads = g_config.n_service_threads / data->lcores;
++	struct lthread *lt[n_threads];
++
++	for (uint32_t i = 0; i < n_threads; i++) {
++		create_service_lthread(&lt[i], data->sid + data->lcores * i);
++	}
++
++	for (uint32_t i = 0; i < n_threads; i++) {
++		lthread_join(lt[i], NULL);
++	}
++
++	lthread_scheduler_shutdown(rte_lcore_id());
++	lthread_detach();
++
++	return NULL;
++}
++
++struct sched_data {
++	cf_atomic32 id;
++	uint32_t lcores;
++};
++
++static int
++lthread_scheduler(void *args)
++{
++	struct sched_data *sched_data = args;
++	struct init_data init_data;
++	struct lthread *lt;
++
++	init_data.lcores = sched_data->lcores;
++	init_data.sid = (uint32_t)cf_atomic32_incr(&sched_data->id);
++
++	lthread_create(&lt, -1, initial_lthread, &init_data);
++	lthread_run();
++
++	return 0;
++}
++
++static void *run_lthreads(void *arg __attribute__((unused)))
++{
++	char *args[3]; // program name plus optional "--lcores <list>"
++	int argc = 0;
++
++	args[argc++] = "asd_lthread";
++	if (g_config.service_lcores) {
++		args[argc++] = "--lcores";
++		args[argc++] = g_config.service_lcores;
++	}
++
++	int ret = rte_eal_init(argc, args); // array decays to char **; no cast needed
++
++	if (ret < 0) {
++		if (rte_errno == EALREADY) {
++			as_monitor_notify(&as_service_run_monitor); // EAL was initialized elsewhere; presumably the waiter starts the threads - TODO confirm
++		} else {
++			cf_crash(AS_SERVICE, "Invalid EAL parameters");
++		}
++	} else {
++		as_service_run_threads();
++	}
++
++	return NULL;
++}
++
++as_monitor as_service_run_monitor;
++
++void
++as_service_run_threads(void)
++{
++	struct sched_data sched_data;
++
++	int ret = rte_timer_subsystem_init();
++	if (ret < 0)
++		cf_crash(AS_SERVICE, "Failed to initialize timer subsystem");
++
++	unsigned lcore_id;
++
++	cf_atomic32_set(&sched_data.id, UINT32_MAX);
++	sched_data.lcores = 0;
++
++	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
++		if (rte_lcore_is_enabled(lcore_id))
++			sched_data.lcores++;
++	}
++
++	if (g_config.n_service_threads % sched_data.lcores) {
++		cf_crash_nostack(AS_SERVICE, "'service-threads' must be a multiple of the number of lcores (%u)",
++				sched_data.lcores);
++	}
++
++	lthread_num_schedulers_set((int)sched_data.lcores);
++	rte_eal_mp_remote_launch(lthread_scheduler, &sched_data, CALL_MAIN);
++
++	RTE_LCORE_FOREACH_WORKER(lcore_id) {
++		rte_eal_wait_lcore(lcore_id);
++	}
++}
++
++static void create_service_threads(void)
++{
++	cf_thread_create_detached(run_lthreads, NULL);
++}
++
++static void service_yield(void)
++{
++	lthread_yield(); // called from the run_service() poll loop in this build variant
++}
++
++#else
++
++static void create_service_threads(void)
++{
++	for (uint32_t i = 0; i < g_config.n_service_threads; i++) { // one call per configured service thread
++		create_service_thread(i);
++	}
++}
++
++static void service_yield(void)
++{ // deliberately a no-op in this (#else) build variant
++}
++
++#endif
+ 
+ //==========================================================
+ // Public API.
+@@ -171,9 +337,7 @@ as_service_init(void)
+ 		cf_mutex_init(&g_thread_locks[i]);
+ 	}
+ 
+-	for (uint32_t i = 0; i < g_config.n_service_threads; i++) {
+-		create_service_thread(i);
+-	}
++	create_service_threads();
+ }
+ 
+ void
+@@ -613,11 +777,21 @@ run_service(void* udata)
+ 	as_xdr_init_poll(poll);
+ 
+ 	while (true) {
++#ifdef USE_LTHREAD
++		int timeout = 0;
++#else
++		int timeout = -1;
++#endif
+ 		cf_poll_event events[N_EVENTS];
+-		int32_t n_events = cf_poll_wait(poll, events, N_EVENTS, -1);
++		int32_t n_events = cf_poll_wait(poll, events, N_EVENTS, timeout);
+ 
+ 		cf_assert(n_events >= 0, AS_SERVICE, "unexpected EINTR");
+ 
++		if (n_events == 0) {
++			service_yield();
++			continue;
++		}
++
+ 		for (uint32_t i = 0; i < (uint32_t)n_events; i++) {
+ 			uint32_t mask = events[i].events;
+ 			void* data = events[i].data;
+@@ -693,6 +867,8 @@ run_service(void* udata)
+ 			// the transaction. We'll rearm at the end of the transaction.
+ 			start_transaction(fd_h);
+ 		}
++
++		service_yield();
+ 	}
+ 
+ 	return NULL;
+diff --git a/as/src/base/thr_info.c b/as/src/base/thr_info.c
+index 5b03580..053bd27 100644
+--- a/as/src/base/thr_info.c
++++ b/as/src/base/thr_info.c
+@@ -5360,8 +5360,13 @@ info_get_sindexes(char *name, cf_dyn_buf *db)
+ }
+ 
+ static int32_t
+-oldest_nvme_age(const char *path)
++oldest_nvme_age(as_namespace *ns, const char *path)
+ {
++	if (strcmp(ns->storage_device_backend, "posix")) {
++		cf_detail(AS_INFO, "device info is not supported by %s: %s", ns->storage_device_backend, path);
++		return -1;
++	}
++
+ 	cf_storage_device_info *info = cf_storage_get_device_info(path);
+ 
+ 	if (info == NULL) {
+@@ -5384,7 +5389,7 @@ add_index_device_stats(as_namespace *ns, cf_dyn_buf *db)
+ {
+ 	for (uint32_t i = 0; i < ns->n_xmem_mounts; i++) {
+ 		info_append_indexed_int(db, "index-type.mount", i, "age",
+-				oldest_nvme_age(ns->xmem_mounts[i]));
++				oldest_nvme_age(ns, ns->xmem_mounts[i]));
+ 	}
+ }
+ 
+@@ -5412,7 +5417,7 @@ add_data_device_stats(as_namespace *ns, cf_dyn_buf *db)
+ 		info_append_indexed_uint32(db, tag, i, "shadow_write_q", stats.shadow_write_q_sz);
+ 
+ 		info_append_indexed_int(db, tag, i, "age",
+-				oldest_nvme_age(ns->storage_devices[i]));
++				oldest_nvme_age(ns, ns->storage_devices[i]));
+ 	}
+ }
+ 
+diff --git a/as/src/storage/drv_ssd.c b/as/src/storage/drv_ssd.c
+index 6da5830..e03f835 100644
+--- a/as/src/storage/drv_ssd.c
++++ b/as/src/storage/drv_ssd.c
+@@ -29,6 +29,7 @@
+ 
+ #include <fcntl.h>
+ #include <errno.h>
++#include <pthread.h>
+ #include <stdbool.h>
+ #include <stddef.h>
+ #include <stdint.h>
+@@ -69,6 +70,915 @@
+ #include "storage/storage.h"
+ #include "transaction/rw_utils.h"
+ 
++#ifdef USE_SPDK
++
++#include "spdk/bdev.h"
++#include "spdk/conf.h"
++#include "spdk/env.h"
++#include "spdk/thread.h"
++
++#ifdef USE_LTHREAD
++
++#include "spdk_internal/event.h"
++#include "base/service.h"
++#include "lthread_api.h"
++
++#define EXTRA_SPDK_THREADS 4 /* for write and defrag threads */
++#else
++
++#include "spdk/event.h"
++
++#endif
++
++struct ssd_spdk_thread { /* one slot of ssd_spdk_priv.threads[] */
++	struct spdk_thread *thread; /* created in ssd_spdk_init_device(); stays NULL for slots that get no thread */
++#ifdef USE_LTHREAD
++	struct spdk_io_channel *ch; /* cached channel for priv->desc, see ssd_spdk_get_io_channel() */
++	struct spdk_io_channel *shadow_ch; /* cached channel for priv->shadow_desc */
++	struct ssd_spdk_priv *priv; /* back-pointer to the owning device state */
++	bool used; /* slot claimed by a non-lthread caller; guarded by priv->mutex */
++#else
++	pthread_mutex_t mutex; /* held around spdk_thread_send_msg() to this thread */
++#endif
++};
++
++struct ssd_spdk_priv { /* per-device SPDK state, stored in drv_ssd.priv */
++	struct spdk_bdev_desc *desc; /* descriptor opened for ssd->name */
++	struct spdk_bdev_desc *shadow_desc; /* descriptor opened for ssd->shadow_name */
++#ifdef USE_LTHREAD
++	bool done; /* set by ssd_spdk_init_device_start(), polled by ssd_spdk_init_device() */
++#else
++	int rr; /* counter used by ssd_spdk_open() to pick a thread slot round-robin */
++#endif
++	pthread_cond_t cond; /* lthread build: a slot was released; otherwise: device open finished */
++	pthread_mutex_t mutex; /* pairs with cond */
++	TAILQ_ENTRY(ssd_spdk_priv) link; /* membership in ssd_spdk_data->devices */
++	struct ssd_spdk_thread *threads; /* slot array allocated in ssd_spdk_init_device() */
++};
++
++struct ssd_spdk_data { /* process-wide SPDK state, allocated by ssd_spdk_init() */
++	cf_tid tid; /* thread running run_ssd_spdk_start() */
++	TAILQ_HEAD(, ssd_spdk_priv) devices; /* devices registered by ssd_spdk_init_device() */
++	pthread_cond_t cond; /* signaled once SPDK start-up has finished */
++	pthread_mutex_t mutex; /* pairs with cond */
++#ifdef USE_LTHREAD
++	bool done; /* set by ssd_spdk_bdev_init_done() */
++#endif
++};
++
++static struct ssd_spdk_data *ssd_spdk_data; /* NULL until ssd_spdk_init() allocates it */
++
++/* Allocate sz bytes of SPDK DMA memory aligned to HI_IO_MIN_SIZE; crashes on failure. */
++static void *ssd_spdk_dma_alloc(size_t sz)
++{
++	void *dma_buf = spdk_dma_malloc(sz, HI_IO_MIN_SIZE, NULL);
++
++	if (dma_buf == NULL) {
++		cf_crash(AS_DRV_SSD, "failed to allocate dma buffer");
++	}
++	return dma_buf;
++}
++
++/* Release a buffer obtained from ssd_spdk_dma_alloc(). */
++static void ssd_spdk_dma_free(void *ptr)
++{
++	spdk_dma_free(ptr);
++}
++
++extern uint64_t
++check_file_size(as_namespace *ns, uint64_t file_size, const char *tag);
++
++/* Event callback handed to spdk_bdev_open_ext(): every event is only logged as unsupported. */
++static void ssd_spdk_bdev_event_cb(enum spdk_bdev_event_type type,
++		struct spdk_bdev *bdev, void *event_ctx)
++{
++	cf_warning(AS_DRV_SSD, "Unsupported bdev event: type %d", type);
++}
++
++/* SPDK-thread message handler: open the main and shadow bdevs (once each), then wake the waiter. */
++static void ssd_spdk_init_device_start(void *ctx)
++{
++	drv_ssd *ssd = ctx;
++	struct ssd_spdk_priv *priv = ssd->priv;
++
++	/* Main device - skipped when its descriptor is already open. */
++	if (ssd->name && !priv->desc) {
++		if (spdk_bdev_open_ext(ssd->name, true, ssd_spdk_bdev_event_cb, NULL,
++				&priv->desc) != 0) {
++			cf_crash(AS_DRV_SSD, "unable to open bdev %s", ssd->name);
++		}
++	}
++
++	/* Shadow device - same rule. */
++	if (ssd->shadow_name && !priv->shadow_desc) {
++		if (spdk_bdev_open_ext(ssd->shadow_name, true, ssd_spdk_bdev_event_cb, NULL,
++				&priv->shadow_desc) != 0) {
++			cf_crash(AS_DRV_SSD, "unable to open bdev %s", ssd->shadow_name);
++		}
++	}
++
++	/* Tell ssd_spdk_init_device() that the open attempt has finished. */
++#ifdef USE_LTHREAD
++	priv->done = true;
++#else
++	pthread_mutex_lock(&priv->mutex);
++	pthread_cond_signal(&priv->cond);
++	pthread_mutex_unlock(&priv->mutex);
++#endif
++}
++
++static void
++ssd_empty_header(drv_ssd *ssd, bool is_shadow);
++
++/* Backend init_device hook: set up per-device SPDK state on first use, open the bdev, size the device. */
++static bool ssd_spdk_init_device(as_namespace *ns, drv_ssd *ssd, bool is_shadow)
++{
++	const char *name = is_shadow ? ssd->shadow_name : ssd->name;
++	struct spdk_bdev *bdev;
++	uint64_t io_min_size;
++	uint64_t size;
++	struct ssd_spdk_priv *priv;
++	struct ssd_spdk_thread *thread;
++
++	bdev = spdk_bdev_get_by_name(name);
++	if (!bdev)
++		cf_crash(AS_DRV_SSD, "unable to find bdev with name %s", name);
++
++	priv = ssd->priv;
++	if (!priv) {
++		int n;
++
++		priv = cf_malloc(sizeof(*priv));
++
++#ifdef USE_LTHREAD
++		n = RTE_MAX_LCORE + EXTRA_SPDK_THREADS;
++#else
++		n = spdk_env_get_core_count();
++		priv->rr = 0; /* priv comes from cf_malloc(); ssd_spdk_open() reads this counter */
++#endif
++		pthread_mutex_init(&priv->mutex, NULL);
++		pthread_cond_init(&priv->cond, NULL);
++		priv->desc = NULL;
++		priv->shadow_desc = NULL;
++
++		TAILQ_INSERT_TAIL(&ssd_spdk_data->devices, priv, link);
++
++		priv->threads = cf_calloc(n, sizeof(priv->threads[0]));
++		for (int i = 0; i < n; i++) {
++			thread = &priv->threads[i];
++
++			if ((i >= RTE_MAX_LCORE) || (i < spdk_env_get_core_count())) {
++				thread->thread = spdk_thread_create("aerospike_spdk_rw", NULL);
++				if (!thread->thread)
++					cf_crash(AS_DRV_SSD, "failed to allocate spdk thread");
++			}
++#ifdef USE_LTHREAD
++			thread->priv = priv;
++#else
++			pthread_mutex_init(&thread->mutex, NULL);
++#endif
++		}
++
++		ssd->priv = priv;
++	}
++
++#ifdef USE_LTHREAD
++	thread = &priv->threads[RTE_MAX_LCORE];
++	priv->done = false; /* re-armed on every call: the shadow init reuses the same priv */
++	spdk_set_thread(thread->thread);
++	spdk_thread_send_msg(thread->thread, ssd_spdk_init_device_start, ssd);
++	do {
++		spdk_thread_poll(thread->thread, 0, 0);
++	} while (!priv->done);
++#else
++	thread = &priv->threads[0];
++
++	pthread_mutex_lock(&priv->mutex);
++	spdk_thread_send_msg(thread->thread, ssd_spdk_init_device_start, ssd);
++	pthread_cond_wait(&priv->cond, &priv->mutex);
++	pthread_mutex_unlock(&priv->mutex);
++#endif
++
++	io_min_size = spdk_bdev_get_block_size(bdev);
++	size = spdk_bdev_get_num_blocks(bdev) * io_min_size;
++
++	if (ns->storage_filesize)
++		size = MIN(ns->storage_filesize, size);
++	size = check_file_size(ns, size, "usable SPDK device");
++
++	if (!is_shadow) {
++		ssd->file_size = size;
++		ssd->io_min_size = io_min_size;
++		if (ns->cold_start && ns->storage_cold_start_empty) {
++			ssd_empty_header(ssd, false);
++
++			cf_info(AS_DRV_SSD, "cold-start-empty - erased header of %s", name);
++		}
++
++		ns->drive_size += ssd->file_size; // increment total storage size
++
++		cf_info(AS_DRV_SSD, "opened device %s: usable size %lu, io-min-size %lu",
++				name, ssd->file_size, ssd->io_min_size);
++
++	} else {
++		if (size < ssd->file_size) {
++			cf_crash(AS_DRV_SSD, "shadow device %s is smaller than main device - %lu < %lu",
++					ssd->shadow_name, size, ssd->file_size);
++		}
++
++		ssd->shadow_io_min_size = io_min_size;
++
++		if (ns->cold_start && ns->storage_cold_start_empty) {
++			ssd_empty_header(ssd, true);
++
++			cf_info(AS_DRV_SSD, "cold-start-empty - erased header of %s", name);
++		}
++
++		cf_info(AS_DRV_SSD, "shadow device %s is compatible with main device, shadow-io-min-size %lu",
++				name, ssd->shadow_io_min_size);
++	}
++
++	return true;
++}
++
++static void ssd_spdk_close(ssd_fd_t ssd_fd);
++
++/* Tear down a device's SPDK state: close pooled handles and descriptors, exit its SPDK threads, free priv. */
++static void ssd_spdk_finish_device(drv_ssd *ssd)
++{
++	struct ssd_spdk_priv *priv = ssd->priv;
++	ssd_fd_t fd;
++
++	if (priv == NULL)
++		return; /* no SPDK state attached to this device */
++
++	TAILQ_REMOVE(&ssd_spdk_data->devices, priv, link);
++
++	/* Pooled handles are closed before the descriptor they were opened on. */
++	if (priv->desc) {
++		while (cf_queue_pop(ssd->fd_q, &fd, CF_QUEUE_NOWAIT) == CF_QUEUE_OK)
++			ssd_spdk_close(fd);
++		spdk_bdev_close(priv->desc);
++	}
++
++	if (priv->shadow_desc) {
++		while (cf_queue_pop(ssd->shadow_fd_q, &fd, CF_QUEUE_NOWAIT) == CF_QUEUE_OK)
++			ssd_spdk_close(fd);
++		spdk_bdev_close(priv->shadow_desc);
++	}
++
++	/* Same slot count that ssd_spdk_init_device() allocated. */
++#ifdef USE_LTHREAD
++	const int n_slots = RTE_MAX_LCORE + EXTRA_SPDK_THREADS;
++#else
++	const int n_slots = spdk_env_get_core_count();
++#endif
++	for (int i = 0; i < n_slots; i++) {
++		struct spdk_thread *t = priv->threads[i].thread;
++
++		if (t)
++			spdk_thread_exit(t);
++	}
++	cf_free(priv->threads);
++	cf_free(priv);
++	ssd->priv = NULL;
++}
++
++enum ssd_spdk_rw { /* operation requested from ssd_spdk_prw_start() */
++	SSD_SPDK_READ, /* read straight into the caller's buffer */
++	SSD_SPDK_READ_BOUNCE, /* read into the handle's bounce buffer, then memcpy to the caller's buffer */
++	SSD_SPDK_WRITE, /* write straight from the caller's buffer */
++	SSD_SPDK_WRITE_BOUNCE, /* memcpy the caller's buffer into the bounce buffer, then write that */
++};
++
++struct ssd_spdk_fd { /* I/O handle created by ssd_spdk_open(), carried in ssd_fd_t.handle */
++	struct spdk_bdev_desc *desc; /* main or shadow bdev descriptor this handle does I/O on */
++	struct ssd_spdk_thread *thread; /* SPDK thread slot used for the I/O */
++#ifdef USE_LTHREAD
++#else
++	struct spdk_io_channel *ch; /* fetched lazily in ssd_spdk_prw_start(), released in ssd_spdk_close_start() */
++#endif
++
++	void *bounce; /* DMA bounce buffer; only meaningful while bounce_size != 0 */
++	size_t bounce_size; /* current bounce buffer size, 0 when none is allocated */
++
++	enum ssd_spdk_rw rw; /* rw/buf/size/offset describe the current request, set by ssd_spdk_prw() */
++	void *buf;
++	size_t size;
++	off_t offset;
++
++	bool success; /* result of the current request, returned by ssd_spdk_prw() */
++#ifdef USE_LTHREAD
++	bool done; /* completion flag polled by the submitting caller */
++	struct ssd_spdk_priv *priv; /* owning device state */
++#else
++	pthread_cond_t cond; /* signaled on completion; waited on with 'mutex' held */
++	pthread_mutex_t mutex;
++#endif
++};
++
++#ifdef USE_LTHREAD
++
++/*
++ * Pick the SPDK thread slot for the caller and make it the current SPDK
++ * thread: lthreads use the slot indexed by their core, other threads claim one
++ * of the extra slots (blocking until one is free). Pair with ssd_spdk_thread_put().
++ */
++static struct ssd_spdk_thread *ssd_spdk_thread_get(struct ssd_spdk_priv *priv)
++{
++	struct ssd_spdk_thread *slot = NULL;
++
++	if (lthread_current()) {
++		slot = &priv->threads[spdk_env_get_current_core()];
++	} else {
++		pthread_mutex_lock(&priv->mutex);
++		while (slot == NULL) {
++			for (int i = 0; i < EXTRA_SPDK_THREADS; i++) {
++				if (!priv->threads[RTE_MAX_LCORE + i].used) {
++					slot = &priv->threads[RTE_MAX_LCORE + i];
++					break;
++				}
++			}
++			if (slot == NULL)
++				pthread_cond_wait(&priv->cond, &priv->mutex);
++		}
++		slot->used = true;
++		pthread_mutex_unlock(&priv->mutex);
++	}
++
++	if (slot->thread == NULL)
++		cf_crash(AS_DRV_SSD, "Invalid spdk thread context");
++
++	spdk_set_thread(slot->thread);
++	return slot;
++}
++
++/*
++ * Return this slot's I/O channel for desc, which must be the owning device's
++ * main or shadow descriptor (anything else crashes). The channel is fetched
++ * from SPDK on first use and then cached in the slot.
++ */
++static struct spdk_io_channel *
++ssd_spdk_get_io_channel(struct ssd_spdk_thread *thread, struct spdk_bdev_desc *desc)
++{
++	struct ssd_spdk_priv *priv = thread->priv;
++	struct spdk_io_channel **cached = NULL;
++
++	if (desc == priv->desc) {
++		cached = &thread->ch;
++	} else if (desc == priv->shadow_desc) {
++		cached = &thread->shadow_ch;
++	} else {
++		cf_crash(AS_DRV_SSD, "Invalid spdk_bdev_desc specified");
++	}
++
++	/* A NULL result is stored as well, so a failed lookup is retried next call. */
++	if (*cached == NULL) {
++		*cached = spdk_bdev_get_io_channel(desc);
++	}
++
++	return *cached;
++}
++
++/* Undo ssd_spdk_thread_get(): non-lthread callers free their extra slot; always clears the current SPDK thread. */
++static void ssd_spdk_thread_put(struct ssd_spdk_priv *priv, struct ssd_spdk_thread *thread)
++{
++	if (!lthread_current()) {
++		pthread_mutex_lock(&priv->mutex);
++		thread->used = false;
++		pthread_cond_signal(&priv->cond); /* wakes a waiter in ssd_spdk_thread_get() */
++		pthread_mutex_unlock(&priv->mutex);
++	}
++	spdk_set_thread(NULL);
++}
++
++#endif
++
++/* Create an I/O handle for the main or shadow bdev; 'flags' is not used by this backend. */
++static ssd_fd_t ssd_spdk_open(drv_ssd *ssd, bool is_shadow, int flags)
++{
++	struct ssd_spdk_priv *priv = ssd->priv;
++	struct ssd_spdk_fd *handle = cf_malloc(sizeof(*handle));
++	ssd_fd_t ssd_fd = { .fd = -1 };
++
++	if (is_shadow)
++		handle->desc = priv->shadow_desc;
++	else
++		handle->desc = priv->desc;
++	handle->bounce_size = 0; /* no bounce buffer allocated yet */
++
++#ifdef USE_LTHREAD
++	handle->priv = priv;
++#else
++	/* Bind the handle to one SPDK thread slot, chosen round-robin. */
++	handle->thread = &priv->threads[(priv->rr++) % spdk_env_get_core_count()];
++	handle->ch = NULL;
++
++	pthread_mutex_init(&handle->mutex, NULL);
++	pthread_cond_init(&handle->cond, NULL);
++#endif
++
++	ssd_fd.handle = handle;
++	return ssd_fd;
++}
++
++/* SPDK-thread message handler: release the handle's SPDK resources, then wake ssd_spdk_close(). */
++static void ssd_spdk_close_start(void *ctx)
++{
++	struct ssd_spdk_fd *handle = ctx;
++
++	if (handle->bounce_size != 0)
++		spdk_dma_free(handle->bounce);
++
++#ifdef USE_LTHREAD
++	handle->done = true;
++#else
++	if (handle->ch != NULL)
++		spdk_put_io_channel(handle->ch);
++	pthread_mutex_lock(&handle->mutex);
++	pthread_cond_signal(&handle->cond);
++	pthread_mutex_unlock(&handle->mutex);
++#endif
++}
++
++/* Release a handle from ssd_spdk_open(): run ssd_spdk_close_start() on an SPDK thread, wait for it, free. */
++static void ssd_spdk_close(ssd_fd_t ssd_fd)
++{
++	struct ssd_spdk_fd *handle = ssd_fd.handle;
++
++#ifdef USE_LTHREAD
++	handle->done = false;
++	struct ssd_spdk_thread *slot = ssd_spdk_thread_get(handle->priv);
++	spdk_thread_send_msg(slot->thread, ssd_spdk_close_start, handle);
++	/* The caller polls the SPDK thread itself, yielding after each poll. */
++	for (;;) {
++		spdk_thread_poll(slot->thread, 0, 0);
++		if (lthread_current())
++			lthread_yield();
++		else
++			sched_yield();
++		if (handle->done)
++			break;
++	}
++	ssd_spdk_thread_put(handle->priv, slot);
++#else
++	/* handle->mutex is taken before the send; ssd_spdk_close_start() locks it before signaling. */
++	pthread_mutex_lock(&handle->mutex);
++
++	pthread_mutex_lock(&handle->thread->mutex);
++	spdk_thread_send_msg(handle->thread->thread, ssd_spdk_close_start, handle);
++	pthread_mutex_unlock(&handle->thread->mutex);
++
++	pthread_cond_wait(&handle->cond, &handle->mutex);
++	pthread_mutex_unlock(&handle->mutex);
++#endif
++	cf_free(handle);
++}
++
++/* bdev completion callback: finish a bounce read, publish the result to the waiter, free the bdev_io. */
++static void ssd_spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
++{
++	struct ssd_spdk_fd *handle = cb_arg;
++
++	/* Bounce reads land in handle->bounce; copy them out to the caller's buffer. */
++	if (handle->rw == SSD_SPDK_READ_BOUNCE)
++		memcpy(handle->buf, handle->bounce, handle->size);
++
++#ifdef USE_LTHREAD
++	handle->success = success;
++	handle->done = true;
++#else
++	pthread_mutex_lock(&handle->mutex);
++	handle->success = success;
++	pthread_cond_signal(&handle->cond);
++	pthread_mutex_unlock(&handle->mutex);
++#endif
++	spdk_bdev_free_io(bdev_io);
++}
++
++/* SPDK-thread message handler: submit the handle's read/write; a failed submit is reported as a failed I/O. */
++static void ssd_spdk_prw_start(void *ctx)
++{
++	struct ssd_spdk_fd *handle = ctx;
++	void *buf = handle->buf;
++	struct spdk_io_channel *ch;
++	int rc = -1; /* stays non-zero if handle->rw is not a known operation */
++
++#ifdef USE_LTHREAD
++	ch = ssd_spdk_get_io_channel(handle->thread, handle->desc);
++#else
++	if (!handle->ch)
++		handle->ch = spdk_bdev_get_io_channel(handle->desc);
++	ch = handle->ch;
++#endif
++	if (!ch)
++		goto fail;
++
++	if (handle->rw == SSD_SPDK_READ_BOUNCE || handle->rw == SSD_SPDK_WRITE_BOUNCE) {
++		if (handle->bounce_size < handle->size) {
++			if (handle->bounce_size)
++				spdk_dma_free(handle->bounce);
++			handle->bounce = ssd_spdk_dma_alloc(handle->size);
++			handle->bounce_size = handle->size;
++		}
++		if (handle->rw == SSD_SPDK_WRITE_BOUNCE)
++			memcpy(handle->bounce, handle->buf, handle->size);
++		buf = handle->bounce;
++	}
++
++	switch (handle->rw) {
++	case SSD_SPDK_WRITE:
++	case SSD_SPDK_WRITE_BOUNCE:
++		rc = spdk_bdev_write(handle->desc, ch, buf, handle->offset, handle->size,
++				ssd_spdk_bdev_io_complete, handle);
++		break;
++	case SSD_SPDK_READ:
++	case SSD_SPDK_READ_BOUNCE:
++		rc = spdk_bdev_read(handle->desc, ch, buf, handle->offset, handle->size,
++				ssd_spdk_bdev_io_complete, handle);
++		break;
++	}
++	/* A successful submit ends in ssd_spdk_bdev_io_complete(); a failed one never calls it. */
++	if (rc == 0)
++		return;
++fail:
++#ifdef USE_LTHREAD
++	handle->success = false;
++	handle->done = true;
++#else
++	pthread_mutex_lock(&handle->mutex);
++	handle->success = false;
++	pthread_cond_signal(&handle->cond);
++	pthread_mutex_unlock(&handle->mutex);
++#endif
++}
++
++/* Run one synchronous read or write through an SPDK thread; returns whether the I/O succeeded. */
++static bool ssd_spdk_prw(ssd_fd_t ssd_fd, void *buf, size_t size, off_t offset, enum ssd_spdk_rw rw)
++{
++	struct ssd_spdk_fd *handle = ssd_fd.handle;
++
++#ifdef USE_LTHREAD
++	handle->done = false;
++#else
++	pthread_mutex_lock(&handle->mutex);
++#endif
++
++	handle->rw = rw;
++	handle->buf = buf;
++	handle->size = size;
++	handle->offset = offset;
++	handle->success = false;
++
++#ifdef USE_LTHREAD
++	struct ssd_spdk_thread *slot = ssd_spdk_thread_get(handle->priv);
++	handle->thread = slot; /* read by ssd_spdk_prw_start() */
++	spdk_thread_send_msg(slot->thread, ssd_spdk_prw_start, handle);
++
++	for (;;) {
++		spdk_thread_poll(slot->thread, 0, 0);
++		if (lthread_current())
++			lthread_yield();
++		else
++			sched_yield();
++		if (handle->done)
++			break;
++	}
++	ssd_spdk_thread_put(handle->priv, slot);
++	handle->thread = NULL;
++#else
++	pthread_mutex_lock(&handle->thread->mutex);
++	spdk_thread_send_msg(handle->thread->thread, ssd_spdk_prw_start, handle);
++	pthread_mutex_unlock(&handle->thread->mutex);
++
++	pthread_cond_wait(&handle->cond, &handle->mutex);
++	pthread_mutex_unlock(&handle->mutex);
++#endif
++	return handle->success;
++}
++
++/* Backend pread hook: 'bounce' selects a read through the handle's DMA bounce buffer. */
++static bool ssd_spdk_pread(ssd_fd_t ssd_fd, void *buf, size_t size, off_t offset, bool bounce)
++{
++	return ssd_spdk_prw(ssd_fd, buf, size, offset, bounce ? SSD_SPDK_READ_BOUNCE : SSD_SPDK_READ);
++}
++
++/* Backend pwrite hook: 'bounce' selects a write staged through the handle's DMA bounce buffer. */
++static bool ssd_spdk_pwrite(ssd_fd_t ssd_fd, void *buf, size_t size, off_t offset, bool bounce)
++{
++	return ssd_spdk_prw(ssd_fd, buf, size, offset, bounce ? SSD_SPDK_WRITE_BOUNCE : SSD_SPDK_WRITE);
++}
++
++#ifndef USE_LTHREAD
++
++/* Start callback given to spdk_app_start(): signals the condition ssd_spdk_init() is waiting on. */
++static void ssd_spdk_start(void *arg)
++{
++	struct ssd_spdk_data *data = arg;
++
++	pthread_mutex_lock(&data->mutex);
++	pthread_cond_signal(&data->cond);
++	pthread_mutex_unlock(&data->mutex);
++}
++
++extern void as_sig_handle_term(int sig_num, siginfo_t *info, void *ctx);
++
++/* SPDK app shutdown callback: forwards to the server's SIGTERM handler. */
++static void ssd_spdk_shutdown_cb(void)
++{
++	as_sig_handle_term(SIGTERM, NULL, NULL);
++}
++
++/* Thread body (non-lthread build): configure the SPDK application and run it via spdk_app_start(). */
++static void *run_ssd_spdk_start(void *arg)
++{
++	struct ssd_spdk_data *data = arg;
++	struct spdk_app_opts opts = {};
++
++	cf_info(AS_DRV_SSD, "spdk conf is %s", g_config.spdk_json_conf);
++
++	spdk_app_opts_init(&opts, sizeof(opts));
++	opts.name = "aerospike";
++	opts.json_config_file = g_config.spdk_json_conf;
++	opts.shutdown_cb = ssd_spdk_shutdown_cb;
++	if (g_config.service_lcores) {
++		opts.reactor_mask = g_config.service_lcores;
++	}
++
++	/* ssd_spdk_start() is the start callback; it releases the waiting ssd_spdk_init(). */
++	if (spdk_app_start(&opts, ssd_spdk_start, data) != 0) {
++		cf_crash(AS_DRV_SSD, "Error starting spdk application");
++	} else {
++		spdk_app_fini();
++	}
++
++	return NULL;
++}
++
++/* Stop the SPDK app and join its thread - only if SPDK was started and no device is still registered. */
++static void ssd_spdk_shutdown(void)
++{
++	if (!ssd_spdk_data || !TAILQ_EMPTY(&ssd_spdk_data->devices))
++		return;
++	spdk_app_stop(0);
++	cf_thread_join(ssd_spdk_data->tid);
++}
++
++#else
++
++static void ssd_spdk_bdev_init_done(int rc, void *cb_arg)
++{ /* completion callback passed to spdk_app_json_config_load() */
++	struct ssd_spdk_data *data = cb_arg;
++	if (rc != 0) /* was silently dropped: at least report a failed config load */
++		cf_warning(AS_DRV_SSD, "spdk json config load failed: rc %d", rc);
++	data->done = true; /* ends the poll loop in run_ssd_spdk_start() */
++}
++
++/* SPDK-thread message handler: load the configured JSON file; ssd_spdk_bdev_init_done() is the completion. */
++static void ssd_spdk_bdev_init_start(void *arg)
++{
++	struct ssd_spdk_data *data = arg;
++
++	spdk_app_json_config_load(g_config.spdk_json_conf, SPDK_DEFAULT_RPC_ADDR,
++				ssd_spdk_bdev_init_done, data, true);
++}
++
++/* Thread body (lthread build): bring up the SPDK env and config, release ssd_spdk_init(), then run lthreads. */
++static void *run_ssd_spdk_start(void *arg)
++{
++	struct spdk_env_opts opts;
++	struct spdk_thread *thread;
++
++	cf_info(AS_DRV_SSD, "spdk conf is %s", g_config.spdk_json_conf);
++
++	spdk_env_opts_init(&opts);
++	opts.name = "aerospike";
++	if (g_config.service_lcores) {
++		opts.core_mask = g_config.service_lcores;
++		opts.env_context = cf_strdup("--log-level=lib.eal:7");
++	}
++
++	if (spdk_env_init(&opts) < 0) {
++		cf_crash(AS_DRV_SSD, "unable to initialize SPDK env");
++	}
++
++	spdk_thread_lib_init(NULL, 0);
++	thread = spdk_thread_create("spdk_aerospike", NULL);
++	if (!thread) {
++		cf_crash(AS_DRV_SSD, "failed to allocate spdk thread");
++	}
++
++	ssd_spdk_data->done = false;
++	spdk_thread_send_msg(thread, ssd_spdk_bdev_init_start, ssd_spdk_data);
++
++	do {
++		spdk_thread_poll(thread, 0, 0);
++	} while (!ssd_spdk_data->done);
++
++	/* Arm the monitor BEFORE releasing ssd_spdk_init(): run_lthreads() may notify it any time after that. */
++	as_monitor_init(&as_service_run_monitor);
++	as_monitor_begin(&as_service_run_monitor);
++
++	pthread_mutex_lock(&ssd_spdk_data->mutex);
++	pthread_cond_signal(&ssd_spdk_data->cond);
++	pthread_mutex_unlock(&ssd_spdk_data->mutex);
++
++	as_monitor_wait(&as_service_run_monitor);
++	as_service_run_threads();
++
++	return NULL;
++}
++
++static void
++ssd_spdk_shutdown(void)
++{ /* intentionally empty in the USE_LTHREAD build */
++}
++
++#endif
++
++/* One-time SPDK bring-up (needs spdk_json_conf): start run_ssd_spdk_start() on a thread and wait for its signal. */
++static void ssd_spdk_init(void)
++{
++	struct ssd_spdk_data *data;
++
++	if (!g_config.spdk_json_conf || ssd_spdk_data)
++		return;
++
++	data = ssd_spdk_data = cf_malloc(sizeof(*data));
++	TAILQ_INIT(&data->devices);
++	pthread_mutex_init(&data->mutex, NULL);
++	pthread_cond_init(&data->cond, NULL);
++
++	pthread_mutex_lock(&data->mutex);
++	data->tid = cf_thread_create_joinable(run_ssd_spdk_start, data);
++	pthread_cond_wait(&data->cond, &data->mutex);
++	pthread_mutex_unlock(&data->mutex);
++}
++
++static const struct ssd_ops ssd_spdk_ops = { /* SPDK bdev backend (USE_SPDK build) */
++	.name = "spdk-bdev",
++	.init = ssd_spdk_init, /* called from ssd_backends_init() */
++	.shutdown = ssd_spdk_shutdown, /* called from ssd_backends_shutdown() */
++	.init_device = ssd_spdk_init_device, /* reached through ssd_init_device() */
++	.finish_device = ssd_spdk_finish_device, /* reached through ssd_finish_device() */
++	.dma_alloc = ssd_spdk_dma_alloc, /* reached through ssd_dma_alloc() */
++	.dma_free = ssd_spdk_dma_free, /* reached through ssd_dma_free() */
++	.open = ssd_spdk_open,
++	.close = ssd_spdk_close,
++	.pread = ssd_spdk_pread,
++	.pwrite = ssd_spdk_pwrite,
++};
++
++#else /* USE_SPDK */
++
++/* Stub for builds without USE_SPDK: initializing a device on this backend is fatal. */
++static bool ssd_spdk_init_device(as_namespace *ns, drv_ssd *ssd, bool is_shadow)
++{
++	cf_crash(AS_DRV_SSD, "SPDK support is disabled");
++}
++
++static const struct ssd_ops ssd_spdk_ops = { /* placeholder backend when USE_SPDK is not defined */
++	.name = "spdk-bdev",
++	.init_device = ssd_spdk_init_device, /* the stub above, which crashes */
++};
++
++#endif /* USE_SPDK */
++
++/* Run the backend's optional init_device hook; a backend without one always succeeds. */
++static bool ssd_init_device(as_namespace *ns, drv_ssd *ssd, bool is_shadow)
++{
++	if (ssd->ops->init_device == NULL) {
++		return true;
++	}
++	return ssd->ops->init_device(ns, ssd, is_shadow);
++}
++
++/* Run the backend's optional finish_device hook. */
++static void ssd_finish_device(drv_ssd *ssd)
++{
++	if (ssd->ops->finish_device != NULL)
++		ssd->ops->finish_device(ssd);
++}
++
++/* Allocate an I/O buffer: the backend's DMA allocator if it has one, otherwise cf_valloc(). */
++static inline void *ssd_dma_alloc(drv_ssd *ssd, size_t sz)
++{
++	if (ssd->ops->dma_alloc == NULL) {
++		return cf_valloc(sz);
++	}
++	return ssd->ops->dma_alloc(sz);
++}
++
++/* Free a buffer from ssd_dma_alloc(): the backend's hook if it has one, otherwise cf_free(). */
++static inline void ssd_dma_free(drv_ssd *ssd, void *ptr)
++{
++	if (ssd->ops->dma_free)
++		ssd->ops->dma_free(ptr); /* plain call: a void function may not 'return' an expression */
++	else
++		cf_free(ptr);
++}
++
++/* Open a handle on the main or shadow device through the device's backend. */
++static ssd_fd_t ssd_open(drv_ssd *ssd, bool is_shadow, int flags)
++{
++	return ssd->ops->open(ssd, is_shadow, flags);
++}
++
++/* Close a handle through the device's backend. */
++static void ssd_close(drv_ssd *ssd, ssd_fd_t ssd_fd)
++{
++	ssd->ops->close(ssd_fd);
++}
++
++/* Backend read with bounce == false (no bounce buffer requested). */
++static bool ssd_pread_all(drv_ssd *ssd, ssd_fd_t ssd_fd, void *buf, size_t size, off_t offset)
++{
++	return ssd->ops->pread(ssd_fd, buf, size, offset, false);
++}
++
++/* Backend read with bounce == true, for destination buffers not obtained from ssd_dma_alloc(). */
++static bool ssd_pread_all_bounce(drv_ssd *ssd, ssd_fd_t ssd_fd, void *buf, size_t size, off_t offset)
++{
++	return ssd->ops->pread(ssd_fd, buf, size, offset, true);
++}
++
++/* Backend write with bounce == false (no bounce buffer requested). */
++static bool ssd_pwrite_all(drv_ssd *ssd, ssd_fd_t ssd_fd, void *buf, size_t size, off_t offset)
++{
++	return ssd->ops->pwrite(ssd_fd, buf, size, offset, false);
++}
++
++/* Backend write with bounce == true (data is staged through the backend's bounce buffer). */
++static bool ssd_pwrite_all_bounce(drv_ssd *ssd, ssd_fd_t ssd_fd, void *buf, size_t size, off_t offset)
++{
++	return ssd->ops->pwrite(ssd_fd, buf, size, offset, true);
++}
++
++/* True when the handle's 'fd' member is -1, the value the open paths start from / open() fails with. */
++static bool ssd_fd_is_error(ssd_fd_t ssd_fd)
++{
++	return ssd_fd.fd == -1;
++}
++
++/* POSIX backend open hook: open() on the main or shadow device path; fd is -1 on failure. */
++static ssd_fd_t ssd_posix_open(drv_ssd *ssd, bool is_shadow, int flags)
++{
++	const char *path = is_shadow ? ssd->shadow_name : ssd->name;
++	ssd_fd_t ssd_fd;
++
++	ssd_fd.fd = open(path, flags, S_IRUSR | S_IWUSR);
++	return ssd_fd;
++}
++
++/* POSIX backend close hook. */
++static void ssd_posix_close(ssd_fd_t ssd_fd)
++{
++	close(ssd_fd.fd);
++}
++
++/* POSIX backend pread hook: 'bounce' is ignored, the read goes straight into buf. */
++static bool ssd_posix_pread(ssd_fd_t ssd_fd, void *buf, size_t size, off_t offset, bool bounce)
++{
++	return pread_all(ssd_fd.fd, buf, size, offset);
++}
++
++/* POSIX backend pwrite hook: 'bounce' is ignored, the write comes straight from buf. */
++static bool ssd_posix_pwrite(ssd_fd_t ssd_fd, void *buf, size_t size, off_t offset, bool bounce)
++{
++	return pwrite_all(ssd_fd.fd, buf, size, offset);
++}
++
++static const struct ssd_ops ssd_posix_ops = { /* file-descriptor backend; optional hooks are left NULL */
++	.name = "posix",
++	.open = ssd_posix_open,
++	.close = ssd_posix_close,
++	.pread = ssd_posix_pread,
++	.pwrite = ssd_posix_pwrite,
++};
++
++static const struct ssd_ops *ssd_backends[] = { /* walked by ssd_backends_init() / ssd_backends_shutdown() */
++	&ssd_spdk_ops,
++	&ssd_posix_ops,
++	NULL, /* terminator the walkers stop at */
++};
++
++/* Call the optional init hook of every backend in the NULL-terminated ssd_backends[] table. */
++static void ssd_backends_init(void)
++{
++	for (size_t i = 0; ssd_backends[i] != NULL; i++) {
++		if (ssd_backends[i]->init != NULL)
++			ssd_backends[i]->init();
++	}
++}
++
++/* Call the optional shutdown hook of every backend in the NULL-terminated ssd_backends[] table. */
++static void ssd_backends_shutdown(void)
++{
++	for (size_t i = 0; ssd_backends[i] != NULL; i++) {
++		if (ssd_backends[i]->shutdown != NULL)
++			ssd_backends[i]->shutdown();
++	}
++}
+ 
+ //==========================================================
+ // Constants.
+@@ -82,16 +992,20 @@
+ //
+ 
+ // Get an open file descriptor from the pool, or a fresh one if necessary.
+-int
++ssd_fd_t
+ ssd_fd_get(drv_ssd *ssd)
+ {
+-	int fd = -1;
+-	int rv = cf_queue_pop(ssd->fd_q, (void*)&fd, CF_QUEUE_NOWAIT);
++	ssd_fd_t fd = { .fd = -1 };
++	int rv = CF_QUEUE_EMPTY;
++
++	if (ssd->ns->storage_recycle_fds) {
++		rv = cf_queue_pop(ssd->fd_q, (void*)&fd, CF_QUEUE_NOWAIT);
++	}
+ 
+ 	if (rv != CF_QUEUE_OK) {
+-		fd = open(ssd->name, ssd->open_flag, S_IRUSR | S_IWUSR);
++		fd = ssd_open(ssd, false, ssd->open_flag);
+ 
+-		if (-1 == fd) {
++		if (ssd_fd_is_error(fd)) {
+ 			cf_crash(AS_DRV_SSD, "%s: DEVICE FAILED open: errno %d (%s)",
+ 					ssd->name, errno, cf_strerror(errno));
+ 		}
+@@ -101,17 +1015,20 @@ ssd_fd_get(drv_ssd *ssd)
+ }
+ 
+ 
+-int
++ssd_fd_t
+ ssd_fd_cache_get(drv_ssd *ssd)
+ {
+-	int fd = -1;
+-	int rv = cf_queue_pop(ssd->fd_cache_q, (void*)&fd, CF_QUEUE_NOWAIT);
++	ssd_fd_t fd = { .fd = -1 };
++	int rv = CF_QUEUE_EMPTY;
++
++	if (ssd->ns->storage_recycle_fds) {
++		rv = cf_queue_pop(ssd->fd_cache_q, (void*)&fd, CF_QUEUE_NOWAIT);
++	}
+ 
+ 	if (rv != CF_QUEUE_OK) {
+-		fd = open(ssd->name, ssd->open_flag & ~(O_DIRECT | O_DSYNC),
+-				S_IRUSR | S_IWUSR);
++		fd = ssd_open(ssd, false, ssd->open_flag & ~(O_DIRECT | O_DSYNC));
+ 
+-		if (-1 == fd) {
++		if (ssd_fd_is_error(fd)) {
+ 			cf_crash(AS_DRV_SSD, "%s: DEVICE FAILED open: errno %d (%s)",
+ 					ssd->name, errno, cf_strerror(errno));
+ 		}
+@@ -121,16 +1038,20 @@ ssd_fd_cache_get(drv_ssd *ssd)
+ }
+ 
+ 
+-int
++ssd_fd_t
+ ssd_shadow_fd_get(drv_ssd *ssd)
+ {
+-	int fd = -1;
+-	int rv = cf_queue_pop(ssd->shadow_fd_q, (void*)&fd, CF_QUEUE_NOWAIT);
++	ssd_fd_t fd = { .fd = -1 };
++	int rv = CF_QUEUE_EMPTY;
++
++	if (ssd->ns->storage_recycle_fds) {
++		rv = cf_queue_pop(ssd->shadow_fd_q, (void*)&fd, CF_QUEUE_NOWAIT);
++	}
+ 
+ 	if (rv != CF_QUEUE_OK) {
+-		fd = open(ssd->shadow_name, ssd->open_flag, S_IRUSR | S_IWUSR);
++		fd = ssd_open(ssd, true, ssd->open_flag);
+ 
+-		if (-1 == fd) {
++		if (ssd_fd_is_error(fd)) {
+ 			cf_crash(AS_DRV_SSD, "%s: DEVICE FAILED open: errno %d (%s)",
+ 					ssd->shadow_name, errno, cf_strerror(errno));
+ 		}
+@@ -142,23 +1063,35 @@ ssd_shadow_fd_get(drv_ssd *ssd)
+ 
+ // Save an open file descriptor in the pool
+ void
+-ssd_fd_put(drv_ssd *ssd, int fd)
++ssd_fd_put(drv_ssd *ssd, ssd_fd_t fd)
+ {
+-	cf_queue_push(ssd->fd_q, (void*)&fd);
++	if (ssd->ns->storage_recycle_fds) {
++		cf_queue_push(ssd->fd_q, (void*)&fd);
++	} else {
++		ssd_close(ssd, fd);
++	}
+ }
+ 
+ 
+ static inline void
+-ssd_fd_cache_put(drv_ssd *ssd, int fd)
++ssd_fd_cache_put(drv_ssd *ssd, ssd_fd_t fd)
+ {
+-	cf_queue_push(ssd->fd_cache_q, (void*)&fd);
++	if (ssd->ns->storage_recycle_fds) {
++		cf_queue_push(ssd->fd_cache_q, (void*)&fd);
++	} else {
++		ssd_close(ssd, fd);
++	}
+ }
+ 
+ 
+ static inline void
+-ssd_shadow_fd_put(drv_ssd *ssd, int fd)
++ssd_shadow_fd_put(drv_ssd *ssd, ssd_fd_t fd)
+ {
+-	cf_queue_push(ssd->shadow_fd_q, (void*)&fd);
++	if (ssd->ns->storage_recycle_fds) {
++		cf_queue_push(ssd->shadow_fd_q, (void*)&fd);
++	} else {
++		ssd_close(ssd, fd);
++	}
+ }
+ 
+ 
+@@ -299,7 +1232,7 @@ swb_create(drv_ssd *ssd)
+ {
+ 	ssd_write_buf *swb = (ssd_write_buf*)cf_malloc(sizeof(ssd_write_buf));
+ 
+-	swb->buf = cf_valloc(ssd->write_block_size);
++	swb->buf = ssd_dma_alloc(ssd, ssd->write_block_size);
+ 
+ 	swb->n_vacated = 0;
+ 	swb->vacated_capacity = VACATED_CAPACITY_STEP;
+@@ -313,7 +1246,7 @@ static inline void
+ swb_destroy(ssd_write_buf *swb)
+ {
+ 	cf_free(swb->vacated_wblocks);
+-	cf_free(swb->buf);
++	ssd_dma_free(swb->ssd, swb->buf);
+ 	cf_free(swb);
+ }
+ 
+@@ -507,6 +1440,7 @@ ssd_block_free(drv_ssd *ssd, uint64_t rblock_id, uint32_t n_rblocks, char *msg)
+ 			rblock_id);
+ 
+ 	cf_assert(start_offset >= DRV_HEADER_SIZE &&
++			start_offset >= ssd->write_block_size &&
+ 			wblock_id < ssd->n_wblocks && wblock_id == end_wblock_id,
+ 			AS_DRV_SSD, "%s: %s: freeing bad range rblock_id %lu n_rblocks %u",
+ 			ssd->name, msg, rblock_id, n_rblocks);
+@@ -734,16 +1668,15 @@ ssd_defrag_wblock(drv_ssd *ssd, uint32_t wblock_id, uint8_t *read_buf)
+ 		goto Finished;
+ 	}
+ 
+-	int fd = ssd_fd_get(ssd);
++	ssd_fd_t fd = ssd_fd_get(ssd);
+ 	uint64_t file_offset = WBLOCK_ID_TO_OFFSET(ssd, wblock_id);
+ 
+ 	uint64_t start_ns = ssd->ns->storage_benchmarks_enabled ? cf_getns() : 0;
+ 
+-	if (! pread_all(fd, read_buf, ssd->write_block_size, (off_t)file_offset)) {
++	if (! ssd_pread_all(ssd, fd, read_buf, ssd->write_block_size, (off_t)file_offset)) {
+ 		cf_warning(AS_DRV_SSD, "%s: read failed: errno %d (%s)", ssd->name,
+ 				errno, cf_strerror(errno));
+-		close(fd);
+-		fd = -1;
++		ssd_close(ssd, fd);
+ 		goto Finished;
+ 	}
+ 
+@@ -830,7 +1763,7 @@ run_defrag(void *pv_data)
+ 	drv_ssd *ssd = (drv_ssd*)pv_data;
+ 	as_namespace *ns = ssd->ns;
+ 	uint32_t wblock_id;
+-	uint8_t *read_buf = cf_valloc(ssd->write_block_size);
++	uint8_t *read_buf = ssd_dma_alloc(ssd, ssd->write_block_size);
+ 
+ 	while (true) {
+ 		uint32_t q_min = as_load_uint32(&ns->storage_defrag_queue_min);
+@@ -859,6 +1792,7 @@ run_defrag(void *pv_data)
+ 			usleep(1000);
+ 		}
+ 	}
++	ssd_dma_free(ssd, read_buf);
+ 
+ 	return NULL;
+ }
+@@ -1133,16 +2067,16 @@ ssd_read_record(as_storage_rd *rd, bool pickle_only)
+ 
+ 		read_buf = cf_valloc(read_size);
+ 
+-		int fd = rd->read_page_cache ? ssd_fd_cache_get(ssd) : ssd_fd_get(ssd);
++		ssd_fd_t fd = rd->read_page_cache ? ssd_fd_cache_get(ssd) : ssd_fd_get(ssd);
+ 
+ 		uint64_t start_ns = ns->storage_benchmarks_enabled ? cf_getns() : 0;
+ 		uint64_t start_us = as_health_sample_device_read() ? cf_getus() : 0;
+ 
+-		if (! pread_all(fd, read_buf, read_size, (off_t)read_offset)) {
++		if (! ssd_pread_all_bounce(ssd, fd, read_buf, read_size, (off_t)read_offset)) {
+ 			cf_warning(AS_DRV_SSD, "%s: read failed: size %lu: errno %d (%s)",
+ 					ssd->name, read_size, errno, cf_strerror(errno));
+ 			cf_free(read_buf);
+-			close(fd);
++			ssd_close(ssd, fd);
+ 			return -1;
+ 		}
+ 
+@@ -1305,12 +2239,12 @@ ssd_flush_swb(drv_ssd *ssd, ssd_write_buf *swb)
+ 		;
+ 	}
+ 
+-	int fd = ssd_fd_get(ssd);
++	ssd_fd_t fd = ssd_fd_get(ssd);
+ 	off_t write_offset = (off_t)WBLOCK_ID_TO_OFFSET(ssd, swb->wblock_id);
+ 
+ 	uint64_t start_ns = ssd->ns->storage_benchmarks_enabled ? cf_getns() : 0;
+ 
+-	if (! pwrite_all(fd, swb->buf, ssd->write_block_size, write_offset)) {
++	if (! ssd_pwrite_all(ssd, fd, swb->buf, ssd->write_block_size, write_offset)) {
+ 		cf_crash(AS_DRV_SSD, "%s: DEVICE FAILED write: errno %d (%s)",
+ 				ssd->name, errno, cf_strerror(errno));
+ 	}
+@@ -1326,12 +2260,12 @@ ssd_flush_swb(drv_ssd *ssd, ssd_write_buf *swb)
+ void
+ ssd_shadow_flush_swb(drv_ssd *ssd, ssd_write_buf *swb)
+ {
+-	int fd = ssd_shadow_fd_get(ssd);
++	ssd_fd_t fd = ssd_shadow_fd_get(ssd);
+ 	off_t write_offset = (off_t)WBLOCK_ID_TO_OFFSET(ssd, swb->wblock_id);
+ 
+ 	uint64_t start_ns = ssd->ns->storage_benchmarks_enabled ? cf_getns() : 0;
+ 
+-	if (! pwrite_all(fd, swb->buf, ssd->write_block_size, write_offset)) {
++	if (! ssd_pwrite_all(ssd, fd, swb->buf, ssd->write_block_size, write_offset)) {
+ 		cf_crash(AS_DRV_SSD, "%s: DEVICE FAILED write: errno %d (%s)",
+ 				ssd->shadow_name, errno, cf_strerror(errno));
+ 	}
+@@ -1681,7 +2615,7 @@ as_storage_dump_wb_summary_ssd(const as_namespace *ns)
+ 	// Note: This is a sparse array that could be more efficiently stored.
+ 	// (In addition, ranges of block sizes could be binned together to
+ 	// compress the histogram, rather than using one bin per block size.)
+-	uint32_t *wb_hist = cf_calloc(1, sizeof(uint32_t) * MAX_WRITE_BLOCK_SIZE);
++	uint32_t *wb_hist = cf_calloc(1, sizeof(uint32_t) * ns->storage_write_block_size);
+ 
+ 	for (uint32_t d = 0; d < ssds->n_ssds; d++) {
+ 		drv_ssd *ssd = &ssds->ssds[d];
+@@ -1741,7 +2675,7 @@ as_storage_dump_wb_summary_ssd(const as_namespace *ns)
+ 			(defraggable_sz + non_defraggable_sz) /
+ 					MAX(1, (total_num_defraggable + total_num_above_wm)));
+ 
+-	for (uint32_t i = 0; i < MAX_WRITE_BLOCK_SIZE; i++) {
++	for (uint32_t i = 0; i < ns->storage_write_block_size; i++) {
+ 		if (wb_hist[i] > 0) {
+ 			cf_info(AS_DRV_SSD, "WBH: %u block%s of size %u bytes",
+ 					wb_hist[i], (wb_hist[i] != 1 ? "s" : ""), i);
+@@ -2115,7 +3049,7 @@ ssd_read_header(drv_ssd *ssd)
+ 	bool use_shadow = ns->cold_start && ssd->shadow_name;
+ 
+ 	const char *ssd_name;
+-	int fd;
++	ssd_fd_t fd;
+ 	size_t read_size;
+ 
+ 	if (use_shadow) {
+@@ -2131,7 +3065,7 @@ ssd_read_header(drv_ssd *ssd)
+ 
+ 	drv_header *header = cf_valloc(read_size);
+ 
+-	if (! pread_all(fd, (void*)header, read_size, 0)) {
++	if (! ssd_pread_all_bounce(ssd, fd, (void *)header, read_size, 0)) {
+ 		cf_crash(AS_DRV_SSD, "%s: read failed: errno %d (%s)", ssd_name, errno,
+ 				cf_strerror(errno));
+ 	}
+@@ -2186,6 +3120,7 @@ ssd_read_header(drv_ssd *ssd)
+ 
+ 	if (header->unique.pristine_offset != 0 && // always 0 before 4.6
+ 			(header->unique.pristine_offset < DRV_HEADER_SIZE ||
++					header->unique.pristine_offset < ssd->write_block_size ||
+ 					header->unique.pristine_offset > ssd->file_size)) {
+ 		cf_crash(AS_DRV_SSD, "%s: bad pristine offset %lu", ssd_name,
+ 				header->unique.pristine_offset);
+@@ -2222,7 +3157,7 @@ ssd_init_header(as_namespace *ns, drv_ssd *ssd)
+ 
+ 
+ void
+-ssd_empty_header(int fd, const char* device_name)
++ssd_posix_empty_header(int fd, const char* device_name)
+ {
+ 	void *h = cf_valloc(DRV_HEADER_SIZE);
+ 
+@@ -2236,6 +3171,28 @@ ssd_empty_header(int fd, const char* device_name)
+ 	cf_free(h);
+ }
+ 
++#ifdef USE_SPDK
++
++// Write DRV_HEADER_SIZE zero bytes at offset 0 of the device opened with
++// ssd_open(ssd, is_shadow, ...). Crashes if the open or the write fails.
++static void
++ssd_empty_header(drv_ssd *ssd, bool is_shadow)
++{
++	const char *name = is_shadow ? ssd->shadow_name : ssd->name;
++	void *h = cf_valloc(DRV_HEADER_SIZE);
++	memset(h, 0, DRV_HEADER_SIZE);
++	ssd_fd_t fd = ssd_open(ssd, is_shadow, ssd->open_flag);
++	if (ssd_fd_is_error(fd)) {
++		cf_crash(AS_DRV_SSD, "%s: DEVICE FAILED open", name);
++	}
++	if (! ssd_pwrite_all_bounce(ssd, fd, h, DRV_HEADER_SIZE, 0)) {
++		cf_crash(AS_DRV_SSD, "%s: DEVICE FAILED write", name);
++	}
++	ssd_close(ssd, fd);
++	cf_free(h);
++}
++
++#endif /* USE_SPDK */
+ 
+ void
+ ssd_write_header(drv_ssd *ssd, uint8_t *header, uint8_t *from, size_t size)
+@@ -2248,9 +3205,9 @@ ssd_write_header(drv_ssd *ssd, uint8_t *header, uint8_t *from, size_t size)
+ 	uint8_t *flush = header + flush_offset;
+ 	size_t flush_sz = flush_end_offset - flush_offset;
+ 
+-	int fd = ssd_fd_get(ssd);
++	ssd_fd_t fd = ssd_fd_get(ssd);
+ 
+-	if (! pwrite_all(fd, (void*)flush, flush_sz, flush_offset)) {
++	if (! ssd_pwrite_all_bounce(ssd, fd, (void*)flush, flush_sz, flush_offset)) {
+ 		cf_crash(AS_DRV_SSD, "%s: DEVICE FAILED write: errno %d (%s)",
+ 				ssd->name, errno, cf_strerror(errno));
+ 	}
+@@ -2269,7 +3226,7 @@ ssd_write_header(drv_ssd *ssd, uint8_t *header, uint8_t *from, size_t size)
+ 
+ 	fd = ssd_shadow_fd_get(ssd);
+ 
+-	if (! pwrite_all(fd, (void*)flush, flush_sz, flush_offset)) {
++	if (! ssd_pwrite_all_bounce(ssd, fd, (void*)flush, flush_sz, flush_offset)) {
+ 		cf_crash(AS_DRV_SSD, "%s: DEVICE FAILED write: errno %d (%s)",
+ 				ssd->shadow_name, errno, cf_strerror(errno));
+ 	}
+@@ -2517,29 +3474,30 @@ ssd_cold_start_sweep(drv_ssds *ssds, drv_ssd *ssd)
+ {
+ 	size_t wblock_size = ssd->write_block_size;
+ 
+-	uint8_t *buf = cf_valloc(wblock_size);
++	uint8_t *buf = ssd_dma_alloc(ssd, wblock_size);
+ 
+ 	bool read_shadow = ssd->shadow_name;
+ 	const char *read_ssd_name = read_shadow ? ssd->shadow_name : ssd->name;
+-	int fd = read_shadow ? ssd_shadow_fd_get(ssd) : ssd_fd_get(ssd);
+-	int write_fd = read_shadow ? ssd_fd_get(ssd) : -1;
++	ssd_fd_t fd = read_shadow ? ssd_shadow_fd_get(ssd) : ssd_fd_get(ssd);
++	ssd_fd_t write_fd = read_shadow ? ssd_fd_get(ssd) : (ssd_fd_t) { .fd = -1 };
+ 
+ 	// Loop over all wblocks, unless we encounter 10 contiguous unused wblocks.
+ 
+ 	ssd->sweep_wblock_id = ssd->first_wblock_id;
+ 
+-	uint64_t file_offset = DRV_HEADER_SIZE;
++	uint64_t file_offset = ssd->write_block_size <= DRV_HEADER_SIZE ?
++		DRV_HEADER_SIZE : ssd->write_block_size;
+ 	uint32_t n_unused_wblocks = 0;
+ 
+ 	bool prefetch = cf_arenax_want_prefetch(ssd->ns->arena);
+ 
+ 	while (file_offset < ssd->file_size && n_unused_wblocks < 10) {
+-		if (! pread_all(fd, buf, wblock_size, (off_t)file_offset)) {
++		if (! ssd_pread_all(ssd, fd, buf, wblock_size, (off_t)file_offset)) {
+ 			cf_crash(AS_DRV_SSD, "%s: read failed: errno %d (%s)",
+ 					read_ssd_name, errno, cf_strerror(errno));
+ 		}
+ 
+-		if (read_shadow && ! pwrite_all(write_fd, (void*)buf, wblock_size,
++		if (read_shadow && ! ssd_pwrite_all(ssd, write_fd, (void*)buf, wblock_size,
+ 				(off_t)file_offset)) {
+ 			cf_crash(AS_DRV_SSD, "%s: write failed: errno %d (%s)", ssd->name,
+ 					errno, cf_strerror(errno));
+@@ -2614,15 +3572,15 @@ ssd_cold_start_sweep(drv_ssds *ssds, drv_ssd *ssd)
+ 
+ 	ssd->sweep_wblock_id = (uint32_t)(ssd->file_size / wblock_size);
+ 
+-	if (fd != -1) {
++	if (!ssd_fd_is_error(fd)) {
+ 		read_shadow ? ssd_shadow_fd_put(ssd, fd) : ssd_fd_put(ssd, fd);
+ 	}
+ 
+-	if (write_fd != -1) {
++	if (!ssd_fd_is_error(write_fd)) {
+ 		ssd_fd_put(ssd, write_fd);
+ 	}
+ 
+-	cf_free(buf);
++	ssd_dma_free(ssd, buf);
+ }
+ 
+ 
+@@ -2740,7 +3698,7 @@ si_startup_sweep(drv_ssds* ssds, drv_ssd* ssd)
+ 	size_t wblock_size = ssd->write_block_size;
+ 
+ 	uint8_t* buf = cf_valloc(wblock_size);
+-	int fd = ssd_fd_get(ssd);
++	ssd_fd_t fd = ssd_fd_get(ssd);
+ 	uint64_t file_offset = DRV_HEADER_SIZE;
+ 
+ 	bool prefetch = cf_arenax_want_prefetch(ssd->ns->arena);
+@@ -2755,7 +3713,7 @@ si_startup_sweep(drv_ssds* ssds, drv_ssd* ssd)
+ 			continue;
+ 		}
+ 
+-		if (! pread_all(fd, buf, wblock_size, (off_t)file_offset)) {
++		if (! ssd_pread_all(ssd, fd, buf, wblock_size, (off_t)file_offset)) {
+ 			cf_crash(AS_DRV_SSD, "%s: read failed: errno %d (%s)", ssd->name,
+ 					errno, cf_strerror(errno));
+ 		}
+@@ -3148,18 +4106,20 @@ ssd_init_synchronous(drv_ssds *ssds)
+ }
+ 
+ 
+-static uint64_t
++uint64_t
+ check_file_size(as_namespace *ns, uint64_t file_size, const char *tag)
+ {
++	uint64_t first_wblock_offset = ns->storage_write_block_size <= DRV_HEADER_SIZE ?
++			DRV_HEADER_SIZE : ns->storage_write_block_size;
+ 	cf_assert(sizeof(off_t) > 4, AS_DRV_SSD, "this OS supports only 32-bit (4g) files - compile with 64 bit offsets");
+ 
+-	if (file_size > DRV_HEADER_SIZE) {
++	if (file_size > first_wblock_offset) {
+ 		off_t unusable_size =
+-				(file_size - DRV_HEADER_SIZE) % ns->storage_write_block_size;
++				(file_size - first_wblock_offset) % ns->storage_write_block_size;
+ 
+ 		if (unusable_size != 0) {
+-			cf_info(AS_DRV_SSD, "%s size must be header size %u + multiple of %u, rounding down",
+-					tag, DRV_HEADER_SIZE, ns->storage_write_block_size);
++			cf_info(AS_DRV_SSD, "%s size must be header size %lu + multiple of %u, rounding down",
++					tag, first_wblock_offset, ns->storage_write_block_size);
+ 			file_size -= unusable_size;
+ 		}
+ 
+@@ -3170,9 +4130,9 @@ check_file_size(as_namespace *ns, uint64_t file_size, const char *tag)
+ 		}
+ 	}
+ 
+-	if (file_size <= DRV_HEADER_SIZE) {
+-		cf_crash(AS_DRV_SSD, "%s size %ld must be greater than header size %d",
+-				tag, file_size, DRV_HEADER_SIZE);
++	if (file_size <= first_wblock_offset) {
++		cf_crash(AS_DRV_SSD, "%s size %ld must be greater than header size %ld",
++				tag, file_size, first_wblock_offset);
+ 	}
+ 
+ 	return file_size;
+@@ -3218,6 +4178,20 @@ ssd_init_devices(as_namespace *ns, drv_ssds **ssds_p)
+ 
+ 		ssd->name = ns->storage_devices[i];
+ 
++		for (const struct ssd_ops **ops = &ssd_backends[0]; *ops; ops++) {
++			if (!strcmp(ns->storage_device_backend, (*ops)->name)) {
++				ssd->ops = *ops;
++				break;
++			}
++		}
++
++		if (!ssd->ops) {
++			cf_crash(AS_DRV_SSD, "Unknown storage device backend: %s", ns->storage_device_backend);
++		} else if (ssd->ops != &ssd_posix_ops) {
++			ssd_init_device(ns, ssd, false);
++			continue;
++		}
++
+ 		// Note - can't configure commit-to-device and disable-odsync.
+ 		ssd->open_flag = O_RDWR | O_DIRECT |
+ 				(ns->storage_disable_odsync ? 0 : O_DSYNC);
+@@ -3233,11 +4207,13 @@ ssd_init_devices(as_namespace *ns, drv_ssds **ssds_p)
+ 
+ 		ioctl(fd, BLKGETSIZE64, &size); // gets the number of bytes
+ 
++		if (ns->storage_filesize)
++			size = MIN(ns->storage_filesize, size);
+ 		ssd->file_size = check_file_size(ns, size, "usable device");
+ 		ssd->io_min_size = find_io_min_size(fd, ssd->name);
+ 
+ 		if (ns->cold_start && ns->storage_cold_start_empty) {
+-			ssd_empty_header(fd, ssd->name);
++			ssd_posix_empty_header(fd, ssd->name);
+ 
+ 			cf_info(AS_DRV_SSD, "cold-start-empty - erased header of %s",
+ 					ssd->name);
+@@ -3274,6 +4250,11 @@ ssd_init_shadow_devices(as_namespace *ns, drv_ssds *ssds)
+ 
+ 		ssd->shadow_name = ns->storage_shadows[i];
+ 
++		if (ssd->ops != &ssd_posix_ops) {
++			ssd_init_device(ns, ssd, true);
++			continue;
++		}
++
+ 		int fd = open(ssd->shadow_name, ssd->open_flag, S_IRUSR | S_IWUSR);
+ 
+ 		if (fd == -1) {
+@@ -3293,7 +4274,7 @@ ssd_init_shadow_devices(as_namespace *ns, drv_ssds *ssds)
+ 		ssd->shadow_io_min_size = find_io_min_size(fd, ssd->shadow_name);
+ 
+ 		if (ns->cold_start && ns->storage_cold_start_empty) {
+-			ssd_empty_header(fd, ssd->shadow_name);
++			ssd_posix_empty_header(fd, ssd->shadow_name);
+ 
+ 			cf_info(AS_DRV_SSD, "cold-start-empty - erased header of %s",
+ 					ssd->shadow_name);
+@@ -3329,6 +4310,7 @@ ssd_init_files(as_namespace *ns, drv_ssds **ssds_p)
+ 		drv_ssd *ssd = &ssds->ssds[i];
+ 
+ 		ssd->name = ns->storage_devices[i];
++		ssd->ops = &ssd_posix_ops;
+ 
+ 		if (ns->cold_start && ns->storage_cold_start_empty) {
+ 			if (unlink(ssd->name) == 0) {
+@@ -3449,9 +4431,9 @@ ssd_set_pristine_offset(drv_ssds *ssds)
+ 	for (int i = 0; i < ssds->n_ssds; i++) {
+ 		drv_ssd *ssd = &ssds->ssds[i];
+ 
+-		int fd = ssd_fd_get(ssd);
++		ssd_fd_t fd = ssd_fd_get(ssd);
+ 
+-		if (! pread_all(fd, (void *)header_unique, HI_IO_MIN_SIZE, offset)) {
++		if (! ssd_pread_all_bounce(ssd, fd, (void *)header_unique, HI_IO_MIN_SIZE, offset)) {
+ 			cf_crash(AS_DRV_SSD, "%s: read failed: errno %d (%s)",
+ 					ssd->name, errno, cf_strerror(errno));
+ 		}
+@@ -3459,7 +4441,7 @@ ssd_set_pristine_offset(drv_ssds *ssds)
+ 		header_unique->pristine_offset =
+ 				(uint64_t)ssd->pristine_wblock_id * ssd->write_block_size;
+ 
+-		if (! pwrite_all(fd, (void *)header_unique, HI_IO_MIN_SIZE, offset)) {
++		if (! ssd_pwrite_all_bounce(ssd, fd, (void *)header_unique, HI_IO_MIN_SIZE, offset)) {
+ 			cf_crash(AS_DRV_SSD, "%s: DEVICE FAILED write: errno %d (%s)",
+ 					ssd->name, errno, cf_strerror(errno));
+ 		}
+@@ -3503,6 +4485,8 @@ as_storage_init_ssd(as_namespace *ns)
+ {
+ 	drv_ssds *ssds;
+ 
++	ssd_backends_init();
++
+ 	if (ns->n_storage_devices != 0) {
+ 		ssd_init_devices(ns, &ssds);
+ 		ssd_init_shadow_devices(ns, ssds);
+@@ -3540,7 +4524,9 @@ as_storage_init_ssd(as_namespace *ns)
+ 	snprintf(histname, sizeof(histname), "{%s}-device-write-size", ns->name);
+ 	ns->device_write_size_hist = histogram_create(histname, HIST_SIZE);
+ 
+-	uint32_t first_wblock_id = DRV_HEADER_SIZE / ns->storage_write_block_size;
++	uint64_t first_wblock_offset = ns->storage_write_block_size <= DRV_HEADER_SIZE ?
++			DRV_HEADER_SIZE : ns->storage_write_block_size;
++	uint32_t first_wblock_id = first_wblock_offset / ns->storage_write_block_size;
+ 
+ 	// Finish initializing drv_ssd structures (non-zero-value members).
+ 	for (int i = 0; i < ssds->n_ssds; i++) {
+@@ -3568,11 +4554,11 @@ as_storage_init_ssd(as_namespace *ns)
+ 
+ 		// Note: free_wblock_q, defrag_wblock_q created after loading devices.
+ 
+-		ssd->fd_q = cf_queue_create(sizeof(int), true);
+-		ssd->fd_cache_q = cf_queue_create(sizeof(int), true);
++		ssd->fd_q = cf_queue_create(sizeof(ssd_fd_t), true);
++		ssd->fd_cache_q = cf_queue_create(sizeof(ssd_fd_t), true);
+ 
+ 		if (ssd->shadow_name) {
+-			ssd->shadow_fd_q = cf_queue_create(sizeof(int), true);
++			ssd->shadow_fd_q = cf_queue_create(sizeof(ssd_fd_t), true);
+ 		}
+ 
+ 		ssd->swb_write_q = cf_queue_create(sizeof(void*), true);
+@@ -3590,17 +4576,17 @@ as_storage_init_ssd(as_namespace *ns)
+ 		}
+ 
+ 		snprintf(histname, sizeof(histname), "{%s}-%s-read", ns->name, ssd->name);
+-		ssd->hist_read = histogram_create(histname, HIST_MILLISECONDS);
++		ssd->hist_read = histogram_create(histname, HIST_MICROSECONDS);
+ 
+ 		snprintf(histname, sizeof(histname), "{%s}-%s-large-block-read", ns->name, ssd->name);
+-		ssd->hist_large_block_read = histogram_create(histname, HIST_MILLISECONDS);
++		ssd->hist_large_block_read = histogram_create(histname, HIST_MICROSECONDS);
+ 
+ 		snprintf(histname, sizeof(histname), "{%s}-%s-write", ns->name, ssd->name);
+-		ssd->hist_write = histogram_create(histname, HIST_MILLISECONDS);
++		ssd->hist_write = histogram_create(histname, HIST_MICROSECONDS);
+ 
+ 		if (ssd->shadow_name) {
+ 			snprintf(histname, sizeof(histname), "{%s}-%s-shadow-write", ns->name, ssd->name);
+-			ssd->hist_shadow_write = histogram_create(histname, HIST_MILLISECONDS);
++			ssd->hist_shadow_write = histogram_create(histname, HIST_MICROSECONDS);
+ 		}
+ 
+ 		ssd_init_commit(ssd);
+@@ -4133,4 +5119,10 @@ as_storage_shutdown_ssd(as_namespace *ns)
+ 
+ 	ssd_set_pristine_offset(ssds);
+ 	ssd_set_trusted(ssds);
++
++	for (int i = 0; i < ssds->n_ssds; i++) {
++		ssd_finish_device(&ssds->ssds[i]);
++	}
++
++	ssd_backends_shutdown();
+ }
+diff --git a/cf/src/Makefile b/cf/src/Makefile
+index c7e0276..5d3fe61 100644
+--- a/cf/src/Makefile
++++ b/cf/src/Makefile
+@@ -28,6 +28,13 @@ INCLUDES += -I$(COMMON)/src/include
+ INCLUDES += -I$(JANSSON)/src
+ INCLUDES += -I$(JEMALLOC)/include
+ 
++ifeq ($(USE_LTHREAD),1)
++  CFLAGS += -DUSE_LTHREAD
++  INCLUDES += -I$(SPDK)/dpdk/build/include
++  INCLUDES += -I$(LTHREAD)/
++  INCLUDES += -I$(LTHREAD)/arch/x86
++endif
++
+ OBJECTS = $(SOURCES:%.c=$(OBJECT_DIR)/%.o)
+ DEPENDENCIES = $(OBJECTS:%.o=%.d)
+ 
+diff --git a/cf/src/cf_mutex.c b/cf/src/cf_mutex.c
+index d249cfd..537c581 100644
+--- a/cf/src/cf_mutex.c
++++ b/cf/src/cf_mutex.c
+@@ -36,6 +36,9 @@
+ 
+ #include "log.h"
+ 
++#ifdef USE_LTHREAD
++#include "lthread_api.h"
++#endif
+ 
+ //==========================================================
+ // Typedefs & constants.
+@@ -57,8 +60,10 @@ sys_futex(void *uaddr, int op, int val)
+ #define xchg(__ptr, __val) __sync_lock_test_and_set(__ptr, __val)
+ #define cmpxchg(__ptr, __cmp, __set) __sync_val_compare_and_swap(__ptr, __cmp, __set)
+ #define cpu_relax() asm volatile("pause\n": : :"memory")
++#ifndef USE_LTHREAD
+ #define unlikely(__expr) __builtin_expect(!! (__expr), 0)
+ #define likely(__expr) __builtin_expect(!! (__expr), 1)
++#endif
+ 
+ 
+ //==========================================================
+@@ -72,6 +77,15 @@ cf_mutex_lock(cf_mutex *m)
+ 		return; // was not locked
+ 	}
+ 
++#ifdef USE_LTHREAD
++	if (lthread_current()) {
++		while (!cf_mutex_trylock(m)) {
++			lthread_yield();
++		}
++		return;
++	}
++#endif
++
+ 	if (m->u32 == 2) {
+ 		sys_futex(m, FUTEX_WAIT_PRIVATE, 2);
+ 	}
+diff --git a/make_in/Makefile.in b/make_in/Makefile.in
+index 3a946bd..567c88d 100644
+--- a/make_in/Makefile.in
++++ b/make_in/Makefile.in
+@@ -31,7 +31,7 @@ SRCDIR =
+ MARCH_NATIVE = $(shell uname -m)
+ 
+ # If GCC v4.4.7 or later, use DWARF version 4, othewise use version 2:
+-ifeq ($(shell $(DEPTH)/build/VersionCheck.py 'gcc -dumpversion' 4.4.7), 1)
++ifeq ($(shell $(DEPTH)/build/VersionCheck.py '$(CC) -dumpversion' 4.4.7), 1)
+   DWARF_VERSION=4
+ else
+   DWARF_VERSION=2
+@@ -67,6 +67,10 @@ endif
+ # O3 also enables -finline-functions, among other things.
+ COMMON_CFLAGS = -gdwarf-$(DWARF_VERSION) -g3 $(OPTFLAGS) -fno-common -fno-strict-aliasing -Wall $(AS_CFLAGS) $(AS_EE_CFLAGS)
+ 
++ifeq ($(shell $(DEPTH)/build/VersionCheck.py '$(CC) -dumpversion' 8), 1)
++  COMMON_CFLAGS += -fno-stack-clash-protection
++endif
++
+ # Code generated for the "nocona" architecture has been determined to run well on a wide variety of current machines.
+ ifneq ($(ARCH),$(filter $(ARCH),ppc64 ppc64le))
+   COMMON_CFLAGS += -march=nocona
+@@ -79,7 +83,7 @@ COMMON_CFLAGS += -MMD
+ COMMON_CFLAGS += -Werror
+ 
+ # Override certain warnings under GCC v9+.
+-ifeq ($(shell $(DEPTH)/build/VersionCheck.py 'gcc -dumpversion' 9), 1)
++ifeq ($(shell $(DEPTH)/build/VersionCheck.py '$(CC) -dumpversion' 9), 1)
+   # Disable compilation failure due to warnings about possibly unaligned pointers into packed structs.
+   COMMON_CFLAGS += -Wno-address-of-packed-member
+ endif
+diff --git a/make_in/Makefile.vars b/make_in/Makefile.vars
+index 8a0ce0b..09b2066 100644
+--- a/make_in/Makefile.vars
++++ b/make_in/Makefile.vars
+@@ -42,6 +42,9 @@ LD_LUAJIT = static
+ # Default mode used for linking the Lua library:
+ LD_LUA = static
+ 
++# Default mode used for linking the SPDK library:
++LD_SPDK = static
++
+ # Options to pass to Jansson's "configure" script.
+ JANSSON_CONFIG_OPT =
+ 
+@@ -80,6 +83,8 @@ MOD_LUA_PATH  := $(realpath $(DEPTH)/modules/mod-lua)
+ JEMALLOC_PATH := $(realpath $(DEPTH)/modules/jemalloc)
+ LUAJIT_PATH   := $(realpath $(DEPTH)/modules/luajit)
+ S2_PATH       := $(realpath $(DEPTH)/modules/s2-geometry-library/geometry)
++SPDK_PATH     := $(realpath $(DEPTH)/modules/spdk)
++LTHREAD_PATH  := $(realpath $(DEPTH)/modules/lthread)
+ 
+ # Overridable values used by sub-makefiles:
+ AI       = $(AI_PATH)
+@@ -91,6 +96,8 @@ MOD_LUA  = $(MOD_LUA_PATH)
+ JEMALLOC = $(JEMALLOC_PATH)
+ LUAJIT   = $(LUAJIT_PATH)
+ S2       = $(S2_PATH)
++SPDK     = $(SPDK_PATH)
++LTHREAD  = $(LTHREAD_PATH)
+ 
+ # Programs, for which GNU Make doesn't define implicit variables:
+ OBJCOPY  := objcopy