Skip to content

Commit

Permalink
adapt to VFIO live migration v2 (#782)
Browse files Browse the repository at this point in the history
This commit adapts the vfio-user protocol specification and the libvfio-user
implementation to v2 of the VFIO live migration interface, as used in the kernel
and QEMU.

The differences between v1 and v2 are discussed in this email thread [1], and we
slightly differ from upstream VFIO v2 in that instead of transferring data over
a new FD, we use the existing UNIX socket with new commands
VFIO_USER_MIG_DATA_READ/WRITE. We also don't yet use P2P states.

The updated spec was submitted to qemu-devel [2].

[1] https://lore.kernel.org/all/20220130160826.32449-9-yishaih@nvidia.com/
[2] https://lore.kernel.org/all/20230718094150.110183-1-william.henderson@nutanix.com/

Signed-off-by: William Henderson <william.henderson@nutanix.com>
  • Loading branch information
w-henderson authored Sep 15, 2023
1 parent 1569a37 commit 190f85b
Show file tree
Hide file tree
Showing 25 changed files with 2,497 additions and 2,357 deletions.
836 changes: 463 additions & 373 deletions docs/vfio-user.rst

Large diffs are not rendered by default.

162 changes: 13 additions & 149 deletions include/libvfio-user.h
Original file line number Diff line number Diff line change
Expand Up @@ -583,21 +583,8 @@ typedef enum {
VFU_MIGR_STATE_RESUME
} vfu_migr_state_t;

#define VFU_MIGR_CALLBACKS_VERS 1
#define VFU_MIGR_CALLBACKS_VERS 2

/*
* Callbacks during the pre-copy and stop-and-copy phases.
*
* The client executes the following steps to copy migration data:
*
* 1. get_pending_bytes: device must return amount of migration data
* 2. prepare_data: device must prepare migration data
* 3. read_data: device must provide migration data
*
* The client repeats the above steps until there is no more migration data to
* return (the device must return 0 from get_pending_bytes to indicate that
* there are no more migration data to be consumed in this iteration).
*/
typedef struct {

/*
Expand All @@ -615,152 +602,30 @@ typedef struct {
* FIXME maybe we should create a single callback and pass the state?
*/
int (*transition)(vfu_ctx_t *vfu_ctx, vfu_migr_state_t state);

/* Callbacks for saving device state */

/*
* Function that is called to retrieve the amount of pending migration
* data. If migration data were previously made available (function
* prepare_data has been called) then calling this function signifies that
* they have been read (e.g. migration data can be discarded). If the
* function returns 0 then migration has finished and this function won't
* be called again.
*
* The amount of pending migration data returned by the device does not
* necessarily have to monotonically decrease over time and does not need
* to match the amount of migration data returned via the @size argument in
* prepare_data. It can completely fluctuate according to the needs of the
* device. These semantics are derived from the pending_bytes register in
* VFIO. Therefore the value returned by get_pending_bytes must be
* primarily regarded as boolean, either 0 or non-zero, as far as migration
* completion is concerned. More advanced vfio-user clients can make
* assumptions on how migration is progressing on devices that guarantee
* that the amount of pending migration data decreases over time.
*/
uint64_t (*get_pending_bytes)(vfu_ctx_t *vfu_ctx);

/*
* Function that is called to instruct the device to prepare migration data
* to be read when in pre-copy or stop-and-copy state, and to prepare for
* receiving migration data when in resuming state.
*
* When in pre-copy and stop-and-copy state, the function must return only
* after migration data are available at the specified offset. This
* callback is called once per iteration. The amount of data available
* pointed to by @size can be different that the amount of data returned by
* get_pending_bytes in the beginning of the iteration.
*
* In VFIO, the data_offset and data_size registers can be read multiple
* times during an iteration and are invariant, libvfio-user simplifies
* this by caching the values and returning them when read, guaranteeing
* that prepare_data() is called only once per migration iteration.
*
* When in resuming state, @offset must be set to where migration data must
* written. @size points to NULL.
*
* The callback should return -1 on error, setting errno.
*/
int (*prepare_data)(vfu_ctx_t *vfu_ctx, uint64_t *offset, uint64_t *size);


/*
* Function that is called to read migration data. offset and size can be
* any subrange on the offset and size previously returned by prepare_data.
* The function must return the amount of data read or -1 on error, setting
* errno.
* Function that is called to read `count` bytes of migration data into
* `buf`. The function must return the amount of data read or -1 on error,
* setting errno. The function may return less data than requested.
*
* This function can be called even if the migration data can be memory
* mapped.
* If the function returns zero, this is interpreted to mean that there is
* no more migration data to read.
*/
ssize_t (*read_data)(vfu_ctx_t *vfu_ctx, void *buf,
uint64_t count, uint64_t offset);

/* Callbacks for restoring device state */
ssize_t (*read_data)(vfu_ctx_t *vfu_ctx, void *buf, uint64_t count);

/*
* Fuction that is called for writing previously stored device state. The
* Function that is called for writing previously stored device state. The
* function must return the amount of data written or -1 on error, setting
* errno.
*/
ssize_t (*write_data)(vfu_ctx_t *vfu_ctx, void *buf, uint64_t count,
uint64_t offset);

/*
* Function that is called when client has written some previously stored
* device state.
*
* The callback should return -1 on error, setting errno.
* errno. Partial writes are not supported, so any return value other than
* `count` is invalid.
*/
int (*data_written)(vfu_ctx_t *vfu_ctx, uint64_t count);
ssize_t (*write_data)(vfu_ctx_t *vfu_ctx, void *buf, uint64_t count);

} vfu_migration_callbacks_t;

/**
* The definition for VFIO_DEVICE_STATE_XXX differs with the version of vfio
* header file used. Some old systems wouldn't have these definitions. Some
* other newer systems would be using region based migration, and not
* have VFIO_DEVICE_STATE_V1_XXXX defined. The latest ones have
* VFIO_DEVICE_STATE_V1_XXXX defined. The following addresses all
* these scenarios.
*/
#if defined(VFIO_DEVICE_STATE_STOP)

_Static_assert(VFIO_DEVICE_STATE_STOP == 0,
"incompatible VFIO_DEVICE_STATE_STOP definition");

#define VFIO_DEVICE_STATE_V1_STOP VFIO_DEVICE_STATE_STOP
#define VFIO_DEVICE_STATE_V1_RUNNING VFIO_DEVICE_STATE_RUNNING
#define VFIO_DEVICE_STATE_V1_SAVING VFIO_DEVICE_STATE_SAVING
#define VFIO_DEVICE_STATE_V1_RESUMING VFIO_DEVICE_STATE_RESUMING

#elif !defined(VFIO_REGION_TYPE_MIGRATION_DEPRECATED) /* VFIO_DEVICE_STATE_STOP */

#define VFIO_DEVICE_STATE_V1_STOP (0)
#define VFIO_DEVICE_STATE_V1_RUNNING (1 << 0)
#define VFIO_DEVICE_STATE_V1_SAVING (1 << 1)
#define VFIO_DEVICE_STATE_V1_RESUMING (1 << 2)
#define VFIO_DEVICE_STATE_MASK ((1 << 3) - 1)

#endif /* VFIO_REGION_TYPE_MIGRATION_DEPRECATED */

/*
* The currently defined migration registers; if using migration callbacks,
* these are handled internally by the library.
*
* This is analogous to struct vfio_device_migration_info.
*/
struct vfio_user_migration_info {
/* VFIO_DEVICE_STATE_* */
uint32_t device_state;
uint32_t reserved;
uint64_t pending_bytes;
uint64_t data_offset;
uint64_t data_size;
};

/*
* Returns the size of the area needed to hold the migration registers at the
* beginning of the migration region; guaranteed to be page aligned.
*/
size_t
vfu_get_migr_register_area_size(void);

/**
* vfu_setup_device_migration provides an abstraction over the migration
* protocol: the user specifies a set of callbacks which are called in response
* to client accesses of the migration region; the migration region read/write
* callbacks are not called after this function call. Offsets in callbacks are
* relative to @data_offset.
*
* @vfu_ctx: the libvfio-user context
* @callbacks: migration callbacks
* @data_offset: offset in the migration region where data begins.
*
* @returns 0 on success, -1 on error, sets errno.
*/
int
vfu_setup_device_migration_callbacks(vfu_ctx_t *vfu_ctx,
const vfu_migration_callbacks_t *callbacks,
uint64_t data_offset);
const vfu_migration_callbacks_t *callbacks);

/**
* Triggers an interrupt.
Expand Down Expand Up @@ -906,7 +771,6 @@ enum {
VFU_PCI_DEV_ROM_REGION_IDX,
VFU_PCI_DEV_CFG_REGION_IDX,
VFU_PCI_DEV_VGA_REGION_IDX,
VFU_PCI_DEV_MIGR_REGION_IDX,
VFU_PCI_DEV_NUM_REGIONS,
};

Expand Down
105 changes: 87 additions & 18 deletions include/vfio-user.h
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,10 @@ enum vfio_user_command {
VFIO_USER_DMA_READ = 11,
VFIO_USER_DMA_WRITE = 12,
VFIO_USER_DEVICE_RESET = 13,
VFIO_USER_DIRTY_PAGES = 14,
VFIO_USER_REGION_WRITE_MULTI = 15,
VFIO_USER_DEVICE_FEATURE = 16,
VFIO_USER_MIG_DATA_READ = 17,
VFIO_USER_MIG_DATA_WRITE = 18,
VFIO_USER_MAX,
};

Expand Down Expand Up @@ -200,31 +203,97 @@ typedef struct vfio_user_region_io_fds_reply {
} sub_regions[];
} __attribute__((packed)) vfio_user_region_io_fds_reply_t;

/*
 * One (iova, length) range used by the DMA logging device features.
 *
 * Analogous to struct vfio_device_feature_dma_logging_range.
 */
struct vfio_user_device_feature_dma_logging_range {
    uint64_t iova;
    uint64_t length;
} __attribute__((packed));

/* Analogous to vfio_iommu_type1_dirty_bitmap. */
struct vfio_user_dirty_pages {
uint32_t argsz;
#ifndef VFIO_IOMMU_DIRTY_PAGES_FLAG_START
#define VFIO_IOMMU_DIRTY_PAGES_FLAG_START (1 << 0)
#define VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP (1 << 1)
#define VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP (1 << 2)
#endif
uint32_t flags;
/* Analogous to struct vfio_device_feature_dma_logging_control */
struct vfio_user_device_feature_dma_logging_control {
uint64_t page_size;
uint32_t num_ranges;
uint32_t reserved;
struct vfio_user_device_feature_dma_logging_range ranges[];
} __attribute__((packed));

/* Analogous to struct vfio_iommu_type1_dirty_bitmap_get. */
struct vfio_user_bitmap_range {
/* Analogous to struct vfio_device_feature_dma_logging_report */
struct vfio_user_device_feature_dma_logging_report {
uint64_t iova;
uint64_t size;
struct vfio_user_bitmap bitmap;
uint64_t length;
uint64_t page_size;
uint8_t bitmap[];
} __attribute__((packed));

#ifndef VFIO_DEVICE_FEATURE_DMA_LOGGING_START
#define VFIO_DEVICE_FEATURE_DMA_LOGGING_START 6
#define VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP 7
#define VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT 8
#endif

/*
 * Common header of a device feature message, followed by feature-specific
 * bytes in data[].
 *
 * The low 16 bits of `flags` carry the feature index
 * (VFIO_DEVICE_FEATURE_MASK); the GET/SET/PROBE bits select the operation.
 * The flag macros are only defined here when no earlier header has already
 * provided them.
 *
 * Analogous to struct vfio_device_feature.
 */
struct vfio_user_device_feature {
    uint32_t argsz;
    uint32_t flags;
#ifndef VFIO_DEVICE_FEATURE_MASK
#define VFIO_DEVICE_FEATURE_MASK    (0xffff)    /* 16-bit feature index */
#define VFIO_DEVICE_FEATURE_GET     (1 << 16)   /* Get feature into data[] */
#define VFIO_DEVICE_FEATURE_SET     (1 << 17)   /* Set feature from data[] */
#define VFIO_DEVICE_FEATURE_PROBE   (1 << 18)   /* Probe feature support */
#endif
    uint8_t data[];
} __attribute__((packed));

/*
 * Migration capability flags of a device (VFIO_MIGRATION_* bits in `flags`).
 *
 * Analogous to struct vfio_device_feature_migration.
 */
struct vfio_user_device_feature_migration {
    uint64_t flags;
#ifndef VFIO_REGION_TYPE_MIGRATION_DEPRECATED
#define VFIO_MIGRATION_STOP_COPY    (1 << 0)
#define VFIO_MIGRATION_P2P          (1 << 1)
#endif
/*
 * PRE_COPY needs its own guard: it appeared in a kernel version later than
 * the one that introduced VFIO_REGION_TYPE_MIGRATION_DEPRECATED, so the
 * guard above does not tell us whether it is already defined.
 */
#ifndef VFIO_MIGRATION_PRE_COPY
#define VFIO_MIGRATION_PRE_COPY     (1 << 2)
#endif
} __attribute__((packed));

#ifndef VFIO_REGION_TYPE_MIGRATION_DEPRECATED
#define VFIO_DEVICE_FEATURE_MIGRATION 1
#endif

/* The struct is a single 64-bit flags word; catch accidental layout changes. */
_Static_assert(sizeof(struct vfio_user_device_feature_migration) == 8,
               "bad vfio_user_device_feature_migration size");

#ifndef VFIO_REGION_TYPE_MIGRATION
/*
 * Device migration state, as carried by a device feature message.
 * Presumably the data[] payload for VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE and
 * device_state holds an enum vfio_user_device_mig_state value -- confirm
 * against the protocol specification.
 *
 * Analogous to struct vfio_device_feature_mig_state.
 */
struct vfio_user_device_feature_mig_state {
    uint32_t device_state;
    uint32_t data_fd;
} __attribute__((packed));

#ifndef VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE
#define VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE 2
#endif

/*
 * Check the size of *this* struct (two 32-bit fields). Asserting on
 * vfio_user_device_feature_migration here would leave this layout unverified.
 */
_Static_assert(sizeof(struct vfio_user_device_feature_mig_state) == 8,
               "bad vfio_user_device_feature_mig_state size");

#define VFIO_REGION_TYPE_MIGRATION (3)
#define VFIO_REGION_SUBTYPE_MIGRATION (1)
/*
 * Device migration states. VFIO_USER_DEVICE_NUM_STATES is one past the
 * highest state value, i.e. the number of states.
 *
 * Analogous to enum vfio_device_mig_state.
 */
enum vfio_user_device_mig_state {
    VFIO_USER_DEVICE_STATE_ERROR        = 0,
    VFIO_USER_DEVICE_STATE_STOP         = 1,
    VFIO_USER_DEVICE_STATE_RUNNING      = 2,
    VFIO_USER_DEVICE_STATE_STOP_COPY    = 3,
    VFIO_USER_DEVICE_STATE_RESUMING     = 4,
    VFIO_USER_DEVICE_STATE_RUNNING_P2P  = 5,
    VFIO_USER_DEVICE_STATE_PRE_COPY     = 6,
    VFIO_USER_DEVICE_STATE_PRE_COPY_P2P = 7,
    VFIO_USER_DEVICE_NUM_STATES         = 8,
};

#endif /* VFIO_REGION_TYPE_MIGRATION */
/*
 * Fixed header followed by a variable-length run of migration data bytes.
 *
 * NOTE(review): presumably the message body for VFIO_USER_MIG_DATA_READ and
 * VFIO_USER_MIG_DATA_WRITE, with `size` giving the number of bytes in data[]
 * -- confirm against the protocol specification.
 */
struct vfio_user_mig_data {
    uint32_t argsz;
    uint32_t size;
    uint8_t data[];
} __attribute__((packed));

#ifdef __cplusplus
}
Expand Down
41 changes: 38 additions & 3 deletions lib/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
#include <limits.h>
#include <stdint.h>
#include <unistd.h>
#include <sys/uio.h>

#define UNUSED __attribute__((unused))
#define EXPORT __attribute__((visibility("default")))
Expand All @@ -62,6 +63,20 @@

typedef unsigned long long ull_t;

/*
 * Stores `err` in errno and returns -1, so an integer-returning function can
 * report failure with a single `return ERROR_INT(...)`.
 */
static inline int
ERROR_INT(int err)
{
    errno = err;

    return -1;
}

/*
 * Stores `err` in errno and returns NULL, so a pointer-returning function can
 * report failure with a single `return ERROR_PTR(...)`.
 */
static inline void *
ERROR_PTR(int err)
{
    errno = err;

    return NULL;
}

/* Saturating uint64_t addition. */
static inline uint64_t
satadd_u64(uint64_t a, uint64_t b)
Expand All @@ -73,11 +88,21 @@ satadd_u64(uint64_t a, uint64_t b)
/*
* The size, in bytes, of the bitmap that represents the given range with the
* given page size.
*
* Returns -1 and sets errno if the given page size is invalid for the given
* range.
*/
static inline size_t
_get_bitmap_size(size_t size, size_t pgsize)
static inline ssize_t
get_bitmap_size(size_t region_size, size_t pgsize)
{
size_t nr_pages = (size / pgsize) + (size % pgsize != 0);
if (pgsize == 0) {
return ERROR_INT(EINVAL);
}
if (region_size < pgsize) {
return ERROR_INT(EINVAL);
}

size_t nr_pages = (region_size / pgsize) + (region_size % pgsize != 0);
return ROUND_UP(nr_pages, sizeof(uint64_t) * CHAR_BIT) / CHAR_BIT;
}

Expand Down Expand Up @@ -107,6 +132,16 @@ close_safely(int *fd)
errno = saved_errno;
}

/*
 * Releases the buffer owned by `iov` and resets it to an empty vector
 * (iov_base == NULL, iov_len == 0).
 *
 * Safe to call on an iovec whose iov_base is already NULL, and therefore safe
 * to call more than once on the same iovec. `iov` itself must not be NULL.
 */
static inline void
iov_free(struct iovec *iov)
{
    /* free(NULL) is a no-op, so no NULL check is needed before the call. */
    free(iov->iov_base);
    iov->iov_base = NULL;
    iov->iov_len = 0;
}

#ifdef UNIT_TEST

#define MOCK_DEFINE(f) \
Expand Down
Loading

0 comments on commit 190f85b

Please sign in to comment.