Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

adapt to VFIO live migration v2 #782

Merged
merged 2 commits into from
Sep 15, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
836 changes: 463 additions & 373 deletions docs/vfio-user.rst

Large diffs are not rendered by default.

162 changes: 13 additions & 149 deletions include/libvfio-user.h
Original file line number Diff line number Diff line change
Expand Up @@ -583,21 +583,8 @@ typedef enum {
VFU_MIGR_STATE_RESUME
} vfu_migr_state_t;

#define VFU_MIGR_CALLBACKS_VERS 1
#define VFU_MIGR_CALLBACKS_VERS 2

/*
* Callbacks during the pre-copy and stop-and-copy phases.
*
* The client executes the following steps to copy migration data:
*
* 1. get_pending_bytes: device must return amount of migration data
* 2. prepare_data: device must prepare migration data
* 3. read_data: device must provide migration data
*
* The client repeats the above steps until there is no more migration data to
* return (the device must return 0 from get_pending_bytes to indicate that
* there are no more migration data to be consumed in this iteration).
*/
typedef struct {

/*
Expand All @@ -615,152 +602,30 @@ typedef struct {
* FIXME maybe we should create a single callback and pass the state?
*/
int (*transition)(vfu_ctx_t *vfu_ctx, vfu_migr_state_t state);

/* Callbacks for saving device state */

/*
* Function that is called to retrieve the amount of pending migration
* data. If migration data were previously made available (function
* prepare_data has been called) then calling this function signifies that
* they have been read (e.g. migration data can be discarded). If the
* function returns 0 then migration has finished and this function won't
* be called again.
*
* The amount of pending migration data returned by the device does not
* necessarily have to monotonically decrease over time and does not need
* to match the amount of migration data returned via the @size argument in
* prepare_data. It can completely fluctuate according to the needs of the
* device. These semantics are derived from the pending_bytes register in
* VFIO. Therefore the value returned by get_pending_bytes must be
* primarily regarded as boolean, either 0 or non-zero, as far as migration
* completion is concerned. More advanced vfio-user clients can make
* assumptions on how migration is progressing on devices that guarantee
* that the amount of pending migration data decreases over time.
*/
uint64_t (*get_pending_bytes)(vfu_ctx_t *vfu_ctx);

/*
* Function that is called to instruct the device to prepare migration data
* to be read when in pre-copy or stop-and-copy state, and to prepare for
* receiving migration data when in resuming state.
*
* When in pre-copy and stop-and-copy state, the function must return only
* after migration data are available at the specified offset. This
* callback is called once per iteration. The amount of data available
* pointed to by @size can be different that the amount of data returned by
* get_pending_bytes in the beginning of the iteration.
*
* In VFIO, the data_offset and data_size registers can be read multiple
* times during an iteration and are invariant, libvfio-user simplifies
* this by caching the values and returning them when read, guaranteeing
* that prepare_data() is called only once per migration iteration.
*
* When in resuming state, @offset must be set to where migration data must
* written. @size points to NULL.
*
* The callback should return -1 on error, setting errno.
*/
int (*prepare_data)(vfu_ctx_t *vfu_ctx, uint64_t *offset, uint64_t *size);


/*
* Function that is called to read migration data. offset and size can be
* any subrange on the offset and size previously returned by prepare_data.
* The function must return the amount of data read or -1 on error, setting
* errno.
* Function that is called to read `count` bytes of migration data into
* `buf`. The function must return the amount of data read or -1 on error,
* setting errno. The function may return less data than requested.
*
* This function can be called even if the migration data can be memory
* mapped.
* If the function returns zero, this is interpreted to mean that there is
* no more migration data to read.
*/
ssize_t (*read_data)(vfu_ctx_t *vfu_ctx, void *buf,
uint64_t count, uint64_t offset);

/* Callbacks for restoring device state */
ssize_t (*read_data)(vfu_ctx_t *vfu_ctx, void *buf, uint64_t count);

/*
* Fuction that is called for writing previously stored device state. The
* Function that is called for writing previously stored device state. The
* function must return the amount of data written or -1 on error, setting
* errno.
*/
ssize_t (*write_data)(vfu_ctx_t *vfu_ctx, void *buf, uint64_t count,
uint64_t offset);

/*
* Function that is called when client has written some previously stored
* device state.
*
* The callback should return -1 on error, setting errno.
* errno. Partial writes are not supported, so any return value other than
* `count` is invalid.
*/
int (*data_written)(vfu_ctx_t *vfu_ctx, uint64_t count);
ssize_t (*write_data)(vfu_ctx_t *vfu_ctx, void *buf, uint64_t count);

} vfu_migration_callbacks_t;

/**
* The definition for VFIO_DEVICE_STATE_XXX differs with the version of vfio
* header file used. Some old systems wouldn't have these definitions. Some
* other newer systems would be using region based migration, and not
* have VFIO_DEVICE_STATE_V1_XXXX defined. The latest ones have
* VFIO_DEVICE_STATE_V1_XXXX defined. The following addresses all
* these scenarios.
*/
#if defined(VFIO_DEVICE_STATE_STOP)

_Static_assert(VFIO_DEVICE_STATE_STOP == 0,
"incompatible VFIO_DEVICE_STATE_STOP definition");

#define VFIO_DEVICE_STATE_V1_STOP VFIO_DEVICE_STATE_STOP
#define VFIO_DEVICE_STATE_V1_RUNNING VFIO_DEVICE_STATE_RUNNING
#define VFIO_DEVICE_STATE_V1_SAVING VFIO_DEVICE_STATE_SAVING
#define VFIO_DEVICE_STATE_V1_RESUMING VFIO_DEVICE_STATE_RESUMING

#elif !defined(VFIO_REGION_TYPE_MIGRATION_DEPRECATED) /* VFIO_DEVICE_STATE_STOP */

#define VFIO_DEVICE_STATE_V1_STOP (0)
#define VFIO_DEVICE_STATE_V1_RUNNING (1 << 0)
#define VFIO_DEVICE_STATE_V1_SAVING (1 << 1)
#define VFIO_DEVICE_STATE_V1_RESUMING (1 << 2)
#define VFIO_DEVICE_STATE_MASK ((1 << 3) - 1)

#endif /* VFIO_REGION_TYPE_MIGRATION_DEPRECATED */

/*
* The currently defined migration registers; if using migration callbacks,
* these are handled internally by the library.
*
* This is analogous to struct vfio_device_migration_info.
*/
struct vfio_user_migration_info {
/* VFIO_DEVICE_STATE_* */
uint32_t device_state;
uint32_t reserved;
uint64_t pending_bytes;
uint64_t data_offset;
uint64_t data_size;
};

/*
* Returns the size of the area needed to hold the migration registers at the
* beginning of the migration region; guaranteed to be page aligned.
*/
size_t
vfu_get_migr_register_area_size(void);

/**
* vfu_setup_device_migration provides an abstraction over the migration
* protocol: the user specifies a set of callbacks which are called in response
* to client accesses of the migration region; the migration region read/write
* callbacks are not called after this function call. Offsets in callbacks are
* relative to @data_offset.
*
* @vfu_ctx: the libvfio-user context
* @callbacks: migration callbacks
* @data_offset: offset in the migration region where data begins.
*
* @returns 0 on success, -1 on error, sets errno.
*/
int
vfu_setup_device_migration_callbacks(vfu_ctx_t *vfu_ctx,
const vfu_migration_callbacks_t *callbacks,
uint64_t data_offset);
const vfu_migration_callbacks_t *callbacks);

/**
* Triggers an interrupt.
Expand Down Expand Up @@ -906,7 +771,6 @@ enum {
VFU_PCI_DEV_ROM_REGION_IDX,
VFU_PCI_DEV_CFG_REGION_IDX,
VFU_PCI_DEV_VGA_REGION_IDX,
VFU_PCI_DEV_MIGR_REGION_IDX,
VFU_PCI_DEV_NUM_REGIONS,
};

Expand Down
105 changes: 87 additions & 18 deletions include/vfio-user.h
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,10 @@ enum vfio_user_command {
VFIO_USER_DMA_READ = 11,
VFIO_USER_DMA_WRITE = 12,
VFIO_USER_DEVICE_RESET = 13,
VFIO_USER_DIRTY_PAGES = 14,
VFIO_USER_REGION_WRITE_MULTI = 15,
VFIO_USER_DEVICE_FEATURE = 16,
VFIO_USER_MIG_DATA_READ = 17,
VFIO_USER_MIG_DATA_WRITE = 18,
VFIO_USER_MAX,
};

Expand Down Expand Up @@ -200,31 +203,97 @@ typedef struct vfio_user_region_io_fds_reply {
} sub_regions[];
} __attribute__((packed)) vfio_user_region_io_fds_reply_t;

/*
 * A single range entry made of two packed 64-bit fields (16 bytes in total).
 * Analogous to struct vfio_device_feature_dma_logging_range.
 *
 * This is the element type of the ranges[] flexible array in
 * struct vfio_user_device_feature_dma_logging_control.
 */
struct vfio_user_device_feature_dma_logging_range {
/* NOTE(review): presumably the start address of the range — confirm. */
uint64_t iova;
/* NOTE(review): presumably the range length in bytes — confirm. */
uint64_t length;
} __attribute__((packed));

/* Analogous to vfio_iommu_type1_dirty_bitmap. */
struct vfio_user_dirty_pages {
uint32_t argsz;
#ifndef VFIO_IOMMU_DIRTY_PAGES_FLAG_START
#define VFIO_IOMMU_DIRTY_PAGES_FLAG_START (1 << 0)
#define VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP (1 << 1)
#define VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP (1 << 2)
#endif
uint32_t flags;
/* Analogous to struct vfio_device_feature_dma_logging_control */
struct vfio_user_device_feature_dma_logging_control {
uint64_t page_size;
uint32_t num_ranges;
uint32_t reserved;
struct vfio_user_device_feature_dma_logging_range ranges[];
} __attribute__((packed));

/* Analogous to struct vfio_iommu_type1_dirty_bitmap_get. */
struct vfio_user_bitmap_range {
/* Analogous to struct vfio_device_feature_dma_logging_report */
struct vfio_user_device_feature_dma_logging_report {
uint64_t iova;
uint64_t size;
struct vfio_user_bitmap bitmap;
uint64_t length;
uint64_t page_size;
uint8_t bitmap[];
} __attribute__((packed));

#ifndef VFIO_DEVICE_FEATURE_DMA_LOGGING_START
#define VFIO_DEVICE_FEATURE_DMA_LOGGING_START 6
#define VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP 7
#define VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT 8
#endif

/*
 * Fixed header of a device feature message, followed by a feature-specific
 * payload in data[]. Analogous to struct vfio_device_feature.
 *
 * NOTE(review): argsz presumably carries the overall size, as in the kernel
 * structure this mirrors — confirm against the protocol specification.
 */
struct vfio_user_device_feature {
uint32_t argsz;
/*
 * The low 16 bits (VFIO_DEVICE_FEATURE_MASK) hold the feature index; the
 * GET / SET / PROBE bits above them are separate flag bits.
 */
uint32_t flags;
/*
 * Fallback definitions: only emitted when VFIO_DEVICE_FEATURE_MASK has not
 * already been defined by a header included earlier.
 */
#ifndef VFIO_DEVICE_FEATURE_MASK
#define VFIO_DEVICE_FEATURE_MASK (0xffff) /* 16-bit feature index */
#define VFIO_DEVICE_FEATURE_GET (1 << 16) /* Get feature into data[] */
#define VFIO_DEVICE_FEATURE_SET (1 << 17) /* Set feature from data[] */
#define VFIO_DEVICE_FEATURE_PROBE (1 << 18) /* Probe feature support */
#endif
/* Feature payload; a flexible array member, so it adds nothing to sizeof. */
uint8_t data[];
} __attribute__((packed));

/*
 * Migration feature payload: a single 64-bit flags word.
 * Analogous to struct vfio_device_feature_migration.
 */
struct vfio_user_device_feature_migration {
/* NOTE(review): presumably a bitwise OR of the VFIO_MIGRATION_* bits below. */
uint64_t flags;
/*
 * Fallback definitions, skipped when VFIO_REGION_TYPE_MIGRATION_DEPRECATED
 * is defined. NOTE(review): this relies on headers that define that macro
 * also providing STOP_COPY and P2P — confirm.
 */
#ifndef VFIO_REGION_TYPE_MIGRATION_DEPRECATED
#define VFIO_MIGRATION_STOP_COPY (1 << 0)
#define VFIO_MIGRATION_P2P (1 << 1)
#endif
/*
 * PRE_COPY was added in a later kernel version, after
 * VFIO_REGION_TYPE_MIGRATION_DEPRECATED had been introduced.
 */
#ifndef VFIO_MIGRATION_PRE_COPY
#define VFIO_MIGRATION_PRE_COPY (1 << 2)
#endif
} __attribute__((packed));
/* Feature index fallback, guarded by the same macro as the flag bits above. */
#ifndef VFIO_REGION_TYPE_MIGRATION_DEPRECATED
#define VFIO_DEVICE_FEATURE_MIGRATION 1
#endif
/* The packed struct holds exactly one uint64_t, so it must be 8 bytes. */
_Static_assert(sizeof(struct vfio_user_device_feature_migration) == 8,
"bad vfio_user_device_feature_migration size");

#ifndef VFIO_REGION_TYPE_MIGRATION
/*
 * Two packed 32-bit fields describing a device migration state.
 * Analogous to struct vfio_device_feature_mig_state.
 *
 * NOTE(review): device_state presumably takes values from
 * enum vfio_user_device_mig_state — confirm against the protocol
 * specification.
 */
struct vfio_user_device_feature_mig_state {
    uint32_t device_state;
    uint32_t data_fd;
} __attribute__((packed));
/* Feature index fallback, used only if no earlier header defines it. */
#ifndef VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE
#define VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE 2
#endif
/*
 * Verify the layout of *this* structure. The assertion previously measured
 * struct vfio_user_device_feature_migration, which left the mig_state layout
 * unchecked even though the message claimed otherwise.
 */
_Static_assert(sizeof(struct vfio_user_device_feature_mig_state) == 8,
               "bad vfio_user_device_feature_mig_state size");

#define VFIO_REGION_TYPE_MIGRATION (3)
#define VFIO_REGION_SUBTYPE_MIGRATION (1)
/*
 * Device migration states, with every value assigned explicitly.
 * Analogous to enum vfio_device_mig_state.
 */
enum vfio_user_device_mig_state {
VFIO_USER_DEVICE_STATE_ERROR = 0,
VFIO_USER_DEVICE_STATE_STOP = 1,
VFIO_USER_DEVICE_STATE_RUNNING = 2,
VFIO_USER_DEVICE_STATE_STOP_COPY = 3,
VFIO_USER_DEVICE_STATE_RESUMING = 4,
VFIO_USER_DEVICE_STATE_RUNNING_P2P = 5,
VFIO_USER_DEVICE_STATE_PRE_COPY = 6,
VFIO_USER_DEVICE_STATE_PRE_COPY_P2P = 7,
/* Number of states listed above (values 0 through 7); not a state itself. */
VFIO_USER_DEVICE_NUM_STATES = 8,
};

#endif /* VFIO_REGION_TYPE_MIGRATION */
/*
 * Packed 8-byte header followed by a variable-length byte payload in data[].
 *
 * NOTE(review): presumably the message body for VFIO_USER_MIG_DATA_READ and
 * VFIO_USER_MIG_DATA_WRITE, with `size` giving the number of bytes carried
 * in data[] — confirm against the protocol specification.
 */
struct vfio_user_mig_data {
uint32_t argsz;
uint32_t size;
/* Flexible array member; contributes nothing to sizeof the struct. */
uint8_t data[];
} __attribute__((packed));

#ifdef __cplusplus
}
Expand Down
41 changes: 38 additions & 3 deletions lib/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
#include <limits.h>
#include <stdint.h>
#include <unistd.h>
#include <sys/uio.h>

#define UNUSED __attribute__((unused))
#define EXPORT __attribute__((visibility("default")))
Expand All @@ -62,6 +63,20 @@

typedef unsigned long long ull_t;

/*
 * Stores `err` in errno and evaluates to -1, letting an integer-returning
 * function fail in one statement: `return ERROR_INT(EINVAL);`.
 */
static inline int
ERROR_INT(int err)
{
    errno = err;

    return -1;
}

/*
 * Pointer-returning counterpart of ERROR_INT: stores `err` in errno and
 * evaluates to NULL.
 */
static inline void *
ERROR_PTR(int err)
{
    errno = err;

    return NULL;
}

/* Saturating uint64_t addition. */
static inline uint64_t
satadd_u64(uint64_t a, uint64_t b)
Expand All @@ -73,11 +88,21 @@ satadd_u64(uint64_t a, uint64_t b)
/*
 * The size, in bytes, of the bitmap that represents the given range with the
 * given page size.
 *
 * Returns -1 and sets errno if the given page size is invalid for the given
 * range.
 *
 * NOTE(review): this span carries two signatures (`_get_bitmap_size`
 * returning size_t and `get_bitmap_size` returning ssize_t) and two
 * `nr_pages` declarations. Only the ssize_t form matches the -1/errno
 * contract described above — confirm which lines are meant to remain.
 */
static inline size_t
_get_bitmap_size(size_t size, size_t pgsize)
static inline ssize_t
get_bitmap_size(size_t region_size, size_t pgsize)
{
size_t nr_pages = (size / pgsize) + (size % pgsize != 0);
/* A zero page size would be a zero divisor below; fail with EINVAL. */
if (pgsize == 0) {
return ERROR_INT(EINVAL);
}
/* A range smaller than one page is also rejected with EINVAL. */
if (region_size < pgsize) {
return ERROR_INT(EINVAL);
}

/* Number of pages, counting a trailing partial page as a whole one. */
size_t nr_pages = (region_size / pgsize) + (region_size % pgsize != 0);
/*
 * One bit per page, padded to a whole number of 64-bit words and then
 * converted from bits to bytes. NOTE(review): assumes ROUND_UP(x, a) rounds
 * x up to a multiple of a; the macro is defined outside this view.
 */
return ROUND_UP(nr_pages, sizeof(uint64_t) * CHAR_BIT) / CHAR_BIT;
}

Expand Down Expand Up @@ -107,6 +132,16 @@ close_safely(int *fd)
errno = saved_errno;
}

/*
 * Frees the buffer held by `iov` and resets the iovec to the empty state
 * (iov_base == NULL, iov_len == 0).
 *
 * `iov` must be non-NULL, since it is dereferenced unconditionally, and
 * iov_base must be NULL or a pointer that may be passed to free(). Because
 * iov_base is cleared afterwards, calling this again on the same iovec is
 * harmless.
 */
static inline void
iov_free(struct iovec *iov)
{
    /* free(NULL) is defined to do nothing, so no NULL guard is required. */
    free(iov->iov_base);
    iov->iov_base = NULL;
    iov->iov_len = 0;
}

#ifdef UNIT_TEST

#define MOCK_DEFINE(f) \
Expand Down
Loading