diff --git a/docs/vfio-user.rst b/docs/vfio-user.rst index 3c26da53..b83b359d 100644 --- a/docs/vfio-user.rst +++ b/docs/vfio-user.rst @@ -1,11 +1,10 @@ .. include:: - ******************************** vfio-user Protocol Specification ******************************** -------------- -Version_ 0.9.1 +Version_ 0.9.2 -------------- .. contents:: Table of Contents @@ -342,9 +341,9 @@ usual ``msg_size`` field in the header, not the ``argsz`` field. In a reply, the server sets ``argsz`` field to the size needed for a full payload size. This may be less than the requested maximum size. This may be -larger than the requested maximum size: in that case, the payload reply header -is returned, but the ``argsz`` field in the reply indicates the needed size, -allowing a client to allocate a larger buffer for holding the reply before +larger than the requested maximum size: in that case, the full payload is not +included in the reply, but the ``argsz`` field in the reply indicates the needed +size, allowing a client to allocate a larger buffer for holding the reply before trying again. In addition, during negotiation (see `Version`_), the client and server may @@ -357,8 +356,9 @@ Protocol Specification To distinguish from the base VFIO symbols, all vfio-user symbols are prefixed with ``vfio_user`` or ``VFIO_USER``. In this revision, all data is in the -little-endian format, although this may be relaxed in future revisions in cases -where the client and server are both big-endian. +endianness of the host system, although this may be relaxed in future +revisions in cases where the client and server run on different hosts +with different endianness. Unless otherwise specified, all sizes should be presumed to be in bytes. @@ -385,7 +385,10 @@ Name Command Request Direction ``VFIO_USER_DMA_READ`` 11 server -> client ``VFIO_USER_DMA_WRITE`` 12 server -> client ``VFIO_USER_DEVICE_RESET`` 13 client -> server -``VFIO_USER_DIRTY_PAGES`` 14 client -> server +``VFIO_USER_REGION_WRITE_MULTI`` 15 client -> server +``VFIO_USER_DEVICE_FEATURE`` 16 client -> server +``VFIO_USER_MIG_DATA_READ`` 17 client -> server +``VFIO_USER_MIG_DATA_WRITE`` 18 client -> server ====================================== ========= ================= Header @@ -508,34 +511,33 @@ format: Capabilities: -+--------------------+--------+------------------------------------------------+ -| Name | Type | Description | -+====================+========+================================================+ -| max_msg_fds | number | Maximum number of file descriptors that can be | -| | | received by the sender in one message. | -| | | Optional. If not specified then the receiver | -| | | must assume a value of ``1``. | -+--------------------+--------+------------------------------------------------+ -| max_data_xfer_size | number | Maximum ``count`` for data transfer messages; | -| | | see `Read and Write Operations`_. Optional, | -| | | with a default value of 1048576 bytes. | -+--------------------+--------+------------------------------------------------+ -| migration | object | Migration capability parameters. If missing | -| | | then migration is not supported by the sender. | -+--------------------+--------+------------------------------------------------+ -| twin_socket | object | Parameters for twin-socket mode, which handles | -| | | server-to-client commands and their replies on | -| | | a separate socket. Optional. 
|
-+--------------------+--------+------------------------------------------------+
-
-The migration capability contains the following name/value pairs:
-
-+--------+--------+-----------------------------------------------+
-| Name   | Type   | Description                                   |
-+========+========+===============================================+
-| pgsize | number | Page size of dirty pages bitmap. The smallest |
-|        |        | between the client and the server is used.    |
-+--------+--------+-----------------------------------------------+
++--------------------+---------+-----------------------------------------------+
+| Name               | Type    | Description                                   |
++====================+=========+===============================================+
+| max_msg_fds        | number  | Maximum number of file descriptors that can   |
+|                    |         | be received by the sender in one message.     |
+|                    |         | Optional. If not specified then the receiver  |
+|                    |         | must assume a value of ``1``.                 |
++--------------------+---------+-----------------------------------------------+
+| max_data_xfer_size | number  | Maximum ``count`` for data transfer messages; |
+|                    |         | see `Read and Write Operations`_. Optional,   |
+|                    |         | with a default value of 1048576 bytes.        |
++--------------------+---------+-----------------------------------------------+
+| max_dma_maps       | number  | Maximum number of DMA map windows that can be |
+|                    |         | valid simultaneously. Optional, with a        |
+|                    |         | default value of 65535 (64k-1).               |
++--------------------+---------+-----------------------------------------------+
+| pgsizes            | number  | Page sizes supported in DMA map operations,   |
+|                    |         | OR'ed together. Optional, with a default      |
+|                    |         | value of supporting only 4k pages.            |
++--------------------+---------+-----------------------------------------------+
+| twin_socket        | object  | Parameters for twin-socket mode, which        |
+|                    |         | handles server-to-client commands and their   |
+|                    |         | replies on a separate socket. Optional.       |
++--------------------+---------+-----------------------------------------------+
+| write_multiple     | boolean | ``VFIO_USER_REGION_WRITE_MULTI`` messages     |
+|                    |         | are supported if the value is ``true``.       |
++--------------------+---------+-----------------------------------------------+

The ``twin_socket`` capability object holds these name/value pairs:

@@ -678,56 +680,18 @@ The request payload for this message is a structure of the following format:

+--------------+--------+------------------------+
| flags        | 4      | 4                      |
+--------------+--------+------------------------+
-|              |        | +-----+-----------------------+ |
-|              |        | | Bit | Definition            | |
-|              |        | +=====+=======================+ |
-|              |        | | 0   | get dirty page bitmap | |
-|              |        | +-----+-----------------------+ |
-|              |        | | 1   | unmap all regions     | |
-|              |        | +-----+-----------------------+ |
-+--------------+--------+------------------------+
| address      | 8      | 8                      |
+--------------+--------+------------------------+
| size         | 16     | 8                      |
+--------------+--------+------------------------+

* *argsz* is the maximum size of the reply payload.
-* *flags* contains the following DMA region attributes:
-
-  * *get dirty page bitmap* indicates that a dirty page bitmap must be
-    populated before unmapping the DMA region. The client must provide a
-    `VFIO Bitmap`_ structure, explained below, immediately following this
-    entry.
-  * *unmap all regions* indicates to unmap all the regions previously
-    mapped via `VFIO_USER_DMA_MAP`. This flag cannot be combined with
-    *get dirty page bitmap* and expects *address* and *size* to be 0.
-
+* *flags* is unused in this version.
* *address* is the base DMA address of the DMA region. * *size* is the size of the DMA region. The address and size of the DMA region being unmapped must match exactly a -previous mapping. The size of request message depends on whether or not the -*get dirty page bitmap* bit is set in Flags: - -* If not set, the size of the total request message is: 16 + 24. - -* If set, the size of the total request message is: 16 + 24 + 16. - -.. _VFIO Bitmap: - -VFIO Bitmap Format -"""""""""""""""""" - -+--------+--------+------+ -| Name | Offset | Size | -+========+========+======+ -| pgsize | 0 | 8 | -+--------+--------+------+ -| size | 8 | 8 | -+--------+--------+------+ - -* *pgsize* is the page size for the bitmap, in bytes. -* *size* is the size for the bitmap, in bytes, excluding the VFIO bitmap header. +previous mapping. Reply ^^^^^ @@ -736,14 +700,8 @@ Upon receiving a ``VFIO_USER_DMA_UNMAP`` command, if the file descriptor is mapped then the server must release all references to that DMA region before replying, which potentially includes in-flight DMA transactions. -The server responds with the original DMA entry in the request. If the -*get dirty page bitmap* bit is set in flags in the request, then -the server also includes the `VFIO Bitmap`_ structure sent in the request, -followed by the corresponding dirty page bitmap, where each bit represents -one page of size *pgsize* in `VFIO Bitmap`_ . +The server responds with the original DMA entry in the request. -The total size of the total reply message is: -16 + 24 + (16 + *size* in `VFIO Bitmap`_ if *get dirty page bitmap* is set). ``VFIO_USER_DEVICE_GET_INFO`` ----------------------------- @@ -959,7 +917,7 @@ VFIO region info cap sparse mmap +----------+--------+------+ | offset | 8 | 8 | +----------+--------+------+ -| size | 16 | 9 | +| size | 16 | 8 | +----------+--------+------+ | ... | | | +----------+--------+------+ @@ -973,39 +931,6 @@ VFIO region info cap sparse mmap The VFIO sparse mmap area is defined in ```` (``struct vfio_region_info_cap_sparse_mmap``). -VFIO region type cap header -""""""""""""""""""""""""""" - -+------------------+---------------------------+ -| Name | Value | -+==================+===========================+ -| id | VFIO_REGION_INFO_CAP_TYPE | -+------------------+---------------------------+ -| version | 0x1 | -+------------------+---------------------------+ -| next | | -+------------------+---------------------------+ -| region info type | VFIO region info type | -+------------------+---------------------------+ - -This capability is defined when a region is specific to the device. - -VFIO region info type cap -""""""""""""""""""""""""" - -The VFIO region info type is defined in ```` -(``struct vfio_region_info_cap_type``). - -+---------+--------+------+ -| Name | Offset | Size | -+=========+========+======+ -| type | 0 | 4 | -+---------+--------+------+ -| subtype | 4 | 4 | -+---------+--------+------+ - -The only device-specific region type and subtype supported by vfio-user is -``VFIO_REGION_TYPE_MIGRATION`` (3) and ``VFIO_REGION_SUBTYPE_MIGRATION`` (1). ``VFIO_USER_DEVICE_GET_REGION_IO_FDS`` -------------------------------------- @@ -1071,7 +996,7 @@ Reply * *argsz* is the size of the region IO FD info structure plus the total size of the sub-region array. Thus, each array entry "i" is at offset - i * ((argsz - 16) / count). Note that currently this is 40 bytes for both IO + i * ((argsz - 32) / count). Note that currently this is 40 bytes for both IO FD types, but this is not to be relied on. 
As elsewhere, this indicates the full reply payload size needed.

* *flags* must be zero

@@ -1087,8 +1012,8 @@ Note that it is the client's responsibility to verify the requested values
(for example, that the requested offset does not exceed the region's bounds).

Each sub-region given in the response has one of two possible structures,
-depending whether *type* is ``VFIO_USER_IO_FD_TYPE_IOEVENTFD`` (0) or
-``VFIO_USER_IO_FD_TYPE_IOREGIONFD`` (1):
+depending on whether *type* is ``VFIO_USER_IO_FD_TYPE_IOEVENTFD`` or
+``VFIO_USER_IO_FD_TYPE_IOREGIONFD``:

Sub-Region IO FD info format (ioeventfd)
""""""""""""""""""""""""""""""""""""""""

@@ -1552,290 +1477,455 @@ Reply

This command message is sent from the client to the server to reset the
device. Neither the request or reply have a payload.

-``VFIO_USER_DIRTY_PAGES``
--------------------------
+``VFIO_USER_REGION_WRITE_MULTI``
+--------------------------------
+
+This message can be used to coalesce multiple device write operations
+into a single message. It is only used as an optimization when the
+outgoing message queue is relatively full.
+
+Request
+^^^^^^^
+
++---------+--------+----------+
+| Name    | Offset | Size     |
++=========+========+==========+
+| wr_cnt  | 0      | 8        |
++---------+--------+----------+
+| wrs     | 8      | variable |
++---------+--------+----------+
-This command is analogous to ``VFIO_IOMMU_DIRTY_PAGES``. It is sent by the client
-to the server in order to control logging of dirty pages, usually during a live
-migration.
+* *wr_cnt* is the number of device writes coalesced in the message.
+* *wrs* is an array of device writes, defined below.
-Dirty page tracking is optional for server implementation; clients should not
-rely on it.
+Single Device Write Format
+""""""""""""""""""""""""""
+
++--------+--------+----------+
+| Name   | Offset | Size     |
++========+========+==========+
+| offset | 0      | 8        |
++--------+--------+----------+
+| region | 8      | 4        |
++--------+--------+----------+
+| count  | 12     | 4        |
++--------+--------+----------+
+| data   | 16     | 8        |
++--------+--------+----------+
+
+* *offset* into the region being accessed.
+* *region* is the index of the region being accessed.
+* *count* is the size of the data to be transferred. This format can
+  only describe writes of 8 bytes or less.
+* *data* is the data to write.
+
+Reply
+^^^^^
+
++---------+--------+----------+
+| Name    | Offset | Size     |
++=========+========+==========+
+| wr_cnt  | 0      | 8        |
++---------+--------+----------+
+
+* *wr_cnt* is the number of device writes completed.
+
+``VFIO_USER_DEVICE_FEATURE``
+----------------------------
+
+This command is analogous to ``VFIO_DEVICE_FEATURE``. It is used to get, set, or
+probe feature data of the device.
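+
+As a non-normative illustration (using the ``vfio_user_device_feature``
+definitions that this patch adds to ``include/vfio-user.h``), a client might
+encode a ``GET`` of the migration feature as in the following sketch (the
+payload layout is specified in the Request section below)::
+
+    struct vfio_user_device_feature req = {
+        /* maximum reply size: the header plus the 8-byte feature data */
+        .argsz = sizeof(req) +
+                 sizeof(struct vfio_user_device_feature_migration),
+        /* feature index in bits 0-15, action in bits 16-18 */
+        .flags = VFIO_DEVICE_FEATURE_MIGRATION | VFIO_DEVICE_FEATURE_GET,
+    };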
Request ^^^^^^^ -+-------+--------+-----------------------------------------+ -| Name | Offset | Size | -+=======+========+=========================================+ -| argsz | 0 | 4 | -+-------+--------+-----------------------------------------+ -| flags | 4 | 4 | -+-------+--------+-----------------------------------------+ -| | +-----+----------------------------------------+ | -| | | Bit | Definition | | -| | +=====+========================================+ | -| | | 0 | VFIO_IOMMU_DIRTY_PAGES_FLAG_START | | -| | +-----+----------------------------------------+ | -| | | 1 | VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP | | -| | +-----+----------------------------------------+ | -| | | 2 | VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP | | -| | +-----+----------------------------------------+ | -+-------+--------+-----------------------------------------+ - -* *argsz* is the size of the VFIO dirty bitmap info structure for - ``START/STOP``; and for ``GET_BITMAP``, the maximum size of the reply payload - -* *flags* defines the action to be performed by the server: - - * ``VFIO_IOMMU_DIRTY_PAGES_FLAG_START`` instructs the server to start logging - pages it dirties. Logging continues until explicitly disabled by - ``VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP``. - - * ``VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP`` instructs the server to stop logging - dirty pages. - - * ``VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP`` requests the server to return - the dirty bitmap for a specific IOVA range. The IOVA range is specified by - a "VFIO Bitmap Range" structure, which must immediately follow this - "VFIO Dirty Pages" structure. See `VFIO Bitmap Range Format`_. - This operation is only valid if logging of dirty pages has been previously - started. - - These flags are mutually exclusive with each other. - -This part of the request is analogous to VFIO's ``struct -vfio_iommu_type1_dirty_bitmap``. - -.. _VFIO Bitmap Range Format: - -VFIO Bitmap Range Format +The request payload for this message is a structure of the following format. + ++-------+--------+--------------------------------+ +| Name | Offset | Size | ++=======+========+================================+ +| argsz | 0 | 4 | ++-------+--------+--------------------------------+ +| flags | 4 | 4 | ++-------+--------+--------------------------------+ +| | +---------+---------------------------+ | +| | | Bit | Definition | | +| | +=========+===========================+ | +| | | 0 to 15 | Feature index | | +| | +---------+---------------------------+ | +| | | 16 | VFIO_DEVICE_FEATURE_GET | | +| | +---------+---------------------------+ | +| | | 17 | VFIO_DEVICE_FEATURE_SET | | +| | +---------+---------------------------+ | +| | | 18 | VFIO_DEVICE_FEATURE_PROBE | | +| | +---------+---------------------------+ | ++-------+--------+--------------------------------+ +| data | 8 | variable | ++-------+--------+--------------------------------+ + +* *argsz* is the maximum size of the reply payload. + +* *flags* defines the action to be performed by the server and upon which + feature: + + * The feature index consists of the least significant 16 bits of the flags + field, and can be accessed using the ``VFIO_DEVICE_FEATURE_MASK`` bit mask. + + * ``VFIO_DEVICE_FEATURE_GET`` instructs the server to get the data for the + given feature. + + * ``VFIO_DEVICE_FEATURE_SET`` instructs the server to set the feature data to + that given in the ``data`` field of the payload. + + * ``VFIO_DEVICE_FEATURE_PROBE`` instructs the server to probe for feature + support. 
If ``VFIO_DEVICE_FEATURE_GET`` and/or ``VFIO_DEVICE_FEATURE_SET`` + are also set, the probe will only return success if all of the indicated + methods are supported. + + ``VFIO_DEVICE_FEATURE_GET`` and ``VFIO_DEVICE_FEATURE_SET`` are mutually + exclusive, except for use with ``VFIO_DEVICE_FEATURE_PROBE``. + +* *data* is specific to the particular feature. It is not used for probing. + +This part of the request is analogous to VFIO's ``struct vfio_device_feature``. + +Reply +^^^^^ + +The reply payload must be the same as the request payload for setting or +probing a feature. For getting a feature's data, the data is added in the data +section and its length is added to ``argsz``. + +Device Features +^^^^^^^^^^^^^^^ + +The only device features supported by vfio-user are those related to migration, +although this may change in the future. They are a subset of those supported in +the VFIO implementation of the Linux kernel. + ++----------------------------------------+---------------+ +| Name | Feature Index | ++========================================+===============+ +| VFIO_DEVICE_FEATURE_MIGRATION | 1 | ++----------------------------------------+---------------+ +| VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE | 2 | ++----------------------------------------+---------------+ +| VFIO_DEVICE_FEATURE_DMA_LOGGING_START | 6 | ++----------------------------------------+---------------+ +| VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP | 7 | ++----------------------------------------+---------------+ +| VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT | 8 | ++----------------------------------------+---------------+ + +``VFIO_DEVICE_FEATURE_MIGRATION`` +""""""""""""""""""""""""""""""""" + +This feature indicates that the device can support the migration API through +``VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE``. If ``GET`` succeeds, the ``RUNNING`` +and ``ERROR`` states are always supported. Support for additional states is +indicated via the flags field; at least ``VFIO_MIGRATION_STOP_COPY`` must be +set. + +There is no data field of the request message. + +The data field of the reply message is structured as follows: + ++-------+--------+---------------------------+ +| Name | Offset | Size | ++=======+========+===========================+ +| flags | 0 | 8 | ++-------+--------+---------------------------+ +| | +-----+--------------------------+ | +| | | Bit | Definition | | +| | +=====+==========================+ | +| | | 0 | VFIO_MIGRATION_STOP_COPY | | +| | +-----+--------------------------+ | +| | | 1 | VFIO_MIGRATION_P2P | | +| | +-----+--------------------------+ | +| | | 2 | VFIO_MIGRATION_PRE_COPY | | +| | +-----+--------------------------+ | ++-------+--------+---------------------------+ + +These flags are interpreted in the same way as VFIO. + +``VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE`` +"""""""""""""""""""""""""""""""""""""""" + +Upon ``VFIO_DEVICE_FEATURE_SET``, execute a migration state change on the VFIO +device. The new state is supplied in ``device_state``. The state transition must +fully complete before the reply is sent. + +The data field of the reply message, as well as the ``SET`` request message, is +structured as follows: + ++--------------+--------+------+ +| Name | Offset | Size | ++==============+========+======+ +| device_state | 0 | 4 | ++--------------+--------+------+ +| data_fd | 4 | 4 | ++--------------+--------+------+ + +* *device_state* is the current state of the device (for ``GET``) or the + state to transition to (for ``SET``). 
It is defined by the + ``vfio_device_mig_state`` enum as detailed below. These states are the states + of the device migration Finite State Machine. + ++--------------------------------+-------+---------------------------------------------------------------------+ +| Name | State | Description | ++================================+=======+=====================================================================+ +| VFIO_DEVICE_STATE_ERROR | 0 | The device has failed and must be reset. | ++--------------------------------+-------+---------------------------------------------------------------------+ +| VFIO_DEVICE_STATE_STOP | 1 | The device does not change the internal or external state. | ++--------------------------------+-------+---------------------------------------------------------------------+ +| VFIO_DEVICE_STATE_RUNNING | 2 | The device is running normally. | ++--------------------------------+-------+---------------------------------------------------------------------+ +| VFIO_DEVICE_STATE_STOP_COPY | 3 | The device internal state can be read out. | ++--------------------------------+-------+---------------------------------------------------------------------+ +| VFIO_DEVICE_STATE_RESUMING | 4 | The device is stopped and is loading a new internal state. | ++--------------------------------+-------+---------------------------------------------------------------------+ +| VFIO_DEVICE_STATE_RUNNING_P2P | 5 | (not used in vfio-user) | ++--------------------------------+-------+---------------------------------------------------------------------+ +| VFIO_DEVICE_STATE_PRE_COPY | 6 | The device is running normally but tracking internal state changes. | ++--------------------------------+-------+---------------------------------------------------------------------+ +| VFIO_DEVICE_STATE_PRE_COPY_P2P | 7 | (not used in vfio-user) | ++--------------------------------+-------+---------------------------------------------------------------------+ + +* *data_fd* is unused in vfio-user, as the ``VFIO_USER_MIG_DATA_READ`` and + ``VFIO_USER_MIG_DATA_WRITE`` messages are used instead for migration data + transport. + +Direct State Transitions """""""""""""""""""""""" +The device migration FSM is a Mealy machine, so actions are taken upon the arcs +between FSM states. The following transitions need to be supported by the +server, a subset of those defined in ```` +(``enum vfio_device_mig_state``). + +* ``RUNNING -> STOP``, ``STOP_COPY -> STOP``: Stop the operation of the device. + The ``STOP_COPY`` arc terminates the data transfer session. + +* ``RESUMING -> STOP``: Terminate the data transfer session. Complete processing + of the migration data. Stop the operation of the device. If the delivered data + is found to be incomplete, inconsistent, or otherwise invalid, fail the + ``SET`` command and optionally transition to the ``ERROR`` state. + +* ``PRE_COPY -> RUNNING``: Terminate the data transfer session. The device is + now fully operational. + +* ``STOP -> RUNNING``: Start the operation of the device. + +* ``RUNNING -> PRE_COPY``, ``STOP -> STOP_COPY``: Begin the process of saving + the device state. The device operation is unchanged, but data transfer begins. + ``PRE_COPY`` and ``STOP_COPY`` are referred to as the "saving group" of + states. + +* ``PRE_COPY -> STOP_COPY``: Continue to transfer migration data, but stop + device operation. + +* ``STOP -> RESUMING``: Start the process of restoring the device state. 
The + internal device state may be changed to prepare the device to receive the + migration data. + +The ``STOP_COPY -> PRE_COPY`` transition is explicitly not allowed and should +return an error if requested. + +``ERROR`` cannot be specified as a device state, but any transition request can +be failed and then move the state into ``ERROR`` if the server was unable to +execute the requested arc AND was unable to restore the device into any valid +state. To recover from ``ERROR``, ``VFIO_USER_DEVICE_RESET`` must be used to +return back to ``RUNNING``. + +If ``PRE_COPY`` is not supported, arcs touching it are removed. + +Complex State Transitions +""""""""""""""""""""""""" + +The remaining possible transitions are to be implemented as combinations of the +above FSM arcs. As there are multiple paths, the path should be selected based +on the following rules: + +* Select the shortest path. + +* The path cannot have saving group states as interior arcs, only start/end + states. + +``VFIO_DEVICE_FEATURE_DMA_LOGGING_START`` / ``VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP`` +"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" + +Upon ``VFIO_DEVICE_FEATURE_SET``, start/stop DMA logging. These features can +also be probed to determine whether the device supports DMA logging. + +When DMA logging is started, a range of IOVAs to monitor is provided and the +device can optimize its logging to cover only the IOVA range given. Only DMA +writes are logged. + +The data field of the ``SET`` request is structured as follows: + ++------------+--------+----------+ +| Name | Offset | Size | ++============+========+==========+ +| page_size | 0 | 8 | ++------------+--------+----------+ +| num_ranges | 8 | 4 | ++------------+--------+----------+ +| reserved | 12 | 4 | ++------------+--------+----------+ +| ranges | 16 | variable | ++------------+--------+----------+ + +* *page_size* hints what tracking granularity the device should try to achieve. + If the device cannot do the hinted page size then it's the driver's choice + which page size to pick based on its support. On output the device will return + the page size it selected. + +* *num_ranges* is the number of IOVA ranges to monitor. A value of zero + indicates that all writes should be logged. + +* *ranges* is an array of ``vfio_user_device_feature_dma_logging_range`` + entries: + +--------+--------+------+ | Name | Offset | Size | +========+========+======+ | iova | 0 | 8 | +--------+--------+------+ -| size | 8 | 8 | -+--------+--------+------+ -| bitmap | 16 | 24 | +| length | 8 | 8 | +--------+--------+------+ -* *iova* is the IOVA offset + * *iova* is the base IO virtual address + * *length* is the length of the range to log + +Upon success, the response data field will be the same as the request, unless +the page size was changed, in which case this will be reflected in the response. + +``VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT`` +"""""""""""""""""""""""""""""""""""""""""" + +Upon ``VFIO_DEVICE_FEATURE_GET``, returns the dirty bitmap for a specific IOVA +range. This operation is only valid if logging of dirty pages has been +previously started by setting ``VFIO_DEVICE_FEATURE_DMA_LOGGING_START``. 
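+
+For sizing the reply buffer, note that the bitmap described below needs one
+bit per *page_size* unit of the queried range, rounded up to whole 64-bit
+words. A non-normative sketch (the helper name is illustrative, not part of
+the protocol)::
+
+    /* Bytes of bitmap needed to report `length` bytes of IOVA space at
+       `page_size` granularity, rounded up to a multiple of u64. */
+    static size_t
+    report_bitmap_bytes(uint64_t length, uint64_t page_size)
+    {
+        uint64_t nr_pages = (length + page_size - 1) / page_size;
+        return ((nr_pages + 63) / 64) * sizeof(uint64_t);
+    }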
+
+The data field of the request is structured as follows:
+
++-----------+--------+------+
+| Name      | Offset | Size |
++===========+========+======+
+| iova      | 0      | 8    |
++-----------+--------+------+
+| length    | 8      | 8    |
++-----------+--------+------+
+| page_size | 16     | 8    |
++-----------+--------+------+
+
+* *iova* is the base IO virtual address.
+
+* *length* is the length of the range.
+
+* *page_size* is the unit of granularity of the bitmap, and must be a power of
+  two. It doesn't have to match the value given to
+  ``VFIO_DEVICE_FEATURE_DMA_LOGGING_START`` because the driver will format its
+  internal logging to match the reporting page size, possibly by replicating
+  bits if the internal page size is lower than requested.
+
+The data field of the response is identical, except with the bitmap added on
+the end at offset 24.
+
+The bitmap is an array of u64s that holds the output bitmap, with 1 bit
+reporting a *page_size* unit of IOVA. The bits outside of the requested range
+must be zero.
+
+The mapping of IOVA to bits is given by:
+
+``bitmap[(addr - iova)/page_size] & (1ULL << (addr % 64))``
+
+``VFIO_USER_MIG_DATA_READ``
+---------------------------
+
+This command is used to read data from the source migration server while it is
+in a saving group state (``PRE_COPY`` or ``STOP_COPY``).
+
+This command, and ``VFIO_USER_MIG_DATA_WRITE``, are used in place of the
+``data_fd`` file descriptor in ``<linux/vfio.h>``
+(``struct vfio_device_feature_mig_state``) to enable all data transport to use
+the single already-established UNIX socket. Hence, the migration data is
+treated like a stream, so the client must continue reading until no more
+migration data remains.
+
+Request
+^^^^^^^
+
+The request payload for this message is a structure of the following format.
-
-* *size* is the size of the IOVA region
++-------+--------+------+
+| Name  | Offset | Size |
++=======+========+======+
+| argsz | 0      | 4    |
++-------+--------+------+
+| size  | 4      | 4    |
++-------+--------+------+
-
-* *bitmap* is the VFIO Bitmap explained in `VFIO Bitmap`_.
+* *argsz* is the maximum size of the reply payload.
-This part of the request is analogous to VFIO's ``struct
-vfio_iommu_type1_dirty_bitmap_get``.
+* *size* is the size of the migration data to read.

Reply
^^^^^

-For ``VFIO_IOMMU_DIRTY_PAGES_FLAG_START`` or
-``VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP``, there is no reply payload.
- -For ``VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP``, the reply payload is as follows: - -+--------------+--------+-----------------------------------------+ -| Name | Offset | Size | -+==============+========+=========================================+ -| argsz | 0 | 4 | -+--------------+--------+-----------------------------------------+ -| flags | 4 | 4 | -+--------------+--------+-----------------------------------------+ -| | +-----+----------------------------------------+ | -| | | Bit | Definition | | -| | +=====+========================================+ | -| | | 2 | VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP | | -| | +-----+----------------------------------------+ | -+--------------+--------+-----------------------------------------+ -| bitmap range | 8 | 40 | -+--------------+--------+-----------------------------------------+ -| bitmap | 48 | variable | -+--------------+--------+-----------------------------------------+ - -* *argsz* is the size required for the full reply payload (dirty pages structure - + bitmap range structure + actual bitmap) -* *flags* is ``VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP`` -* *bitmap range* is the same bitmap range struct provided in the request, as - defined in `VFIO Bitmap Range Format`_. -* *bitmap* is the actual dirty pages bitmap corresponding to the range request - -VFIO Device Migration Info --------------------------- +The reply payload for this message is a structure of the following format. + ++-------+--------+----------+ +| Name | Offset | Size | ++=======+========+==========+ +| argsz | 0 | 4 | ++-------+--------+----------+ +| size | 4 | 4 | ++-------+--------+----------+ +| data | 8 | variable | ++-------+--------+----------+ -A device may contain a migration region (of type -``VFIO_REGION_TYPE_MIGRATION``). The beginning of the region must contain -``struct vfio_device_migration_info``, defined in ````. This -subregion is accessed like any other part of a standard vfio-user region -using ``VFIO_USER_REGION_READ``/``VFIO_USER_REGION_WRITE``. - -+---------------+--------+--------------------------------+ -| Name | Offset | Size | -+===============+========+================================+ -| device_state | 0 | 4 | -+---------------+--------+--------------------------------+ -| | +-----+-------------------------------+ | -| | | Bit | Definition | | -| | +=====+===============================+ | -| | | 0 | VFIO_DEVICE_STATE_V1_RUNNING | | -| | +-----+-------------------------------+ | -| | | 1 | VFIO_DEVICE_STATE_V1_SAVING | | -| | +-----+-------------------------------+ | -| | | 2 | VFIO_DEVICE_STATE_V1_RESUMING | | -| | +-----+-------------------------------+ | -+---------------+--------+--------------------------------+ -| reserved | 4 | 4 | -+---------------+--------+--------------------------------+ -| pending_bytes | 8 | 8 | -+---------------+--------+--------------------------------+ -| data_offset | 16 | 8 | -+---------------+--------+--------------------------------+ -| data_size | 24 | 8 | -+---------------+--------+--------------------------------+ - -* *device_state* defines the state of the device: - - The client initiates device state transition by writing the intended state. - The server must respond only after it has successfully transitioned to the new - state. 
If an error occurs then the server must respond to the - ``VFIO_USER_REGION_WRITE`` operation with the Error field set accordingly and - must remain at the previous state, or in case of internal error it must - transition to the error state, defined as - ``VFIO_DEVICE_STATE_V1_RESUMING | VFIO_DEVICE_STATE_V1_SAVING``. The client - must re-read the device state in order to determine it afresh. - - The following device states are defined: - - +-----------+---------+----------+-----------------------------------+ - | _RESUMING | _SAVING | _RUNNING | Description | - +===========+=========+==========+===================================+ - | 0 | 0 | 0 | Device is stopped. | - +-----------+---------+----------+-----------------------------------+ - | 0 | 0 | 1 | Device is running, default state. | - +-----------+---------+----------+-----------------------------------+ - | 0 | 1 | 0 | Stop-and-copy state | - +-----------+---------+----------+-----------------------------------+ - | 0 | 1 | 1 | Pre-copy state | - +-----------+---------+----------+-----------------------------------+ - | 1 | 0 | 0 | Resuming | - +-----------+---------+----------+-----------------------------------+ - | 1 | 0 | 1 | Invalid state | - +-----------+---------+----------+-----------------------------------+ - | 1 | 1 | 0 | Error state | - +-----------+---------+----------+-----------------------------------+ - | 1 | 1 | 1 | Invalid state | - +-----------+---------+----------+-----------------------------------+ - - Valid state transitions are shown in the following table: - - +-------------------------+---------+---------+---------------+----------+----------+ - | |darr| From / To |rarr| | Stopped | Running | Stop-and-copy | Pre-copy | Resuming | - +=========================+=========+=========+===============+==========+==========+ - | Stopped | \- | 1 | 0 | 0 | 0 | - +-------------------------+---------+---------+---------------+----------+----------+ - | Running | 1 | \- | 1 | 1 | 1 | - +-------------------------+---------+---------+---------------+----------+----------+ - | Stop-and-copy | 1 | 1 | \- | 0 | 0 | - +-------------------------+---------+---------+---------------+----------+----------+ - | Pre-copy | 0 | 0 | 1 | \- | 0 | - +-------------------------+---------+---------+---------------+----------+----------+ - | Resuming | 0 | 1 | 0 | 0 | \- | - +-------------------------+---------+---------+---------------+----------+----------+ - - A device is migrated to the destination as follows: - - * The source client transitions the device state from the running state to - the pre-copy state. This transition is optional for the client but must be - supported by the server. The source server starts sending device state data - to the source client through the migration region while the device is - running. - - * The source client transitions the device state from the running state or the - pre-copy state to the stop-and-copy state. The source server stops the - device, saves device state and sends it to the source client through the - migration region. - - The source client is responsible for sending the migration data to the - destination client. - - A device is resumed on the destination as follows: - - * The destination client transitions the device state from the running state - to the resuming state. The destination server uses the device state data - received through the migration region to resume the device. 
- - * The destination client provides saved device state to the destination - server and then transitions the device to back to the running state. - -* *reserved* This field is reserved and any access to it must be ignored by the - server. - -* *pending_bytes* Remaining bytes to be migrated by the server. This field is - read only. - -* *data_offset* Offset in the migration region where the client must: - - * read from, during the pre-copy or stop-and-copy state, or - - * write to, during the resuming state. - - This field is read only. - -* *data_size* Contains the size, in bytes, of the amount of data copied to: - - * the source migration region by the source server during the pre-copy or - stop-and copy state, or - - * the destination migration region by the destination client during the - resuming state. - -Device-specific data must be stored at any position after -``struct vfio_device_migration_info``. Note that the migration region can be -memory mappable, even partially. In practise, only the migration data portion -can be memory mapped. - -The client processes device state data during the pre-copy and the -stop-and-copy state in the following iterative manner: - - 1. The client reads ``pending_bytes`` to mark a new iteration. Repeated reads - of this field is an idempotent operation. If there are no migration data - to be consumed then the next step depends on the current device state: - - * pre-copy: the client must try again. +* *argsz* is the size of the above structure, including the size of the data. - * stop-and-copy: this procedure can end and the device can now start - resuming on the destination. +* *size* indicates the size of returned migration data. If this is less than the + requested size, there is no more migration data to read. - 2. The client reads ``data_offset``; at this point the server must make - available a portion of migration data at this offset to be read by the - client, which must happen *before* completing the read operation. The - amount of data to be read must be stored in the ``data_size`` field, which - the client reads next. +* *data* contains the migration data. - 3. The client reads ``data_size`` to determine the amount of migration data - available. +``VFIO_USER_MIG_DATA_WRITE`` +---------------------------- - 4. The client reads and processes the migration data. +This command is used to write data to the destination migration server while it +is in the ``RESUMING`` state. - 5. Go to step 1. +As above, this replaces the ``data_fd`` file descriptor for transport of +migration data, and as such, the migration data is treated like a stream. -Note that the client can transition the device from the pre-copy state to the -stop-and-copy state at any time; ``pending_bytes`` does not need to become zero. +Request +^^^^^^^ + +The request payload for this message is a structure of the following format. + ++-------+--------+----------+ +| Name | Offset | Size | ++=======+========+==========+ +| argsz | 0 | 4 | ++-------+--------+----------+ +| size | 4 | 4 | ++-------+--------+----------+ +| data | 8 | variable | ++-------+--------+----------+ + +* *argsz* is the maximum size of the reply payload. + +* *size* is the size of the migration data to be written. + +* *data* contains the migration data. -The client initializes the device state on the destination by setting the -device state in the resuming state and writing the migration data to the -destination migration region at ``data_offset`` offset. 
The client can write the -source migration data in an iterative manner and the server must consume this -data before completing each write operation, updating the ``data_offset`` field. -The server must apply the source migration data on the device resume state. The -client must write data on the same order and transaction size as read. +Reply +^^^^^ -If an error occurs then the server must fail the read or write operation. It is -an implementation detail of the client how to handle errors. +There is no reply payload for this message. Appendices ========== diff --git a/include/libvfio-user.h b/include/libvfio-user.h index 21cb99a5..e4cfa600 100644 --- a/include/libvfio-user.h +++ b/include/libvfio-user.h @@ -583,21 +583,8 @@ typedef enum { VFU_MIGR_STATE_RESUME } vfu_migr_state_t; -#define VFU_MIGR_CALLBACKS_VERS 1 +#define VFU_MIGR_CALLBACKS_VERS 2 -/* - * Callbacks during the pre-copy and stop-and-copy phases. - * - * The client executes the following steps to copy migration data: - * - * 1. get_pending_bytes: device must return amount of migration data - * 2. prepare_data: device must prepare migration data - * 3. read_data: device must provide migration data - * - * The client repeats the above steps until there is no more migration data to - * return (the device must return 0 from get_pending_bytes to indicate that - * there are no more migration data to be consumed in this iteration). - */ typedef struct { /* @@ -615,152 +602,30 @@ typedef struct { * FIXME maybe we should create a single callback and pass the state? */ int (*transition)(vfu_ctx_t *vfu_ctx, vfu_migr_state_t state); - - /* Callbacks for saving device state */ - - /* - * Function that is called to retrieve the amount of pending migration - * data. If migration data were previously made available (function - * prepare_data has been called) then calling this function signifies that - * they have been read (e.g. migration data can be discarded). If the - * function returns 0 then migration has finished and this function won't - * be called again. - * - * The amount of pending migration data returned by the device does not - * necessarily have to monotonically decrease over time and does not need - * to match the amount of migration data returned via the @size argument in - * prepare_data. It can completely fluctuate according to the needs of the - * device. These semantics are derived from the pending_bytes register in - * VFIO. Therefore the value returned by get_pending_bytes must be - * primarily regarded as boolean, either 0 or non-zero, as far as migration - * completion is concerned. More advanced vfio-user clients can make - * assumptions on how migration is progressing on devices that guarantee - * that the amount of pending migration data decreases over time. - */ - uint64_t (*get_pending_bytes)(vfu_ctx_t *vfu_ctx); - - /* - * Function that is called to instruct the device to prepare migration data - * to be read when in pre-copy or stop-and-copy state, and to prepare for - * receiving migration data when in resuming state. - * - * When in pre-copy and stop-and-copy state, the function must return only - * after migration data are available at the specified offset. This - * callback is called once per iteration. The amount of data available - * pointed to by @size can be different that the amount of data returned by - * get_pending_bytes in the beginning of the iteration. 
- * - * In VFIO, the data_offset and data_size registers can be read multiple - * times during an iteration and are invariant, libvfio-user simplifies - * this by caching the values and returning them when read, guaranteeing - * that prepare_data() is called only once per migration iteration. - * - * When in resuming state, @offset must be set to where migration data must - * written. @size points to NULL. - * - * The callback should return -1 on error, setting errno. - */ - int (*prepare_data)(vfu_ctx_t *vfu_ctx, uint64_t *offset, uint64_t *size); - + /* - * Function that is called to read migration data. offset and size can be - * any subrange on the offset and size previously returned by prepare_data. - * The function must return the amount of data read or -1 on error, setting - * errno. + * Function that is called to read `count` bytes of migration data into + * `buf`. The function must return the amount of data read or -1 on error, + * setting errno. The function may return less data than requested. * - * This function can be called even if the migration data can be memory - * mapped. + * If the function returns zero, this is interpreted to mean that there is + * no more migration data to read. */ - ssize_t (*read_data)(vfu_ctx_t *vfu_ctx, void *buf, - uint64_t count, uint64_t offset); - - /* Callbacks for restoring device state */ + ssize_t (*read_data)(vfu_ctx_t *vfu_ctx, void *buf, uint64_t count); /* - * Fuction that is called for writing previously stored device state. The + * Function that is called for writing previously stored device state. The * function must return the amount of data written or -1 on error, setting - * errno. - */ - ssize_t (*write_data)(vfu_ctx_t *vfu_ctx, void *buf, uint64_t count, - uint64_t offset); - - /* - * Function that is called when client has written some previously stored - * device state. - * - * The callback should return -1 on error, setting errno. + * errno. Partial writes are not supported, so any return value other than + * `count` is invalid. */ - int (*data_written)(vfu_ctx_t *vfu_ctx, uint64_t count); + ssize_t (*write_data)(vfu_ctx_t *vfu_ctx, void *buf, uint64_t count); } vfu_migration_callbacks_t; -/** - * The definition for VFIO_DEVICE_STATE_XXX differs with the version of vfio - * header file used. Some old systems wouldn't have these definitions. Some - * other newer systems would be using region based migration, and not - * have VFIO_DEVICE_STATE_V1_XXXX defined. The latest ones have - * VFIO_DEVICE_STATE_V1_XXXX defined. The following addresses all - * these scenarios. - */ -#if defined(VFIO_DEVICE_STATE_STOP) - -_Static_assert(VFIO_DEVICE_STATE_STOP == 0, - "incompatible VFIO_DEVICE_STATE_STOP definition"); - -#define VFIO_DEVICE_STATE_V1_STOP VFIO_DEVICE_STATE_STOP -#define VFIO_DEVICE_STATE_V1_RUNNING VFIO_DEVICE_STATE_RUNNING -#define VFIO_DEVICE_STATE_V1_SAVING VFIO_DEVICE_STATE_SAVING -#define VFIO_DEVICE_STATE_V1_RESUMING VFIO_DEVICE_STATE_RESUMING - -#elif !defined(VFIO_REGION_TYPE_MIGRATION_DEPRECATED) /* VFIO_DEVICE_STATE_STOP */ - -#define VFIO_DEVICE_STATE_V1_STOP (0) -#define VFIO_DEVICE_STATE_V1_RUNNING (1 << 0) -#define VFIO_DEVICE_STATE_V1_SAVING (1 << 1) -#define VFIO_DEVICE_STATE_V1_RESUMING (1 << 2) -#define VFIO_DEVICE_STATE_MASK ((1 << 3) - 1) - -#endif /* VFIO_REGION_TYPE_MIGRATION_DEPRECATED */ - -/* - * The currently defined migration registers; if using migration callbacks, - * these are handled internally by the library. - * - * This is analogous to struct vfio_device_migration_info. 
- */ -struct vfio_user_migration_info { - /* VFIO_DEVICE_STATE_* */ - uint32_t device_state; - uint32_t reserved; - uint64_t pending_bytes; - uint64_t data_offset; - uint64_t data_size; -}; - -/* - * Returns the size of the area needed to hold the migration registers at the - * beginning of the migration region; guaranteed to be page aligned. - */ -size_t -vfu_get_migr_register_area_size(void); - -/** - * vfu_setup_device_migration provides an abstraction over the migration - * protocol: the user specifies a set of callbacks which are called in response - * to client accesses of the migration region; the migration region read/write - * callbacks are not called after this function call. Offsets in callbacks are - * relative to @data_offset. - * - * @vfu_ctx: the libvfio-user context - * @callbacks: migration callbacks - * @data_offset: offset in the migration region where data begins. - * - * @returns 0 on success, -1 on error, sets errno. - */ int vfu_setup_device_migration_callbacks(vfu_ctx_t *vfu_ctx, - const vfu_migration_callbacks_t *callbacks, - uint64_t data_offset); + const vfu_migration_callbacks_t *callbacks); /** * Triggers an interrupt. @@ -906,7 +771,6 @@ enum { VFU_PCI_DEV_ROM_REGION_IDX, VFU_PCI_DEV_CFG_REGION_IDX, VFU_PCI_DEV_VGA_REGION_IDX, - VFU_PCI_DEV_MIGR_REGION_IDX, VFU_PCI_DEV_NUM_REGIONS, }; diff --git a/include/vfio-user.h b/include/vfio-user.h index a749938c..0b115d32 100644 --- a/include/vfio-user.h +++ b/include/vfio-user.h @@ -66,7 +66,10 @@ enum vfio_user_command { VFIO_USER_DMA_READ = 11, VFIO_USER_DMA_WRITE = 12, VFIO_USER_DEVICE_RESET = 13, - VFIO_USER_DIRTY_PAGES = 14, + VFIO_USER_REGION_WRITE_MULTI = 15, + VFIO_USER_DEVICE_FEATURE = 16, + VFIO_USER_MIG_DATA_READ = 17, + VFIO_USER_MIG_DATA_WRITE = 18, VFIO_USER_MAX, }; @@ -200,31 +203,97 @@ typedef struct vfio_user_region_io_fds_reply { } sub_regions[]; } __attribute__((packed)) vfio_user_region_io_fds_reply_t; +/* Analogous to struct vfio_device_feature_dma_logging_range */ +struct vfio_user_device_feature_dma_logging_range { + uint64_t iova; + uint64_t length; +} __attribute__((packed)); -/* Analogous to vfio_iommu_type1_dirty_bitmap. */ -struct vfio_user_dirty_pages { - uint32_t argsz; -#ifndef VFIO_IOMMU_DIRTY_PAGES_FLAG_START -#define VFIO_IOMMU_DIRTY_PAGES_FLAG_START (1 << 0) -#define VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP (1 << 1) -#define VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP (1 << 2) -#endif - uint32_t flags; +/* Analogous to struct vfio_device_feature_dma_logging_control */ +struct vfio_user_device_feature_dma_logging_control { + uint64_t page_size; + uint32_t num_ranges; + uint32_t reserved; + struct vfio_user_device_feature_dma_logging_range ranges[]; } __attribute__((packed)); -/* Analogous to struct vfio_iommu_type1_dirty_bitmap_get. 
*/
-struct vfio_user_bitmap_range {
+/* Analogous to struct vfio_device_feature_dma_logging_report */
+struct vfio_user_device_feature_dma_logging_report {
     uint64_t iova;
-    uint64_t size;
-    struct vfio_user_bitmap bitmap;
+    uint64_t length;
+    uint64_t page_size;
+    uint8_t bitmap[];
+} __attribute__((packed));
+
+#ifndef VFIO_DEVICE_FEATURE_DMA_LOGGING_START
+#define VFIO_DEVICE_FEATURE_DMA_LOGGING_START 6
+#define VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP 7
+#define VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT 8
+#endif
+
+/* Analogous to struct vfio_device_feature */
+struct vfio_user_device_feature {
+    uint32_t argsz;
+    uint32_t flags;
+#ifndef VFIO_DEVICE_FEATURE_MASK
+#define VFIO_DEVICE_FEATURE_MASK (0xffff) /* 16-bit feature index */
+#define VFIO_DEVICE_FEATURE_GET (1 << 16) /* Get feature into data[] */
+#define VFIO_DEVICE_FEATURE_SET (1 << 17) /* Set feature from data[] */
+#define VFIO_DEVICE_FEATURE_PROBE (1 << 18) /* Probe feature support */
+#endif
+    uint8_t data[];
+} __attribute__((packed));
+
+/* Analogous to struct vfio_device_feature_migration */
+struct vfio_user_device_feature_migration {
+    uint64_t flags;
+#ifndef VFIO_REGION_TYPE_MIGRATION_DEPRECATED
+#define VFIO_MIGRATION_STOP_COPY (1 << 0)
+#define VFIO_MIGRATION_P2P (1 << 1)
+#endif
+/*
+ * PRE_COPY was added in a later kernel version, after
+ * VFIO_REGION_TYPE_MIGRATION_DEPRECATED had been introduced.
+ */
+#ifndef VFIO_MIGRATION_PRE_COPY
+#define VFIO_MIGRATION_PRE_COPY (1 << 2)
+#endif
 } __attribute__((packed));
+#ifndef VFIO_REGION_TYPE_MIGRATION_DEPRECATED
+#define VFIO_DEVICE_FEATURE_MIGRATION 1
+#endif
+_Static_assert(sizeof(struct vfio_user_device_feature_migration) == 8,
+               "bad vfio_user_device_feature_migration size");

-#ifndef VFIO_REGION_TYPE_MIGRATION
+/* Analogous to struct vfio_device_feature_mig_state */
+struct vfio_user_device_feature_mig_state {
+    uint32_t device_state;
+    uint32_t data_fd;
+} __attribute__((packed));
+#ifndef VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE
+#define VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE 2
+#endif
+_Static_assert(sizeof(struct vfio_user_device_feature_mig_state) == 8,
+               "bad vfio_user_device_feature_mig_state size");

-#define VFIO_REGION_TYPE_MIGRATION (3)
-#define VFIO_REGION_SUBTYPE_MIGRATION (1)
+/* Analogous to enum vfio_device_mig_state */
+enum vfio_user_device_mig_state {
+    VFIO_USER_DEVICE_STATE_ERROR = 0,
+    VFIO_USER_DEVICE_STATE_STOP = 1,
+    VFIO_USER_DEVICE_STATE_RUNNING = 2,
+    VFIO_USER_DEVICE_STATE_STOP_COPY = 3,
+    VFIO_USER_DEVICE_STATE_RESUMING = 4,
+    VFIO_USER_DEVICE_STATE_RUNNING_P2P = 5,
+    VFIO_USER_DEVICE_STATE_PRE_COPY = 6,
+    VFIO_USER_DEVICE_STATE_PRE_COPY_P2P = 7,
+    VFIO_USER_DEVICE_NUM_STATES = 8,
+};

-#endif /* VFIO_REGION_TYPE_MIGRATION */
+struct vfio_user_mig_data {
+    uint32_t argsz;
+    uint32_t size;
+    uint8_t data[];
+} __attribute__((packed));

#ifdef __cplusplus
}
diff --git a/lib/common.h b/lib/common.h
index 07a74a5c..40b9b27c 100644
--- a/lib/common.h
+++ b/lib/common.h
@@ -41,6 +41,7 @@
#include
#include
#include
+#include
#define UNUSED __attribute__((unused))
#define EXPORT __attribute__((visibility("default")))
@@ -62,6 +63,20 @@ typedef unsigned long long ull_t;
+static inline int
+ERROR_INT(int err)
+{
+    errno = err;
+    return -1;
+}
+
+static inline void *
+ERROR_PTR(int err)
+{
+    errno = err;
+    return NULL;
+}
+
/* Saturating uint64_t addition.
*/ static inline uint64_t satadd_u64(uint64_t a, uint64_t b) @@ -73,11 +88,21 @@ satadd_u64(uint64_t a, uint64_t b) /* * The size, in bytes, of the bitmap that represents the given range with the * given page size. + * + * Returns -1 and sets errno if the given page size is invalid for the given + * range. */ -static inline size_t -_get_bitmap_size(size_t size, size_t pgsize) +static inline ssize_t +get_bitmap_size(size_t region_size, size_t pgsize) { - size_t nr_pages = (size / pgsize) + (size % pgsize != 0); + if (pgsize == 0) { + return ERROR_INT(EINVAL); + } + if (region_size < pgsize) { + return ERROR_INT(EINVAL); + } + + size_t nr_pages = (region_size / pgsize) + (region_size % pgsize != 0); return ROUND_UP(nr_pages, sizeof(uint64_t) * CHAR_BIT) / CHAR_BIT; } @@ -107,6 +132,16 @@ close_safely(int *fd) errno = saved_errno; } +static inline void +iov_free(struct iovec *iov) +{ + if (iov->iov_base != NULL) { + free(iov->iov_base); + iov->iov_base = NULL; + } + iov->iov_len = 0; +} + #ifdef UNIT_TEST #define MOCK_DEFINE(f) \ diff --git a/lib/dma.c b/lib/dma.c index 9ca34d00..10e38ff3 100644 --- a/lib/dma.c +++ b/lib/dma.c @@ -255,19 +255,6 @@ dma_map_region(dma_controller_t *dma, dma_memory_region_t *region) return 0; } -static ssize_t -get_bitmap_size(size_t region_size, size_t pgsize) -{ - if (pgsize == 0) { - return ERROR_INT(EINVAL); - } - if (region_size < pgsize) { - return ERROR_INT(EINVAL); - } - - return _get_bitmap_size(region_size, pgsize); -} - static int dirty_page_logging_start_on_region(dma_memory_region_t *region, size_t pgsize) { @@ -530,28 +517,173 @@ dma_controller_dirty_page_logging_stop(dma_controller_t *dma) #ifdef DEBUG static void log_dirty_bitmap(vfu_ctx_t *vfu_ctx, dma_memory_region_t *region, - char *bitmap, size_t size) + char *bitmap, size_t size, size_t pgsize) { size_t i; size_t count; for (i = 0, count = 0; i < size; i++) { count += __builtin_popcount((uint8_t)bitmap[i]); } - vfu_log(vfu_ctx, LOG_DEBUG, "dirty pages: get [%p, %p), %zu dirty pages", + vfu_log(vfu_ctx, LOG_DEBUG, + "dirty pages: get [%p, %p), %zu dirty pages of size %zu", region->info.iova.iov_base, iov_end(®ion->info.iova), - count); + count, pgsize); } #endif +static void +dirty_page_exchange(uint8_t *outp, uint8_t *bitmap) +{ + /* + * If no bits are dirty, avoid the atomic exchange. This is obviously + * racy, but it's OK: if we miss a dirty bit being set, we'll catch it + * the next time around. + * + * Otherwise, atomically exchange the dirty bits with zero: as we use + * atomic or in _dma_mark_dirty(), this cannot lose set bits - we might + * miss a bit being set after, but again, we'll catch that next time + * around. + */ + if (*bitmap == 0) { + *outp = 0; + } else { + uint8_t zero = 0; + __atomic_exchange(bitmap, &zero, outp, __ATOMIC_SEQ_CST); + } +} + +static void +dirty_page_get_same_pgsize(dma_memory_region_t *region, char *bitmap, + size_t bitmap_size) +{ + for (size_t i = 0; i < bitmap_size; i++) { + dirty_page_exchange((uint8_t *)&bitmap[i], ®ion->dirty_bitmap[i]); + } +} + +static void +dirty_page_get_extend(dma_memory_region_t *region, char *bitmap, + size_t server_bitmap_size, size_t server_pgsize, + size_t client_bitmap_size, size_t client_pgsize) +{ + /* + * The index of the bit in the client bitmap that we are currently + * considering. By keeping track of this separately to the for loop, we + * allow for one server bit to be repeated for multiple client bytes. 
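+     *
+     * For example, with server_pgsize == 8192 and client_pgsize == 4096,
+     * factor == 2 below, and each server bit is replicated into two
+     * consecutive client bits.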
+     */
+    size_t client_bit_idx = 0;
+    size_t server_byte_idx;
+    int server_bit_idx;
+    size_t factor = server_pgsize / client_pgsize;
+
+    /*
+     * Iterate through the bytes of the server bitmap.
+     */
+    for (server_byte_idx = 0; server_byte_idx < server_bitmap_size;
+         server_byte_idx++) {
+
+        if (client_bit_idx / CHAR_BIT >= client_bitmap_size) {
+            break;
+        }
+
+        uint8_t out = 0;
+
+        dirty_page_exchange(&out, &region->dirty_bitmap[server_byte_idx]);
+
+        /*
+         * Iterate through the bits of the server byte, repeating bits to reach
+         * the desired page size.
+         */
+        for (server_bit_idx = 0; server_bit_idx < CHAR_BIT; server_bit_idx++) {
+            uint8_t server_bit = (out >> server_bit_idx) & 1;
+
+            /*
+             * Repeat `factor` times the bit at index `server_bit_idx` of
+             * `out`.
+             *
+             * OR the same bit from the server bitmap (`server_bit`) with
+             * `factor` bits in the client bitmap, from `client_bit_idx` to
+             * `end_client_bit_idx`.
+             */
+            for (size_t end_client_bit_idx = client_bit_idx + factor;
+                 client_bit_idx < end_client_bit_idx;
+                 client_bit_idx++) {
+
+                bitmap[client_bit_idx / CHAR_BIT] |=
+                    server_bit << (client_bit_idx % CHAR_BIT);
+            }
+        }
+    }
+}
+
+static void
+dirty_page_get_combine(dma_memory_region_t *region, char *bitmap,
+                       size_t server_bitmap_size, size_t server_pgsize,
+                       size_t client_bitmap_size, size_t client_pgsize)
+{
+    /*
+     * The index of the bit in the client bitmap that we are currently
+     * considering. By keeping track of this separately to the for loop, we
+     * allow multiple bytes' worth of server bits to be OR'd together to
+     * calculate one client bit.
+     */
+    size_t client_bit_idx = 0;
+    size_t server_byte_idx;
+    int server_bit_idx;
+    size_t factor = client_pgsize / server_pgsize;
+
+    /*
+     * Iterate through the bytes of the server bitmap.
+     */
+    for (server_byte_idx = 0; server_byte_idx < server_bitmap_size;
+         server_byte_idx++) {
+
+        if (client_bit_idx / CHAR_BIT >= client_bitmap_size) {
+            break;
+        }
+
+        uint8_t out = 0;
+
+        dirty_page_exchange(&out, &region->dirty_bitmap[server_byte_idx]);
+
+        /*
+         * Iterate through the bits of the server byte, combining bits to reach
+         * the desired page size.
+         */
+        for (server_bit_idx = 0; server_bit_idx < CHAR_BIT; server_bit_idx++) {
+            uint8_t server_bit = (out >> server_bit_idx) & 1;
+
+            /*
+             * OR `factor` bits of the server bitmap with the same bit at
+             * index `client_bit_idx` in the client bitmap.
+             */
+            bitmap[client_bit_idx / CHAR_BIT] |=
+                server_bit << (client_bit_idx % CHAR_BIT);
+
+            /*
+             * Only move onto the next bit in the client bitmap once we've
+             * OR'd `factor` bits.
+             */
+            if (((server_byte_idx * CHAR_BIT) + server_bit_idx) % factor
+                == factor - 1) {
+                client_bit_idx++;
+
+                if (client_bit_idx / CHAR_BIT >= client_bitmap_size) {
+                    return;
+                }
+            }
+        }
+    }
+}
+
 int
 dma_controller_dirty_page_get(dma_controller_t *dma, vfu_dma_addr_t addr,
-                              uint64_t len, size_t pgsize, size_t size,
+                              uint64_t len, size_t client_pgsize, size_t size,
                               char *bitmap)
 {
     dma_memory_region_t *region;
-    ssize_t bitmap_size;
+    ssize_t server_bitmap_size;
+    ssize_t client_bitmap_size;
     dma_sg_t sg;
-    size_t i;
     int ret;

     assert(dma != NULL);
@@ -574,24 +706,40 @@ dma_controller_dirty_page_get(dma_controller_t *dma, vfu_dma_addr_t addr,
         return ERROR_INT(ENOTSUP);
     }

-    if (pgsize != dma->dirty_pgsize) {
-        vfu_log(dma->vfu_ctx, LOG_ERR, "bad page size %zu", pgsize);
+    /*
+     * If dirty page logging is not enabled, the requested page size is zero,
+     * or the requested page size is not a power of two, return an error.
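+     * (The power-of-two test below uses the standard bit trick: for nonzero
+     * client_pgsize, (client_pgsize & (client_pgsize - 1)) == 0 exactly when
+     * a single bit is set, e.g. 0x1000 passes while 0x1800 fails.)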
+ */ + if (dma->dirty_pgsize == 0) { + vfu_log(dma->vfu_ctx, LOG_ERR, "dirty page logging not enabled"); + return ERROR_INT(EINVAL); + } + if (client_pgsize == 0 || (client_pgsize & (client_pgsize - 1)) != 0) { + vfu_log(dma->vfu_ctx, LOG_ERR, "bad client page size %zu", + client_pgsize); return ERROR_INT(EINVAL); } - bitmap_size = get_bitmap_size(len, pgsize); - if (bitmap_size < 0) { - vfu_log(dma->vfu_ctx, LOG_ERR, "failed to get bitmap size"); - return bitmap_size; + server_bitmap_size = get_bitmap_size(len, dma->dirty_pgsize); + if (server_bitmap_size < 0) { + vfu_log(dma->vfu_ctx, LOG_ERR, "failed to get server bitmap size"); + return server_bitmap_size; + } + + client_bitmap_size = get_bitmap_size(len, client_pgsize); + if (client_bitmap_size < 0) { + vfu_log(dma->vfu_ctx, LOG_ERR, "bad client page size %zu", + client_pgsize); + return client_bitmap_size; } /* * They must be equal because this is how much data the client expects to * receive. */ - if (size != (size_t)bitmap_size) { - vfu_log(dma->vfu_ctx, LOG_ERR, "bad bitmap size %zu != %zu", size, - bitmap_size); + if (size != (size_t)client_bitmap_size) { + vfu_log(dma->vfu_ctx, LOG_ERR, "bad client bitmap size %zu != %zu", + size, client_bitmap_size); return ERROR_INT(EINVAL); } @@ -602,31 +750,29 @@ dma_controller_dirty_page_get(dma_controller_t *dma, vfu_dma_addr_t addr, return ERROR_INT(EINVAL); } - for (i = 0; i < (size_t)bitmap_size; i++) { - uint8_t val = region->dirty_bitmap[i]; - uint8_t *outp = (uint8_t *)&bitmap[i]; - + if (client_pgsize == dma->dirty_pgsize) { + dirty_page_get_same_pgsize(region, bitmap, client_bitmap_size); + } else if (client_pgsize < dma->dirty_pgsize) { /* - * If no bits are dirty, avoid the atomic exchange. This is obviously - * racy, but it's OK: if we miss a dirty bit being set, we'll catch it - * the next time around. - * - * Otherwise, atomically exchange the dirty bits with zero: as we use - * atomic or in _dma_mark_dirty(), this cannot lose set bits - we might - * miss a bit being set after, but again, we'll catch that next time - * around. + * If the requested page size is less than that used for logging by + * the server, the bitmap will need to be extended, repeating bits. */ - if (val == 0) { - *outp = 0; - } else { - uint8_t zero = 0; - __atomic_exchange(®ion->dirty_bitmap[i], &zero, - outp, __ATOMIC_SEQ_CST); - } + dirty_page_get_extend(region, bitmap, server_bitmap_size, + dma->dirty_pgsize, client_bitmap_size, + client_pgsize); + } else { + /* + * If the requested page size is larger than that used for logging by + * the server, the bitmap will need to combine bits with OR, losing + * accuracy. + */ + dirty_page_get_combine(region, bitmap, server_bitmap_size, + dma->dirty_pgsize, client_bitmap_size, + client_pgsize); } #ifdef DEBUG - log_dirty_bitmap(dma->vfu_ctx, region, bitmap, size); + log_dirty_bitmap(dma->vfu_ctx, region, bitmap, size, client_pgsize); #endif return 0; diff --git a/lib/dma.h b/lib/dma.h index 9687f492..789904f7 100644 --- a/lib/dma.h +++ b/lib/dma.h @@ -386,6 +386,7 @@ int dma_controller_dirty_page_get(dma_controller_t *dma, vfu_dma_addr_t addr, uint64_t len, size_t pgsize, size_t size, char *bitmap); + bool dma_sg_is_mappable(const dma_controller_t *dma, const dma_sg_t *sg); diff --git a/lib/libvfio-user.c b/lib/libvfio-user.c index 271a269a..81b00108 100644 --- a/lib/libvfio-user.c +++ b/lib/libvfio-user.c @@ -83,21 +83,16 @@ vfu_log(vfu_ctx_t *vfu_ctx, int level, const char *fmt, ...) 
} static size_t -get_vfio_caps_size(bool is_migr_reg, vfu_reg_info_t *reg) +get_vfio_caps_size(vfu_reg_info_t *reg) { - size_t type_size = 0; size_t sparse_size = 0; - if (is_migr_reg) { - type_size = sizeof(struct vfio_region_info_cap_type); - } - if (reg->nr_mmap_areas != 0) { sparse_size = sizeof(struct vfio_region_info_cap_sparse_mmap) + (reg->nr_mmap_areas * sizeof(struct vfio_region_sparse_mmap_area)); } - return type_size + sparse_size; + return sparse_size; } /* @@ -106,7 +101,7 @@ get_vfio_caps_size(bool is_migr_reg, vfu_reg_info_t *reg) * points accordingly. */ static int -dev_get_caps(vfu_ctx_t *vfu_ctx, vfu_reg_info_t *vfu_reg, bool is_migr_reg, +dev_get_caps(vfu_ctx_t *vfu_ctx, vfu_reg_info_t *vfu_reg, struct vfio_region_info *vfio_reg, int **fds, size_t *nr_fds) { struct vfio_info_cap_header *header; @@ -120,16 +115,6 @@ dev_get_caps(vfu_ctx_t *vfu_ctx, vfu_reg_info_t *vfu_reg, bool is_migr_reg, header = (struct vfio_info_cap_header*)(vfio_reg + 1); - if (is_migr_reg) { - type = (struct vfio_region_info_cap_type *)header; - type->header.id = VFIO_REGION_INFO_CAP_TYPE; - type->header.version = 1; - type->header.next = 0; - type->type = VFIO_REGION_TYPE_MIGRATION; - type->subtype = VFIO_REGION_SUBTYPE_MIGRATION; - vfio_reg->cap_offset = sizeof(struct vfio_region_info); - } - if (vfu_reg->mmap_areas != NULL) { int i, nr_mmap_areas = vfu_reg->nr_mmap_areas; if (type != NULL) { @@ -218,14 +203,6 @@ region_access(vfu_ctx_t *vfu_ctx, size_t region, char *buf, if (ret == -1) { goto out; } - } else if (region == VFU_PCI_DEV_MIGR_REGION_IDX) { - if (vfu_ctx->migration == NULL) { - vfu_log(vfu_ctx, LOG_ERR, "migration not enabled"); - ret = ERROR_INT(EINVAL); - goto out; - } - - ret = migration_region_access(vfu_ctx, buf, count, offset, is_write); } else { vfu_region_access_cb_t *cb = vfu_ctx->reg_info[region].cb; @@ -293,8 +270,7 @@ is_valid_region_access(vfu_ctx_t *vfu_ctx, size_t size, uint16_t cmd, return false; } - if (unlikely(device_is_stopped_and_copying(vfu_ctx->migration) && - index != VFU_PCI_DEV_MIGR_REGION_IDX)) { + if (unlikely(device_is_stopped_and_copying(vfu_ctx->migration))) { vfu_log(vfu_ctx, LOG_ERR, "cannot access region %zu while device in stop-and-copy state", index); @@ -421,8 +397,7 @@ handle_device_get_region_info(vfu_ctx_t *vfu_ctx, vfu_msg_t *msg) vfu_reg = &vfu_ctx->reg_info[in_info->index]; if (vfu_reg->size > 0) { - caps_size = get_vfio_caps_size(in_info->index == VFU_PCI_DEV_MIGR_REGION_IDX, - vfu_reg); + caps_size = get_vfio_caps_size(vfu_reg); } msg->out.iov.iov_len = MIN(sizeof(*out_info) + caps_size, in_info->argsz); @@ -457,9 +432,8 @@ handle_device_get_region_info(vfu_ctx_t *vfu_ctx, vfu_msg_t *msg) /* Only actually provide the caps if they fit. 
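     * If they do not fit, the reply still carries the needed size in
     * out_info->argsz, so the client can retry with a larger buffer.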
*/ if (in_info->argsz >= out_info->argsz) { out_info->flags |= VFIO_REGION_INFO_FLAG_CAPS; - ret = dev_get_caps(vfu_ctx, vfu_reg, - in_info->index == VFU_PCI_DEV_MIGR_REGION_IDX, - out_info, &msg->out.fds, &msg->out.nr_fds); + ret = dev_get_caps(vfu_ctx, vfu_reg, out_info, &msg->out.fds, + &msg->out.nr_fds); if (ret < 0) { return ret; } @@ -917,133 +891,320 @@ static int device_reset(vfu_ctx_t *vfu_ctx, vfu_reset_type_t reason) { int ret; - + ret = call_reset_cb(vfu_ctx, reason); if (ret < 0) { return ret; } if (vfu_ctx->migration != NULL) { - return handle_device_state(vfu_ctx, vfu_ctx->migration, - VFIO_DEVICE_STATE_V1_RUNNING, false); + migr_state_transition(vfu_ctx->migration, + VFIO_USER_DEVICE_STATE_RUNNING); } return 0; } -static int -handle_dirty_pages_get(vfu_ctx_t *vfu_ctx, vfu_msg_t *msg) +static uint32_t +device_feature_flags_supported(vfu_ctx_t *vfu_ctx, uint32_t feature) { - struct vfio_user_dirty_pages *dirty_pages_in; - struct vfio_user_dirty_pages *dirty_pages_out; - struct vfio_user_bitmap_range *range_in; - struct vfio_user_bitmap_range *range_out; - size_t argsz; - int ret; + if (vfu_ctx->migration == NULL) { + /* + * All of the current features require migration. + */ + return 0; + } + switch (feature) { + case VFIO_DEVICE_FEATURE_MIGRATION: + case VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT: + return VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_PROBE; + case VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE: + return VFIO_DEVICE_FEATURE_GET + | VFIO_DEVICE_FEATURE_SET + | VFIO_DEVICE_FEATURE_PROBE; + case VFIO_DEVICE_FEATURE_DMA_LOGGING_START: + case VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP: + return VFIO_DEVICE_FEATURE_SET | VFIO_DEVICE_FEATURE_PROBE; + default: + return 0; + }; +} - dirty_pages_in = msg->in.iov.iov_base; +static bool +is_migration_feature(uint32_t feature) +{ + switch (feature) { + case VFIO_DEVICE_FEATURE_MIGRATION: + case VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE: + return true; + } - if (msg->in.iov.iov_len < sizeof(*dirty_pages_in) + sizeof(*range_in) || - dirty_pages_in->argsz > SERVER_MAX_DATA_XFER_SIZE || - dirty_pages_in->argsz < sizeof(*dirty_pages_out)) { - vfu_log(vfu_ctx, LOG_ERR, "invalid message size=%zu argsz=%u", - msg->in.iov.iov_len, dirty_pages_in->argsz); - return ERROR_INT(EINVAL); + return false; +} + +static bool +is_dma_feature(uint32_t feature) +{ + switch (feature) { + case VFIO_DEVICE_FEATURE_DMA_LOGGING_START: + case VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP: + case VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT: + return true; } - range_in = msg->in.iov.iov_base + sizeof(*dirty_pages_in); + return false; +} - /* - * range_in is client-controlled, but we only need to protect against - * overflow here: we'll take MIN() against a validated value next, and - * dma_controller_dirty_page_get() will validate the actual ->bitmap.size - * value later, anyway. +static int +handle_migration_device_feature_get(vfu_ctx_t *vfu_ctx, vfu_msg_t *msg, + struct vfio_user_device_feature *req) +{ + /* + * All supported outgoing data is currently the same size as + * struct vfio_user_device_feature_migration. 
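+     * Both the migration payload and the mig_state payload are currently
+     * 8 bytes, so a single allocation size suffices for GET.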
*/ - argsz = satadd_u64(sizeof(*dirty_pages_out) + sizeof(*range_out), - range_in->bitmap.size); + msg->out.iov.iov_len = sizeof(struct vfio_user_device_feature) + + sizeof(struct vfio_user_device_feature_migration); + + if (req->argsz < msg->out.iov.iov_len) { + iov_free(&msg->out.iov); + return ERROR_INT(EINVAL); + } + + msg->out.iov.iov_base = calloc(1, msg->out.iov.iov_len); - msg->out.iov.iov_len = MIN(dirty_pages_in->argsz, argsz); - msg->out.iov.iov_base = malloc(msg->out.iov.iov_len); if (msg->out.iov.iov_base == NULL) { - return -1; + return ERROR_INT(ENOMEM); } - dirty_pages_out = msg->out.iov.iov_base; - memcpy(dirty_pages_out, dirty_pages_in, sizeof(*dirty_pages_out)); - dirty_pages_out->argsz = argsz; - /* - * If the reply doesn't fit, reply with just the dirty pages header, giving - * the needed argsz. Typically this shouldn't happen, as the client knows - * the needed reply size and has already provided the correct bitmap size. - */ - if (dirty_pages_in->argsz >= argsz) { - void *bitmap_out = msg->out.iov.iov_base + sizeof(*dirty_pages_out) - + sizeof(*range_out); - range_out = msg->out.iov.iov_base + sizeof(*dirty_pages_out); - memcpy(range_out, range_in, sizeof(*range_out)); - ret = dma_controller_dirty_page_get(vfu_ctx->dma, - (vfu_dma_addr_t)(uintptr_t)range_in->iova, - range_in->size, - range_in->bitmap.pgsize, - range_in->bitmap.size, bitmap_out); - if (ret != 0) { - ret = errno; - vfu_log(vfu_ctx, LOG_WARNING, - "failed to get dirty bitmap from DMA controller: %m"); - free(msg->out.iov.iov_base); - msg->out.iov.iov_base = NULL; - msg->out.iov.iov_len = 0; - return ERROR_INT(ret); + memcpy(msg->out.iov.iov_base, msg->in.iov.iov_base, + sizeof(struct vfio_user_device_feature)); + + struct vfio_user_device_feature *res = msg->out.iov.iov_base; + res->argsz = msg->out.iov.iov_len; + + switch (req->flags & VFIO_DEVICE_FEATURE_MASK) { + case VFIO_DEVICE_FEATURE_MIGRATION: { + struct vfio_user_device_feature_migration *mig = + (void *)res->data; + // FIXME are these always supported? Can we consider to be + // "supported" if said support is just an empty callback? + // + // We don't need to return RUNNING or ERROR since they are + // always supported. 
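+            // Report support for the stop-copy and pre-copy flows only: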
+ mig->flags = VFIO_MIGRATION_STOP_COPY + | VFIO_MIGRATION_PRE_COPY; + return 0; } - } else { - vfu_log(vfu_ctx, LOG_ERR, - "dirty pages: get [%#llx, %#llx): buffer too small (%u < %zu)", - (ull_t)range_in->iova, (ull_t)range_in->iova + range_in->size, - dirty_pages_in->argsz, argsz); + + case VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE: { + struct vfio_user_device_feature_mig_state *state = + (void *)res->data; + state->device_state = migration_get_state(vfu_ctx); + return 0; + } + + default: + vfu_log(vfu_ctx, LOG_ERR, "invalid flags for migration GET (%d)", + req->flags); + return ERROR_INT(EINVAL); } +} - return 0; +static int +handle_migration_device_feature_set(vfu_ctx_t *vfu_ctx, uint32_t feature, + struct vfio_user_device_feature *res) +{ + assert(feature == VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE); + + struct vfio_user_device_feature_mig_state *state = (void *)res->data; + + return migration_set_state(vfu_ctx, state->device_state); } static int -handle_dirty_pages(vfu_ctx_t *vfu_ctx, vfu_msg_t *msg) +handle_dma_device_feature_get(vfu_ctx_t *vfu_ctx, vfu_msg_t *msg, + struct vfio_user_device_feature *req) { - struct vfio_user_dirty_pages *dirty_pages = msg->in.iov.iov_base; - int ret; + const size_t header_size = sizeof(struct vfio_user_device_feature) + + sizeof(struct vfio_user_device_feature_dma_logging_report); + + struct vfio_user_device_feature_dma_logging_report *rep = + (void *)req->data; + + dma_controller_t *dma = vfu_ctx->dma; + + if (dma == NULL) { + vfu_log(vfu_ctx, LOG_ERR, "DMA not enabled for DMA device feature"); + return ERROR_INT(EINVAL); + } + + ssize_t bitmap_size = get_bitmap_size(rep->length, rep->page_size); + if (bitmap_size < 0) { + return bitmap_size; + } + + msg->out.iov.iov_len = header_size + bitmap_size; + + if (req->argsz < msg->out.iov.iov_len) { + iov_free(&msg->out.iov); + return ERROR_INT(EINVAL); + } + + msg->out.iov.iov_base = calloc(1, msg->out.iov.iov_len); + + if (msg->out.iov.iov_base == NULL) { + return ERROR_INT(ENOMEM); + } + + memcpy(msg->out.iov.iov_base, msg->in.iov.iov_base, header_size); + + struct vfio_user_device_feature *res = msg->out.iov.iov_base; + + res->argsz = msg->out.iov.iov_len; + char *bitmap = (char *)msg->out.iov.iov_base + header_size; + + int ret = dma_controller_dirty_page_get(dma, + (vfu_dma_addr_t) rep->iova, + rep->length, + rep->page_size, + bitmap_size, + bitmap); + + if (ret < 0) { + iov_free(&msg->out.iov); + } + + return ret; +} + +static int +handle_dma_device_feature_set(vfu_ctx_t *vfu_ctx, uint32_t feature, + struct vfio_user_device_feature *res) +{ + dma_controller_t *dma = vfu_ctx->dma; + + assert(dma != NULL); + + if (feature == VFIO_DEVICE_FEATURE_DMA_LOGGING_START) { + struct vfio_user_device_feature_dma_logging_control *ctl = + (void *)res->data; + return dma_controller_dirty_page_logging_start(dma, + ctl->page_size); + } + + assert(feature == VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP); + + dma_controller_dirty_page_logging_stop(dma); + return 0; +} + +static int +handle_device_feature(vfu_ctx_t *vfu_ctx, vfu_msg_t *msg) +{ assert(vfu_ctx != NULL); assert(msg != NULL); - if (msg->in.iov.iov_len < sizeof(*dirty_pages) || - dirty_pages->argsz < sizeof(*dirty_pages)) { - vfu_log(vfu_ctx, LOG_ERR, "invalid message size %zu", msg->in.iov.iov_len); + if (msg->in.iov.iov_len < sizeof(struct vfio_user_device_feature)) { + vfu_log(vfu_ctx, LOG_ERR, "message too short (%ld)", + msg->in.iov.iov_len); return ERROR_INT(EINVAL); } - if (vfu_ctx->migration == NULL) { - vfu_log(vfu_ctx, LOG_ERR, "migration not configured"); - 
return ERROR_INT(ENOTSUP); + struct vfio_user_device_feature *req = msg->in.iov.iov_base; + + uint32_t operations = req->flags & ~VFIO_DEVICE_FEATURE_MASK; + uint32_t feature = req->flags & VFIO_DEVICE_FEATURE_MASK; + + uint32_t supported_ops = device_feature_flags_supported(vfu_ctx, feature); + + if ((req->flags & supported_ops) != operations || supported_ops == 0) { + vfu_log(vfu_ctx, LOG_ERR, "unsupported operation(s), flags=%d", + req->flags); + return ERROR_INT(EINVAL); } - switch (dirty_pages->flags) { - case VFIO_IOMMU_DIRTY_PAGES_FLAG_START: - ret = dma_controller_dirty_page_logging_start(vfu_ctx->dma, - migration_get_pgsize(vfu_ctx->migration)); - break; + ssize_t ret; - case VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP: - dma_controller_dirty_page_logging_stop(vfu_ctx->dma); - ret = 0; - break; + switch (operations) { + case VFIO_DEVICE_FEATURE_GET: { + if (is_migration_feature(feature)) { + ret = handle_migration_device_feature_get(vfu_ctx, msg, req); + } else if (is_dma_feature(feature)) { + ret = handle_dma_device_feature_get(vfu_ctx, msg, req); + } else { + vfu_log(vfu_ctx, LOG_ERR, "unsupported feature %d for GET", + feature); + return ERROR_INT(EINVAL); + } + break; + } - case VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP: - ret = handle_dirty_pages_get(vfu_ctx, msg); - break; + case VFIO_DEVICE_FEATURE_SET: { + msg->out.iov.iov_len = msg->in.iov.iov_len; - default: - vfu_log(vfu_ctx, LOG_ERR, "bad flags %#x", dirty_pages->flags); - ret = ERROR_INT(EINVAL); - break; + if (req->argsz < msg->out.iov.iov_len) { + vfu_log(vfu_ctx, LOG_ERR, "bad argsz (%d<%ld)", req->argsz, + msg->out.iov.iov_len); + iov_free(&msg->out.iov); + return ERROR_INT(EINVAL); + } + + msg->out.iov.iov_base = malloc(msg->out.iov.iov_len); + + if (msg->out.iov.iov_base == NULL) { + return ERROR_INT(ENOMEM); + } + + memcpy(msg->out.iov.iov_base, msg->in.iov.iov_base, + msg->out.iov.iov_len); + + struct vfio_user_device_feature *res = msg->out.iov.iov_base; + + if (is_migration_feature(feature)) { + ret = handle_migration_device_feature_set(vfu_ctx, feature, res); + } else if (is_dma_feature(feature)) { + ret = handle_dma_device_feature_set(vfu_ctx, feature, res); + } else { + vfu_log(vfu_ctx, LOG_ERR, "unsupported feature %d for SET", + feature); + return ERROR_INT(EINVAL); + } + break; + } + + default: { + /* + * PROBE allows GET/SET to also be set (to specify which operations + * we want to probe the feature for), so we only check that PROBE + * is set, not that it is the only operation flag set. 
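+             * For example, flags == (PROBE | GET | SET) asks whether the
+             * feature supports both GET and SET; it succeeds only when
+             * device_feature_flags_supported() reports both operations.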
+ */ + if (!(operations & VFIO_DEVICE_FEATURE_PROBE)) { + vfu_log(vfu_ctx, LOG_ERR, "no operation specified"); + return ERROR_INT(EINVAL); + } + + msg->out.iov.iov_len = msg->in.iov.iov_len; + + if (req->argsz < msg->out.iov.iov_len) { + vfu_log(vfu_ctx, LOG_ERR, "bad argsz (%d<%ld)", req->argsz, + msg->out.iov.iov_len); + iov_free(&msg->out.iov); + return ERROR_INT(EINVAL); + } + + msg->out.iov.iov_base = malloc(msg->out.iov.iov_len); + + if (msg->out.iov.iov_base == NULL) { + return ERROR_INT(ENOMEM); + } + + memcpy(msg->out.iov.iov_base, msg->in.iov.iov_base, + msg->out.iov.iov_len); + + ret = 0; + } } return ret; @@ -1207,13 +1368,16 @@ handle_request(vfu_ctx_t *vfu_ctx, vfu_msg_t *msg) ret = device_reset(vfu_ctx, VFU_RESET_DEVICE); break; - case VFIO_USER_DIRTY_PAGES: - // FIXME: don't allow migration calls if migration == NULL - if (vfu_ctx->dma != NULL) { - ret = handle_dirty_pages(vfu_ctx, msg); - } else { - ret = 0; - } + case VFIO_USER_DEVICE_FEATURE: + ret = handle_device_feature(vfu_ctx, msg); + break; + + case VFIO_USER_MIG_DATA_READ: + ret = handle_mig_data_read(vfu_ctx, msg); + break; + + case VFIO_USER_MIG_DATA_WRITE: + ret = handle_mig_data_write(vfu_ctx, msg); break; default: @@ -1317,7 +1481,8 @@ MOCK_DEFINE(cmd_allowed_when_stopped_and_copying)(uint16_t cmd) { return cmd == VFIO_USER_REGION_READ || cmd == VFIO_USER_REGION_WRITE || - cmd == VFIO_USER_DIRTY_PAGES; + cmd == VFIO_USER_DEVICE_FEATURE || + cmd == VFIO_USER_MIG_DATA_READ; } bool @@ -1343,14 +1508,14 @@ static bool access_needs_quiesce(const vfu_ctx_t *vfu_ctx, size_t region_index, uint64_t offset) { - return access_migration_needs_quiesce(vfu_ctx, region_index, offset) - || access_is_pci_cap_exp(vfu_ctx, region_index, offset); + return access_is_pci_cap_exp(vfu_ctx, region_index, offset); } static bool command_needs_quiesce(vfu_ctx_t *vfu_ctx, const vfu_msg_t *msg) { struct vfio_user_region_access *reg; + struct vfio_user_device_feature *feature; if (vfu_ctx->quiesce == NULL) { return false; @@ -1364,22 +1529,11 @@ command_needs_quiesce(vfu_ctx_t *vfu_ctx, const vfu_msg_t *msg) case VFIO_USER_DEVICE_RESET: return true; - case VFIO_USER_DIRTY_PAGES: { - struct vfio_user_dirty_pages *dirty_pages = msg->in.iov.iov_base; - - if (msg->in.iov.iov_len < sizeof(*dirty_pages)) { - return false; - } - - return !(dirty_pages->flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP); - } - case VFIO_USER_REGION_WRITE: if (msg->in.iov.iov_len < sizeof(*reg)) { /* * bad request, it will be eventually failed by * handle_region_access - * */ return false; } @@ -1388,8 +1542,23 @@ command_needs_quiesce(vfu_ctx_t *vfu_ctx, const vfu_msg_t *msg) return true; } break; + + case VFIO_USER_DEVICE_FEATURE: + if (msg->in.iov.iov_len < sizeof(*feature)) { + /* + * bad request, it will be eventually failed by + * handle_region_access + */ + return false; + } + feature = msg->in.iov.iov_base; + if (migration_feature_needs_quiesce(feature)) { + return true; + } + break; } + return false; } @@ -1842,38 +2011,6 @@ copyin_mmap_areas(vfu_reg_info_t *reg_info, return 0; } -static bool -ranges_intersect(size_t off1, size_t size1, size_t off2, size_t size2) -{ - /* - * For two ranges to intersect, the start of each range must be before the - * end of the other range. - * TODO already defined in lib/pci_caps.c, maybe introduce a file for misc - * utility functions? 
- */ - return (off1 < (off2 + size2) && off2 < (off1 + size1)); -} - -static bool -maps_over_migr_regs(struct iovec *iov) -{ - return ranges_intersect(0, vfu_get_migr_register_area_size(), - (size_t)iov->iov_base, iov->iov_len); -} - -static bool -validate_sparse_mmaps_for_migr_reg(vfu_reg_info_t *reg) -{ - int i; - - for (i = 0; i < reg->nr_mmap_areas; i++) { - if (maps_over_migr_regs(®->mmap_areas[i])) { - return false; - } - } - return true; -} - EXPORT int vfu_setup_region(vfu_ctx_t *vfu_ctx, int region_idx, size_t size, vfu_region_access_cb_t *cb, int flags, @@ -1919,12 +2056,6 @@ vfu_setup_region(vfu_ctx_t *vfu_ctx, int region_idx, size_t size, return ERROR_INT(EINVAL); } - if (region_idx == VFU_PCI_DEV_MIGR_REGION_IDX && - size < vfu_get_migr_register_area_size()) { - vfu_log(vfu_ctx, LOG_ERR, "invalid migration region size %zu", size); - return ERROR_INT(EINVAL); - } - for (i = 0; i < nr_mmap_areas; i++) { struct iovec *iov = &mmap_areas[i]; if ((size_t)iov_end(iov) > size) { @@ -1956,15 +2087,6 @@ vfu_setup_region(vfu_ctx_t *vfu_ctx, int region_idx, size_t size, } } - if (region_idx == VFU_PCI_DEV_MIGR_REGION_IDX) { - if (!validate_sparse_mmaps_for_migr_reg(reg)) { - vfu_log(vfu_ctx, LOG_ERR, - "migration registers cannot be memory mapped"); - errno = EINVAL; - goto err; - } - } - return 0; err: @@ -2044,26 +2166,20 @@ vfu_setup_irq_state_callback(vfu_ctx_t *vfu_ctx, enum vfu_dev_irq_type type, EXPORT int vfu_setup_device_migration_callbacks(vfu_ctx_t *vfu_ctx, - const vfu_migration_callbacks_t *callbacks, - uint64_t data_offset) + const vfu_migration_callbacks_t *callbacks) { int ret = 0; assert(vfu_ctx != NULL); assert(callbacks != NULL); - if (vfu_ctx->reg_info[VFU_PCI_DEV_MIGR_REGION_IDX].size == 0) { - vfu_log(vfu_ctx, LOG_ERR, "no device migration region"); - return ERROR_INT(EINVAL); - } - if (callbacks->version != VFU_MIGR_CALLBACKS_VERS) { vfu_log(vfu_ctx, LOG_ERR, "unsupported migration callbacks version %d", callbacks->version); return ERROR_INT(EINVAL); } - vfu_ctx->migration = init_migration(callbacks, data_offset, &ret); + vfu_ctx->migration = init_migration(callbacks, &ret); if (vfu_ctx->migration == NULL) { vfu_log(vfu_ctx, LOG_ERR, "failed to initialize device migration"); return ERROR_INT(ret); diff --git a/lib/migration.c b/lib/migration.c index 794e7b81..02c29c19 100644 --- a/lib/migration.c +++ b/lib/migration.c @@ -39,17 +39,100 @@ #include "private.h" #include "migration_priv.h" +/* + * This defines valid migration state transitions. Each element in the array + * corresponds to a FROM state and each bit of the element to a TO state. If the + * bit is set, then the transition is allowed. + * + * The indices of each state are those in the vfio_user_device_mig_state enum. 
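+ *
+ * For example, transitions[VFIO_USER_DEVICE_STATE_RUNNING] permits direct
+ * moves only to STOP and PRE_COPY; every other destination must be reached
+ * through intermediate states (see next_state below).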
+ */ +static const char transitions[VFIO_USER_DEVICE_NUM_STATES] = { + [VFIO_USER_DEVICE_STATE_ERROR] = 0, + [VFIO_USER_DEVICE_STATE_STOP] = (1 << VFIO_USER_DEVICE_STATE_RUNNING) | + (1 << VFIO_USER_DEVICE_STATE_STOP_COPY) | + (1 << VFIO_USER_DEVICE_STATE_RESUMING), + [VFIO_USER_DEVICE_STATE_RUNNING] = (1 << VFIO_USER_DEVICE_STATE_STOP) | + (1 << VFIO_USER_DEVICE_STATE_PRE_COPY), + [VFIO_USER_DEVICE_STATE_STOP_COPY] = 1 << VFIO_USER_DEVICE_STATE_STOP, + [VFIO_USER_DEVICE_STATE_RESUMING] = 1 << VFIO_USER_DEVICE_STATE_STOP, + [VFIO_USER_DEVICE_STATE_RUNNING_P2P] = 0, + [VFIO_USER_DEVICE_STATE_PRE_COPY] = (1 << VFIO_USER_DEVICE_STATE_RUNNING) | + (1 << VFIO_USER_DEVICE_STATE_STOP_COPY), + [VFIO_USER_DEVICE_STATE_PRE_COPY_P2P] = 0 +}; + +/* + * The spec dictates that, if no direct transition is allowed, and the + * transition is not one of the explicitly disallowed ones (i.e. anything to + * ERROR, anything from ERROR, and STOP_COPY -> PRE_COPY), we should take the + * shortest allowed path. + * + * This can be indexed as `next_state[current][target] == next`. If next is + * ERROR, then the transition is not allowed. + */ +static const uint32_t +next_state[VFIO_USER_DEVICE_NUM_STATES][VFIO_USER_DEVICE_NUM_STATES] = { + [VFIO_USER_DEVICE_STATE_ERROR] = { 0, 0, 0, 0, 0, 0, 0, 0 }, + [VFIO_USER_DEVICE_STATE_STOP] = { + [VFIO_USER_DEVICE_STATE_ERROR] = VFIO_USER_DEVICE_STATE_ERROR, + [VFIO_USER_DEVICE_STATE_STOP] = VFIO_USER_DEVICE_STATE_STOP, + [VFIO_USER_DEVICE_STATE_RUNNING] = VFIO_USER_DEVICE_STATE_RUNNING, + [VFIO_USER_DEVICE_STATE_STOP_COPY] = VFIO_USER_DEVICE_STATE_STOP_COPY, + [VFIO_USER_DEVICE_STATE_RESUMING] = VFIO_USER_DEVICE_STATE_RESUMING, + [VFIO_USER_DEVICE_STATE_RUNNING_P2P] = VFIO_USER_DEVICE_STATE_ERROR, + [VFIO_USER_DEVICE_STATE_PRE_COPY] = VFIO_USER_DEVICE_STATE_RUNNING, + [VFIO_USER_DEVICE_STATE_PRE_COPY_P2P] = VFIO_USER_DEVICE_STATE_ERROR, + }, + [VFIO_USER_DEVICE_STATE_RUNNING] = { + [VFIO_USER_DEVICE_STATE_ERROR] = VFIO_USER_DEVICE_STATE_ERROR, + [VFIO_USER_DEVICE_STATE_STOP] = VFIO_USER_DEVICE_STATE_STOP, + [VFIO_USER_DEVICE_STATE_RUNNING] = VFIO_USER_DEVICE_STATE_RUNNING, + [VFIO_USER_DEVICE_STATE_STOP_COPY] = VFIO_USER_DEVICE_STATE_STOP, + [VFIO_USER_DEVICE_STATE_RESUMING] = VFIO_USER_DEVICE_STATE_STOP, + [VFIO_USER_DEVICE_STATE_RUNNING_P2P] = VFIO_USER_DEVICE_STATE_ERROR, + [VFIO_USER_DEVICE_STATE_PRE_COPY] = VFIO_USER_DEVICE_STATE_PRE_COPY, + [VFIO_USER_DEVICE_STATE_PRE_COPY_P2P] = VFIO_USER_DEVICE_STATE_ERROR, + }, + [VFIO_USER_DEVICE_STATE_STOP_COPY] = { + [VFIO_USER_DEVICE_STATE_ERROR] = VFIO_USER_DEVICE_STATE_ERROR, + [VFIO_USER_DEVICE_STATE_STOP] = VFIO_USER_DEVICE_STATE_STOP, + [VFIO_USER_DEVICE_STATE_RUNNING] = VFIO_USER_DEVICE_STATE_STOP, + [VFIO_USER_DEVICE_STATE_STOP_COPY] = VFIO_USER_DEVICE_STATE_STOP_COPY, + [VFIO_USER_DEVICE_STATE_RESUMING] = VFIO_USER_DEVICE_STATE_STOP, + [VFIO_USER_DEVICE_STATE_RUNNING_P2P] = VFIO_USER_DEVICE_STATE_ERROR, + [VFIO_USER_DEVICE_STATE_PRE_COPY] = VFIO_USER_DEVICE_STATE_ERROR, + [VFIO_USER_DEVICE_STATE_PRE_COPY_P2P] = VFIO_USER_DEVICE_STATE_ERROR, + }, + [VFIO_USER_DEVICE_STATE_RESUMING] = { + [VFIO_USER_DEVICE_STATE_ERROR] = VFIO_USER_DEVICE_STATE_ERROR, + [VFIO_USER_DEVICE_STATE_STOP] = VFIO_USER_DEVICE_STATE_STOP, + [VFIO_USER_DEVICE_STATE_RUNNING] = VFIO_USER_DEVICE_STATE_STOP, + [VFIO_USER_DEVICE_STATE_STOP_COPY] = VFIO_USER_DEVICE_STATE_STOP, + [VFIO_USER_DEVICE_STATE_RESUMING] = VFIO_USER_DEVICE_STATE_RESUMING, + [VFIO_USER_DEVICE_STATE_RUNNING_P2P] = VFIO_USER_DEVICE_STATE_ERROR, + [VFIO_USER_DEVICE_STATE_PRE_COPY] = 
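+        /* no direct RESUMING -> PRE_COPY edge: go via STOP, then RUNNING */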
VFIO_USER_DEVICE_STATE_STOP, + [VFIO_USER_DEVICE_STATE_PRE_COPY_P2P] = VFIO_USER_DEVICE_STATE_ERROR, + }, + [VFIO_USER_DEVICE_STATE_RUNNING_P2P] = { 0, 0, 0, 0, 0, 0, 0, 0 }, + [VFIO_USER_DEVICE_STATE_PRE_COPY] = { + [VFIO_USER_DEVICE_STATE_ERROR] = VFIO_USER_DEVICE_STATE_ERROR, + [VFIO_USER_DEVICE_STATE_STOP] = VFIO_USER_DEVICE_STATE_RUNNING, + [VFIO_USER_DEVICE_STATE_RUNNING] = VFIO_USER_DEVICE_STATE_RUNNING, + [VFIO_USER_DEVICE_STATE_STOP_COPY] = VFIO_USER_DEVICE_STATE_STOP_COPY, + [VFIO_USER_DEVICE_STATE_RESUMING] = VFIO_USER_DEVICE_STATE_RUNNING, + [VFIO_USER_DEVICE_STATE_RUNNING_P2P] = VFIO_USER_DEVICE_STATE_ERROR, + [VFIO_USER_DEVICE_STATE_PRE_COPY] = VFIO_USER_DEVICE_STATE_PRE_COPY, + [VFIO_USER_DEVICE_STATE_PRE_COPY_P2P] = VFIO_USER_DEVICE_STATE_ERROR, + }, + [VFIO_USER_DEVICE_STATE_PRE_COPY_P2P] = { 0, 0, 0, 0, 0, 0, 0, 0 }, +}; + bool MOCK_DEFINE(vfio_migr_state_transition_is_valid)(uint32_t from, uint32_t to) { - return migr_states[from].state & (1 << to); -} - -EXPORT size_t -vfu_get_migr_register_area_size(void) -{ - return ROUND_UP(sizeof(struct vfio_user_migration_info), - sysconf(_SC_PAGE_SIZE)); + return from < VFIO_USER_DEVICE_NUM_STATES + && to < VFIO_USER_DEVICE_NUM_STATES + && (transitions[from] & (1 << to)) != 0; } /* @@ -57,16 +140,10 @@ vfu_get_migr_register_area_size(void) * in vfu_ctx_t. */ struct migration * -init_migration(const vfu_migration_callbacks_t * callbacks, - uint64_t data_offset, int *err) +init_migration(const vfu_migration_callbacks_t *callbacks, int *err) { struct migration *migr; - if (data_offset < vfu_get_migr_register_area_size()) { - *err = EINVAL; - return NULL; - } - migr = calloc(1, sizeof(*migr)); if (migr == NULL) { *err = ENOMEM; @@ -81,15 +158,13 @@ init_migration(const vfu_migration_callbacks_t * callbacks, migr->pgsize = sysconf(_SC_PAGESIZE); /* FIXME this should be done in vfu_ctx_realize */ - migr->info.device_state = VFIO_DEVICE_STATE_V1_RUNNING; - migr->data_offset = data_offset; + migr->state = VFIO_USER_DEVICE_STATE_RUNNING; migr->callbacks = *callbacks; if (migr->callbacks.transition == NULL || - migr->callbacks.get_pending_bytes == NULL || - migr->callbacks.prepare_data == NULL || migr->callbacks.read_data == NULL || - migr->callbacks.write_data == NULL) { + migr->callbacks.write_data == NULL || + migr->callbacks.version != VFU_MIGR_CALLBACKS_VERS) { free(migr); *err = EINVAL; return NULL; @@ -100,35 +175,29 @@ init_migration(const vfu_migration_callbacks_t * callbacks, void MOCK_DEFINE(migr_state_transition)(struct migration *migr, - enum migr_iter_state state) + enum vfio_user_device_mig_state state) { assert(migr != NULL); - /* FIXME validate that state transition */ - migr->iter.state = state; + migr->state = state; } vfu_migr_state_t -MOCK_DEFINE(migr_state_vfio_to_vfu)(uint32_t device_state) +MOCK_DEFINE(migr_state_vfio_to_vfu)(uint32_t state) { - switch (device_state) { - case VFIO_DEVICE_STATE_V1_STOP: + switch (state) { + case VFIO_USER_DEVICE_STATE_STOP: return VFU_MIGR_STATE_STOP; - case VFIO_DEVICE_STATE_V1_RUNNING: + case VFIO_USER_DEVICE_STATE_RUNNING: return VFU_MIGR_STATE_RUNNING; - case VFIO_DEVICE_STATE_V1_SAVING: - /* - * FIXME How should the device operate during the stop-and-copy - * phase? Should we only allow the migration data to be read from - * the migration region? E.g. Access to any other region should be - * failed? This might be a good question to send to LKML. 
- */ + case VFIO_USER_DEVICE_STATE_STOP_COPY: return VFU_MIGR_STATE_STOP_AND_COPY; - case VFIO_DEVICE_STATE_V1_RUNNING | VFIO_DEVICE_STATE_V1_SAVING: - return VFU_MIGR_STATE_PRE_COPY; - case VFIO_DEVICE_STATE_V1_RESUMING: + case VFIO_USER_DEVICE_STATE_RESUMING: return VFU_MIGR_STATE_RESUME; + case VFIO_USER_DEVICE_STATE_PRE_COPY: + return VFU_MIGR_STATE_PRE_COPY; + default: + return -1; } - return -1; } /** @@ -165,8 +234,7 @@ MOCK_DEFINE(migr_trans_to_valid_state)(vfu_ctx_t *vfu_ctx, struct migration *mig return ret; } } - migr->info.device_state = device_state; - migr_state_transition(migr, VFIO_USER_MIGR_ITER_STATE_INITIAL); + migr_state_transition(migr, device_state); return 0; } @@ -178,372 +246,176 @@ MOCK_DEFINE(handle_device_state)(vfu_ctx_t *vfu_ctx, struct migration *migr, uint32_t device_state, bool notify) { + assert(vfu_ctx != NULL); assert(migr != NULL); - if (!vfio_migr_state_transition_is_valid(migr->info.device_state, - device_state)) { + if (!vfio_migr_state_transition_is_valid(migr->state, device_state)) { return ERROR_INT(EINVAL); } return migr_trans_to_valid_state(vfu_ctx, migr, device_state, notify); } -/** - * Returns 0 on success, -1 on error setting errno. - */ -static ssize_t -handle_pending_bytes(vfu_ctx_t *vfu_ctx, struct migration *migr, - uint64_t *pending_bytes, bool is_write) +size_t +migration_get_state(vfu_ctx_t *vfu_ctx) { - assert(migr != NULL); - assert(pending_bytes != NULL); + return vfu_ctx->migration->state; +} - if (is_write) { +ssize_t +migration_set_state(vfu_ctx_t *vfu_ctx, uint32_t device_state) +{ + struct migration *migr = vfu_ctx->migration; + uint32_t state; + ssize_t ret = 0; + + if (device_state > VFIO_USER_DEVICE_NUM_STATES) { return ERROR_INT(EINVAL); } + + while (migr->state != device_state && ret == 0) { + state = next_state[migr->state][device_state]; - if (migr->iter.state == VFIO_USER_MIGR_ITER_STATE_FINISHED) { - *pending_bytes = 0; - return 0; - } - - switch (migr->iter.state) { - case VFIO_USER_MIGR_ITER_STATE_INITIAL: - case VFIO_USER_MIGR_ITER_STATE_DATA_PREPARED: - /* - * FIXME what happens if data haven't been consumed in the previous - * iteration? Check https://www.spinics.net/lists/kvm/msg228608.html. - */ - *pending_bytes = migr->iter.pending_bytes = migr->callbacks.get_pending_bytes(vfu_ctx); - - if (*pending_bytes == 0) { - migr_state_transition(migr, VFIO_USER_MIGR_ITER_STATE_FINISHED); - } else { - migr_state_transition(migr, VFIO_USER_MIGR_ITER_STATE_STARTED); - } - break; - case VFIO_USER_MIGR_ITER_STATE_STARTED: - /* - * FIXME We might be wrong returning a cached value, check - * https://www.spinics.net/lists/kvm/msg228608.html - * - */ - *pending_bytes = migr->iter.pending_bytes; - break; - default: + if (state == VFIO_USER_DEVICE_STATE_ERROR) { return ERROR_INT(EINVAL); - } - return 0; -} + } -/* - * FIXME reading or writing migration registers with the wrong device state or - * out of sequence is undefined, but should not result in EINVAL, it should - * simply be ignored. However this way it's easier to catch development errors. - * Make this behavior conditional. - */ + ret = handle_device_state(vfu_ctx, migr, state, true); + }; + + return ret; +} -/** - * Returns 0 on success, -1 on error setting errno. 
- */ -static ssize_t -handle_data_offset_when_saving(vfu_ctx_t *vfu_ctx, struct migration *migr, - bool is_write) +ssize_t +handle_mig_data_read(vfu_ctx_t *vfu_ctx, vfu_msg_t *msg) { - int ret = 0; - - assert(migr != NULL); + assert(vfu_ctx != NULL); + assert(msg != NULL); - if (is_write) { - vfu_log(vfu_ctx, LOG_ERR, "data_offset is RO when saving"); + if (msg->in.iov.iov_len < sizeof(struct vfio_user_mig_data)) { + vfu_log(vfu_ctx, LOG_ERR, "message too short (%ld)", + msg->in.iov.iov_len); return ERROR_INT(EINVAL); } - switch (migr->iter.state) { - case VFIO_USER_MIGR_ITER_STATE_STARTED: - ret = migr->callbacks.prepare_data(vfu_ctx, &migr->iter.offset, - &migr->iter.size); - if (ret != 0) { - return ret; - } - /* - * FIXME must first read data_offset and then data_size. They way we've - * implemented it now, if data_size is read before data_offset we - * transition to state VFIO_USER_MIGR_ITER_STATE_DATA_PREPARED without - * calling callbacks.prepare_data, which is wrong. Maybe we need - * separate states for data_offset and data_size. - */ - migr_state_transition(migr, VFIO_USER_MIGR_ITER_STATE_DATA_PREPARED); - break; - case VFIO_USER_MIGR_ITER_STATE_DATA_PREPARED: - /* - * data_offset is invariant during a save iteration. - */ - break; - default: - vfu_log(vfu_ctx, LOG_ERR, - "reading data_offset out of sequence is undefined"); + struct migration *migr = vfu_ctx->migration; + struct vfio_user_mig_data *req = msg->in.iov.iov_base; + + if (vfu_ctx->migration == NULL) { + vfu_log(vfu_ctx, LOG_ERR, "migration not enabled"); return ERROR_INT(EINVAL); } - return 0; -} - -/** - * Returns 0 on success, -1 on error setting errno. - */ -static ssize_t -handle_data_offset(vfu_ctx_t *vfu_ctx, struct migration *migr, - uint64_t *offset, bool is_write) -{ - int ret; - - assert(migr != NULL); - assert(offset != NULL); - - switch (migr->info.device_state) { - case VFIO_DEVICE_STATE_V1_SAVING: - case VFIO_DEVICE_STATE_V1_RUNNING | VFIO_DEVICE_STATE_V1_SAVING: - ret = handle_data_offset_when_saving(vfu_ctx, migr, is_write); - if (ret == 0 && !is_write) { - *offset = migr->iter.offset + migr->data_offset; - } - return ret; - case VFIO_DEVICE_STATE_V1_RESUMING: - if (is_write) { - /* TODO writing to read-only registers should be simply ignored */ - vfu_log(vfu_ctx, LOG_ERR, "bad write to migration data_offset"); - return ERROR_INT(EINVAL); - } - ret = migr->callbacks.prepare_data(vfu_ctx, offset, NULL); - if (ret != 0) { - return ret; - } - *offset += migr->data_offset; - return 0; + if (migr->state != VFIO_USER_DEVICE_STATE_PRE_COPY + && migr->state != VFIO_USER_DEVICE_STATE_STOP_COPY) { + vfu_log(vfu_ctx, LOG_ERR, "bad migration state to read data: %d", + migr->state); + return ERROR_INT(EINVAL); } - /* TODO improve error message */ - vfu_log(vfu_ctx, LOG_ERR, - "bad access to migration data_offset in state %s", - migr_states[migr->info.device_state].name); - return ERROR_INT(EINVAL); -} - -/** - * Returns 0 on success, -1 on failure setting errno. 
- */ -static ssize_t -handle_data_size_when_saving(vfu_ctx_t *vfu_ctx, struct migration *migr, - bool is_write) -{ - assert(migr != NULL); - if (is_write) { - /* TODO improve error message */ - vfu_log(vfu_ctx, LOG_ERR, "data_size is RO when saving"); + if (req->size > vfu_ctx->client_max_data_xfer_size) { + vfu_log(vfu_ctx, LOG_ERR, "transfer size exceeds limit (%d > %ld)", + req->size, vfu_ctx->client_max_data_xfer_size); return ERROR_INT(EINVAL); } - if (migr->iter.state != VFIO_USER_MIGR_ITER_STATE_STARTED && - migr->iter.state != VFIO_USER_MIGR_ITER_STATE_DATA_PREPARED) { - vfu_log(vfu_ctx, LOG_ERR, - "reading data_size ouf of sequence is undefined"); + if (req->argsz < sizeof(struct vfio_user_mig_data) + req->size) { + vfu_log(vfu_ctx, LOG_ERR, "argsz too small (%d < %ld)", + req->argsz, sizeof(struct vfio_user_mig_data) + req->size); return ERROR_INT(EINVAL); } - return 0; -} -/** - * Returns 0 on success, -1 on error setting errno. - */ -static ssize_t -handle_data_size_when_resuming(vfu_ctx_t *vfu_ctx, struct migration *migr, - uint64_t size, bool is_write) -{ - assert(migr != NULL); + msg->out.iov.iov_len = msg->in.iov.iov_len + req->size; + msg->out.iov.iov_base = calloc(1, msg->out.iov.iov_len); - if (is_write) { - return migr->callbacks.data_written(vfu_ctx, size); + if (msg->out.iov.iov_base == NULL) { + return ERROR_INT(ENOMEM); } - return 0; -} -/** - * Returns 0 on success, -1 on failure setting errno. - */ -static ssize_t -handle_data_size(vfu_ctx_t *vfu_ctx, struct migration *migr, - uint64_t *size, bool is_write) -{ - int ret; + struct vfio_user_mig_data *res = msg->out.iov.iov_base; - assert(vfu_ctx != NULL); - assert(size != NULL); - - switch (migr->info.device_state){ - case VFIO_DEVICE_STATE_V1_SAVING: - case VFIO_DEVICE_STATE_V1_RUNNING | VFIO_DEVICE_STATE_V1_SAVING: - ret = handle_data_size_when_saving(vfu_ctx, migr, is_write); - if (ret == 0 && !is_write) { - *size = migr->iter.size; - } + ssize_t ret = migr->callbacks.read_data(vfu_ctx, &res->data, req->size); + + if (ret < 0) { + vfu_log(vfu_ctx, LOG_ERR, "read_data callback failed, errno=%d", errno); + iov_free(&msg->out.iov); return ret; - case VFIO_DEVICE_STATE_V1_RESUMING: - return handle_data_size_when_resuming(vfu_ctx, migr, *size, is_write); } - /* TODO improve error message */ - vfu_log(vfu_ctx, LOG_ERR, "bad access to data_size"); - return ERROR_INT(EINVAL); + + res->size = ret; + res->argsz = sizeof(struct vfio_user_mig_data) + ret; + + return 0; } -/** - * Returns 0 on success, -1 on failure setting errno. 
- */ ssize_t -MOCK_DEFINE(migration_region_access_registers)(vfu_ctx_t *vfu_ctx, char *buf, - size_t count, loff_t pos, - bool is_write) +handle_mig_data_write(vfu_ctx_t *vfu_ctx, vfu_msg_t *msg) { + assert(vfu_ctx != NULL); + assert(msg != NULL); + + if (msg->in.iov.iov_len < sizeof(struct vfio_user_mig_data)) { + vfu_log(vfu_ctx, LOG_ERR, "message too short (%ld)", + msg->in.iov.iov_len); + return ERROR_INT(EINVAL); + } + struct migration *migr = vfu_ctx->migration; - int ret; - uint32_t *device_state, old_device_state; + struct vfio_user_mig_data *req = msg->in.iov.iov_base; - assert(migr != NULL); + if (vfu_ctx->migration == NULL) { + vfu_log(vfu_ctx, LOG_ERR, "migration not enabled"); + return ERROR_INT(EINVAL); + } - switch (pos) { - case offsetof(struct vfio_user_migration_info, device_state): - if (count != sizeof(migr->info.device_state)) { - vfu_log(vfu_ctx, LOG_ERR, - "bad device_state access size %zu", count); - return ERROR_INT(EINVAL); - } - device_state = (uint32_t *)buf; - if (!is_write) { - *device_state = migr->info.device_state; - return 0; - } - old_device_state = migr->info.device_state; - vfu_log(vfu_ctx, LOG_DEBUG, - "migration: transitioning from state %s to state %s", - migr_states[old_device_state].name, - migr_states[*device_state].name); - - ret = handle_device_state(vfu_ctx, migr, *device_state, true); - if (ret == 0) { - vfu_log(vfu_ctx, LOG_DEBUG, - "migration: transitioned from state %s to state %s", - migr_states[old_device_state].name, - migr_states[*device_state].name); - } else { - vfu_log(vfu_ctx, LOG_ERR, - "migration: failed to transition from state %s to state %s", - migr_states[old_device_state].name, - migr_states[*device_state].name); - } - break; - case offsetof(struct vfio_user_migration_info, pending_bytes): - if (count != sizeof(migr->info.pending_bytes)) { - vfu_log(vfu_ctx, LOG_ERR, - "bad pending_bytes access size %zu", count); - return ERROR_INT(EINVAL); - } - ret = handle_pending_bytes(vfu_ctx, migr, (uint64_t *)buf, is_write); - break; - case offsetof(struct vfio_user_migration_info, data_offset): - if (count != sizeof(migr->info.data_offset)) { - vfu_log(vfu_ctx, LOG_ERR, - "bad data_offset access size %zu", count); - return ERROR_INT(EINVAL); - } - ret = handle_data_offset(vfu_ctx, migr, (uint64_t *)buf, is_write); - break; - case offsetof(struct vfio_user_migration_info, data_size): - if (count != sizeof(migr->info.data_size)) { - vfu_log(vfu_ctx, LOG_ERR, - "bad data_size access size %zu", count); - return ERROR_INT(EINVAL); - } - ret = handle_data_size(vfu_ctx, migr, (uint64_t *)buf, is_write); - break; - default: - vfu_log(vfu_ctx, LOG_ERR, - "bad migration region register offset %#llx", - (ull_t)pos); + if (migr->state != VFIO_USER_DEVICE_STATE_RESUMING) { + vfu_log(vfu_ctx, LOG_ERR, "bad migration state to write data: %d", + migr->state); return ERROR_INT(EINVAL); } - return ret; -} -ssize_t -migration_region_access(vfu_ctx_t *vfu_ctx, char *buf, size_t count, - loff_t pos, bool is_write) -{ - struct migration *migr = vfu_ctx->migration; - ssize_t ret; + if (req->size > vfu_ctx->client_max_data_xfer_size) { + vfu_log(vfu_ctx, LOG_ERR, "transfer size exceeds limit (%d > %ld)", + req->size, vfu_ctx->client_max_data_xfer_size); + return ERROR_INT(EINVAL); + } - assert(migr != NULL); - assert(buf != NULL); + if (req->argsz < sizeof(struct vfio_user_mig_data) + req->size) { + vfu_log(vfu_ctx, LOG_ERR, "argsz too small (%d < %ld)", + req->argsz, sizeof(struct vfio_user_mig_data) + req->size); + return ERROR_INT(EINVAL); + } - /* - * 
FIXME don't call the device callback if the migration state is in not in - * pre-copy/stop-and-copy/resuming state, since the behavior is undefined - * in that case. - */ + if (msg->in.iov.iov_len < sizeof(struct vfio_user_mig_data) + req->size) { + vfu_log(vfu_ctx, LOG_ERR, "short write (%d < %ld)", + req->argsz, sizeof(struct vfio_user_mig_data) + req->size); + return ERROR_INT(EINVAL); + } - if (pos + count <= sizeof(struct vfio_user_migration_info)) { - ret = migration_region_access_registers(vfu_ctx, buf, count, - pos, is_write); - if (ret != 0) { - return ret; - } - } else { - - if (pos < (loff_t)migr->data_offset) { - /* - * TODO we can simply ignore the access to that part and handle - * any access to the data region properly. - */ - vfu_log(vfu_ctx, LOG_WARNING, - "bad access to dead space %#llx - %#llx in migration region", - (ull_t)pos, - (ull_t)(pos + count - 1)); - return ERROR_INT(EINVAL); - } + ssize_t ret = migr->callbacks.write_data(vfu_ctx, &req->data, req->size); - pos -= migr->data_offset; - if (is_write) { - ret = migr->callbacks.write_data(vfu_ctx, buf, count, pos); - if (ret < 0) { - return -1; - } - } else { - /* - * FIXME says: - * - * d. Read data_size bytes of data from (region + data_offset) from the - * migration region. - * - * Does this mean that partial reads are not allowed? - */ - ret = migr->callbacks.read_data(vfu_ctx, buf, count, pos); - if (ret < 0) { - return -1; - } - } + if (ret < 0) { + vfu_log(vfu_ctx, LOG_ERR, "write_data callback failed, errno=%d", + errno); + return ret; + } else if (ret != req->size) { + vfu_log(vfu_ctx, LOG_ERR, "migration data partial write of size=%ld", + ret); + return ERROR_INT(EINVAL); } - return count; + return 0; } bool MOCK_DEFINE(device_is_stopped_and_copying)(struct migration *migr) { - return migr != NULL && migr->info.device_state == VFIO_DEVICE_STATE_V1_SAVING; + return migr != NULL && migr->state == VFIO_USER_DEVICE_STATE_STOP_COPY; } bool MOCK_DEFINE(device_is_stopped)(struct migration *migr) { - return migr != NULL && migr->info.device_state == VFIO_DEVICE_STATE_V1_STOP; + return migr != NULL && migr->state == VFIO_USER_DEVICE_STATE_STOP; } size_t @@ -569,17 +441,11 @@ migration_set_pgsize(struct migration *migr, size_t pgsize) } bool -access_migration_needs_quiesce(const vfu_ctx_t *vfu_ctx, size_t region_index, - uint64_t offset) +migration_feature_needs_quiesce(struct vfio_user_device_feature *feature) { - /* - * Writing to the migration state register with an unaligned access won't - * trigger this check but that's not a problem because - * migration_region_access_registers will fail the access. 
- */ - return region_index == VFU_PCI_DEV_MIGR_REGION_IDX - && vfu_ctx->migration != NULL - && offset == offsetof(struct vfio_user_migration_info, device_state); + return ((feature->flags & + (VFIO_DEVICE_FEATURE_SET | VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE)) != 0) + && !(feature->flags & VFIO_DEVICE_FEATURE_PROBE); } /* ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/lib/migration.h b/lib/migration.h index 26fd744f..928a7e57 100644 --- a/lib/migration.h +++ b/lib/migration.h @@ -45,12 +45,19 @@ #include "private.h" struct migration * -init_migration(const vfu_migration_callbacks_t *callbacks, - uint64_t data_offset, int *err); +init_migration(const vfu_migration_callbacks_t *callbacks, int *err); + +size_t +migration_get_state(vfu_ctx_t *vfu_ctx); + +ssize_t +migration_set_state(vfu_ctx_t *vfu_ctx, uint32_t device_state); ssize_t -migration_region_access(vfu_ctx_t *vfu_ctx, char *buf, size_t count, - loff_t pos, bool is_write); +handle_mig_data_read(vfu_ctx_t *vfu_ctx, vfu_msg_t *msg); + +ssize_t +handle_mig_data_write(vfu_ctx_t *vfu_ctx, vfu_msg_t *msg); bool migration_available(vfu_ctx_t *vfu_ctx); @@ -65,6 +72,12 @@ migration_get_pgsize(struct migration *migr); int migration_set_pgsize(struct migration *migr, size_t pgsize); +uint64_t +migration_get_flags(struct migration *migr); + +MOCK_DECLARE(void, migr_state_transition, struct migration *migr, + enum vfio_user_device_mig_state state); + MOCK_DECLARE(bool, vfio_migr_state_transition_is_valid, uint32_t from, uint32_t to); @@ -72,8 +85,7 @@ MOCK_DECLARE(ssize_t, handle_device_state, vfu_ctx_t *vfu_ctx, struct migration *migr, uint32_t device_state, bool notify); bool -access_migration_needs_quiesce(const vfu_ctx_t *vfu_ctx, size_t region_index, - uint64_t offset); +migration_feature_needs_quiesce(struct vfio_user_device_feature *feature); #endif /* LIB_VFIO_USER_MIGRATION_H */ diff --git a/lib/migration_priv.h b/lib/migration_priv.h index d5643af2..83c5f7e5 100644 --- a/lib/migration_priv.h +++ b/lib/migration_priv.h @@ -33,94 +33,12 @@ #include -/* - * FSM to simplify saving device state. - */ -enum migr_iter_state { - VFIO_USER_MIGR_ITER_STATE_INITIAL, - VFIO_USER_MIGR_ITER_STATE_STARTED, - VFIO_USER_MIGR_ITER_STATE_DATA_PREPARED, - VFIO_USER_MIGR_ITER_STATE_FINISHED -}; - struct migration { - /* - * TODO if the user provides an FD then should mmap it and use the migration - * registers in the file - */ - struct vfio_user_migration_info info; + enum vfio_user_device_mig_state state; size_t pgsize; vfu_migration_callbacks_t callbacks; - uint64_t data_offset; - - /* - * This is only for the saving state. The resuming state is simpler so we - * don't need it. 
- */ - struct { - enum migr_iter_state state; - uint64_t pending_bytes; - uint64_t offset; - uint64_t size; - } iter; -}; - -struct migr_state_data { - uint32_t state; - const char *name; -}; - -#define VFIO_DEVICE_STATE_V1_ERROR (VFIO_DEVICE_STATE_V1_SAVING | VFIO_DEVICE_STATE_V1_RESUMING) - -/* valid migration state transitions */ -static const struct migr_state_data migr_states[(VFIO_DEVICE_STATE_MASK + 1)] = { - [VFIO_DEVICE_STATE_V1_STOP] = { - .state = - (1 << VFIO_DEVICE_STATE_V1_STOP) | - (1 << VFIO_DEVICE_STATE_V1_RUNNING), - .name = "stopped" - }, - [VFIO_DEVICE_STATE_V1_RUNNING] = { - .state = - (1 << VFIO_DEVICE_STATE_V1_STOP) | - (1 << VFIO_DEVICE_STATE_V1_RUNNING) | - (1 << VFIO_DEVICE_STATE_V1_SAVING) | - (1 << (VFIO_DEVICE_STATE_V1_RUNNING | VFIO_DEVICE_STATE_V1_SAVING)) | - (1 << VFIO_DEVICE_STATE_V1_RESUMING) | - (1 << VFIO_DEVICE_STATE_V1_ERROR), - .name = "running" - }, - [VFIO_DEVICE_STATE_V1_SAVING] = { - .state = - (1 << VFIO_DEVICE_STATE_V1_STOP) | - (1 << VFIO_DEVICE_STATE_V1_RUNNING) | - (1 << VFIO_DEVICE_STATE_V1_SAVING) | - (1 << VFIO_DEVICE_STATE_V1_ERROR), - .name = "stop-and-copy" - }, - [VFIO_DEVICE_STATE_V1_RUNNING | VFIO_DEVICE_STATE_V1_SAVING] = { - .state = - (1 << VFIO_DEVICE_STATE_V1_STOP) | - (1 << VFIO_DEVICE_STATE_V1_SAVING) | - (1 << VFIO_DEVICE_STATE_V1_RUNNING | VFIO_DEVICE_STATE_V1_SAVING) | - (1 << VFIO_DEVICE_STATE_V1_ERROR), - .name = "pre-copy" - }, - [VFIO_DEVICE_STATE_V1_RESUMING] = { - .state = - (1 << VFIO_DEVICE_STATE_V1_RUNNING) | - (1 << VFIO_DEVICE_STATE_V1_RESUMING) | - (1 << VFIO_DEVICE_STATE_V1_ERROR), - .name = "resuming" - } }; -MOCK_DECLARE(ssize_t, migration_region_access_registers, vfu_ctx_t *vfu_ctx, - char *buf, size_t count, loff_t pos, bool is_write); - -MOCK_DECLARE(void, migr_state_transition, struct migration *migr, - enum migr_iter_state state); - MOCK_DECLARE(vfu_migr_state_t, migr_state_vfio_to_vfu, uint32_t device_state); MOCK_DECLARE(int, state_trans_notify, vfu_ctx_t *vfu_ctx, @@ -129,4 +47,4 @@ MOCK_DECLARE(int, state_trans_notify, vfu_ctx_t *vfu_ctx, #endif -/* ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */ +/* ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */ \ No newline at end of file diff --git a/lib/private.h b/lib/private.h index fdd804f6..6e0170ed 100644 --- a/lib/private.h +++ b/lib/private.h @@ -195,20 +195,6 @@ typedef struct ioeventfd { LIST_ENTRY(ioeventfd) entry; } ioeventfd_t; -static inline int -ERROR_INT(int err) -{ - errno = err; - return -1; -} - -static inline void * -ERROR_PTR(int err) -{ - errno = err; - return NULL; -} - int consume_fd(int *fds, size_t nr_fds, size_t index); diff --git a/samples/client.c b/samples/client.c index ed66a302..e8b737f4 100644 --- a/samples/client.c +++ b/samples/client.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -63,6 +64,8 @@ static char const *irq_to_str[] = { [VFU_DEV_REQ_IRQ] = "REQ" }; +static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER; + struct client_dma_region { /* * Our DMA regions are one page in size so we only need one bit to mark them as @@ -121,12 +124,9 @@ send_version(int sock) "{" "\"capabilities\":{" "\"max_msg_fds\":%u," - "\"max_data_xfer_size\":%u," - "\"migration\":{" - "\"pgsize\":%ld" - "}" + "\"max_data_xfer_size\":%u" "}" - "}", CLIENT_MAX_FDS, CLIENT_MAX_DATA_XFER_SIZE, sysconf(_SC_PAGESIZE)); + "}", CLIENT_MAX_FDS, CLIENT_MAX_DATA_XFER_SIZE); cversion.major = LIB_VFIO_USER_MAJOR; cversion.minor = LIB_VFIO_USER_MINOR; @@ -225,14 +225,11 @@ send_device_reset(int 
sock) } } -/* returns whether a VFIO migration capability is found */ -static bool +static void get_region_vfio_caps(struct vfio_info_cap_header *header, struct vfio_region_info_cap_sparse_mmap **sparse) { - struct vfio_region_info_cap_type *type; unsigned int i; - bool migr = false; while (true) { switch (header->id) { @@ -247,16 +244,6 @@ get_region_vfio_caps(struct vfio_info_cap_header *header, (ull_t)(*sparse)->areas[i].size); } break; - case VFIO_REGION_INFO_CAP_TYPE: - type = (struct vfio_region_info_cap_type*)header; - if (type->type != VFIO_REGION_TYPE_MIGRATION || - type->subtype != VFIO_REGION_SUBTYPE_MIGRATION) { - errx(EXIT_FAILURE, "bad region type %d/%d", type->type, - type->subtype); - } - migr = true; - printf("client: migration region\n"); - break; default: errx(EXIT_FAILURE, "bad VFIO cap ID %#x", header->id); } @@ -265,7 +252,6 @@ get_region_vfio_caps(struct vfio_info_cap_header *header, } header = (struct vfio_info_cap_header*)((char*)header + header->next - sizeof(struct vfio_region_info)); } - return migr; } static void @@ -281,7 +267,7 @@ do_get_device_region_info(int sock, struct vfio_region_info *region_info, } static void -mmap_sparse_areas(int *fds, struct vfio_region_info *region_info, +mmap_sparse_areas(int fd, struct vfio_region_info *region_info, struct vfio_region_info_cap_sparse_mmap *sparse) { size_t i; @@ -293,14 +279,14 @@ mmap_sparse_areas(int *fds, struct vfio_region_info *region_info, char pathname[PATH_MAX]; char buf[PATH_MAX] = ""; - ret = snprintf(pathname, sizeof(pathname), "/proc/self/fd/%d", fds[i]); + ret = snprintf(pathname, sizeof(pathname), "/proc/self/fd/%d", fd); assert(ret != -1 && (size_t)ret < sizeof(pathname)); ret = readlink(pathname, buf, sizeof(buf) - 1); if (ret == -1) { - err(EXIT_FAILURE, "failed to resolve file descriptor %d", fds[i]); + err(EXIT_FAILURE, "failed to resolve file descriptor %d", fd); } addr = mmap(NULL, sparse->areas[i].size, PROT_READ | PROT_WRITE, - MAP_SHARED, fds[i], region_info->offset + + MAP_SHARED, fd, region_info->offset + sparse->areas[i].offset); if (addr == MAP_FAILED) { err(EXIT_FAILURE, @@ -357,16 +343,15 @@ get_device_region_info(int sock, uint32_t index) nr_fds); if (cap_sz) { struct vfio_region_info_cap_sparse_mmap *sparse = NULL; - if (get_region_vfio_caps((struct vfio_info_cap_header*)(region_info + 1), - &sparse)) { - if (sparse != NULL) { - assert((index == VFU_PCI_DEV_BAR1_REGION_IDX && nr_fds == 2) || - (index == VFU_PCI_DEV_MIGR_REGION_IDX && nr_fds == 1)); - assert(nr_fds == sparse->nr_areas); - mmap_sparse_areas(fds, region_info, sparse); - } + get_region_vfio_caps((struct vfio_info_cap_header*)(region_info + 1), + &sparse); + + if (sparse != NULL) { + assert(index == VFU_PCI_DEV_BAR1_REGION_IDX && nr_fds == 1); + mmap_sparse_areas(fds[0], region_info, sparse); + } else { + assert(index != VFU_PCI_DEV_BAR1_REGION_IDX); } - } free(region_info); } @@ -399,7 +384,7 @@ get_device_info(int sock, struct vfio_user_device_info *dev_info) err(EXIT_FAILURE, "failed to get device info"); } - if (dev_info->num_regions != 10) { + if (dev_info->num_regions != 9) { errx(EXIT_FAILURE, "bad number of device regions %d", dev_info->num_regions); } @@ -484,7 +469,6 @@ access_region(int sock, int region, bool is_write, uint64_t offset, .iov_len = data_len } }; - static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER; struct vfio_user_region_access *recv_data; size_t nr_send_iovecs, recv_data_len; int op, ret; @@ -539,6 +523,123 @@ access_region(int sock, int region, bool is_write, uint64_t offset, return 
0; } +static int +set_migration_state(int sock, uint32_t state) +{ + static int msg_id = 0xfab1; + struct vfio_user_device_feature req = { + .argsz = sizeof(struct vfio_user_device_feature) + + sizeof(struct vfio_user_device_feature_mig_state), + .flags = VFIO_DEVICE_FEATURE_SET | VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE + }; + struct vfio_user_device_feature_mig_state change_state = { + .device_state = state, + .data_fd = -1 + }; + struct iovec send_iovecs[3] = { + [1] = { + .iov_base = &req, + .iov_len = sizeof(req) + }, + [2] = { + .iov_base = &change_state, + .iov_len = sizeof(change_state) + } + }; + void *response = alloca(sizeof(req) + sizeof(change_state)); + + if (response == NULL) { + return -1; + } + + pthread_mutex_lock(&mutex); + int ret = tran_sock_msg_iovec(sock, msg_id--, VFIO_USER_DEVICE_FEATURE, + send_iovecs, 3, NULL, 0, NULL, + response, sizeof(req) + sizeof(change_state), + NULL, 0); + pthread_mutex_unlock(&mutex); + + if (ret < 0) { + err(EXIT_FAILURE, "failed to set state: %d", ret); + } + + if (memcmp(&req, response, sizeof(req)) != 0) { + err(EXIT_FAILURE, "invalid response to set_migration_state (header)"); + } + + if (memcmp(&change_state, response + sizeof(req), + sizeof(change_state)) != 0) { + err(EXIT_FAILURE, "invalid response to set_migration_state (payload)"); + } + + return ret; +} + +static ssize_t +read_migr_data(int sock, void *buf, size_t len) +{ + static int msg_id = 0x6904; + struct vfio_user_mig_data req = { + .argsz = sizeof(struct vfio_user_mig_data) + len, + .size = len + }; + struct iovec send_iovecs[2] = { + [1] = { + .iov_base = &req, + .iov_len = sizeof(req) + } + }; + struct vfio_user_mig_data *res = calloc(1, sizeof(req) + len); + + assert(res != NULL); + + pthread_mutex_lock(&mutex); + ssize_t ret = tran_sock_msg_iovec(sock, msg_id--, VFIO_USER_MIG_DATA_READ, + send_iovecs, 2, NULL, 0, NULL, + res, sizeof(req) + len, NULL, 0); + pthread_mutex_unlock(&mutex); + + if (ret < 0) { + err(EXIT_FAILURE, "failed to read migration data: %ld", ret); + } + + memcpy(buf, res->data, res->size); + + ssize_t size = res->size; + + free(res); + + return size; +} + +static ssize_t +write_migr_data(int sock, void *buf, size_t len) +{ + static int msg_id = 0x2023; + struct vfio_user_mig_data req = { + .argsz = sizeof(struct vfio_user_mig_data) + len, + .size = len + }; + struct iovec send_iovecs[3] = { + [1] = { + .iov_base = &req, + .iov_len = sizeof(req) + }, + [2] = { + .iov_base = buf, + .iov_len = len + } + }; + + pthread_mutex_lock(&mutex); + ssize_t ret = tran_sock_msg_iovec(sock, msg_id--, VFIO_USER_MIG_DATA_WRITE, + send_iovecs, 3, NULL, 0, NULL, + &req, sizeof(req), NULL, 0); + pthread_mutex_unlock(&mutex); + + return ret; +} + static void access_bar0(int sock, time_t *t) { @@ -712,34 +813,33 @@ static void get_dirty_bitmap(int sock, struct client_dma_region *dma_region, bool expect_dirty) { - uint64_t bitmap_size = _get_bitmap_size(dma_region->map.size, - sysconf(_SC_PAGESIZE)); - struct vfio_user_dirty_pages *dirty_pages; - struct vfio_user_bitmap_range *range; + struct vfio_user_device_feature *res; + struct vfio_user_device_feature_dma_logging_report *report; char *bitmap; - size_t size; - void *data; int ret; - size = sizeof(*dirty_pages) + sizeof(*range) + bitmap_size; + uint64_t bitmap_size = get_bitmap_size(dma_region->map.size, + sysconf(_SC_PAGESIZE)); - data = calloc(1, size); + size_t size = sizeof(*res) + sizeof(*report) + bitmap_size; + + void *data = calloc(1, size); assert(data != NULL); - dirty_pages = data; - dirty_pages->flags = 
VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP; - dirty_pages->argsz = sizeof(*dirty_pages) + sizeof(*range) + bitmap_size; + res = data; + res->flags = VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT + | VFIO_DEVICE_FEATURE_GET; + res->argsz = size; - range = data + sizeof(*dirty_pages); - range->iova = dma_region->map.addr; - range->size = dma_region->map.size; - range->bitmap.size = bitmap_size; - range->bitmap.pgsize = sysconf(_SC_PAGESIZE); + report = (struct vfio_user_device_feature_dma_logging_report *)(res + 1); + report->iova = dma_region->map.addr; + report->length = dma_region->map.size; + report->page_size = sysconf(_SC_PAGESIZE); - bitmap = data + sizeof(*dirty_pages) + sizeof(*range); + bitmap = data + sizeof(*res) + sizeof(*report); - ret = tran_sock_msg(sock, 0x99, VFIO_USER_DIRTY_PAGES, - data, sizeof(*dirty_pages) + sizeof(*range), + ret = tran_sock_msg(sock, 0x99, VFIO_USER_DEVICE_FEATURE, + data, sizeof(*res) + sizeof(*report), NULL, data, size); if (ret != 0) { err(EXIT_FAILURE, "failed to get dirty page bitmap"); @@ -749,14 +849,14 @@ get_dirty_bitmap(int sock, struct client_dma_region *dma_region, char dirtied_by_client = (dma_region->flags & CLIENT_DIRTY_DMA_REGION) != 0; char dirtied = dirtied_by_server | dirtied_by_client; - printf("client: %s: %#llx-%#llx\t%#x\n", __func__, - (ull_t)range->iova, - (ull_t)(range->iova + range->size - 1), dirtied); - if (expect_dirty) { assert(dirtied); } + printf("client: %s: %#llx-%#llx\t%#x\n", __func__, + (ull_t)report->iova, + (ull_t)(report->iova + report->length - 1), dirtied); + free(data); } @@ -782,64 +882,32 @@ usage(char *argv0) * @returns the number of iterations performed */ static size_t -do_migrate(int sock, size_t nr_iters, struct iovec *migr_iter) +do_migrate(int sock, size_t nr_iters, size_t max_iter_size, + struct iovec *migr_iter) { - int ret; - uint64_t pending_bytes, data_offset, data_size; + ssize_t ret; size_t i = 0; - assert(nr_iters > 0); - - /* XXX read pending_bytes */ - ret = access_region(sock, VFU_PCI_DEV_MIGR_REGION_IDX, false, - offsetof(struct vfio_user_migration_info, pending_bytes), - &pending_bytes, sizeof(pending_bytes)); - if (ret < 0) { - err(EXIT_FAILURE, "failed to read pending_bytes"); - } - - for (i = 0; i < nr_iters && pending_bytes > 0; i++) { - - /* XXX read data_offset and data_size */ - ret = access_region(sock, VFU_PCI_DEV_MIGR_REGION_IDX, false, - offsetof(struct vfio_user_migration_info, data_offset), - &data_offset, sizeof(data_offset)); - if (ret < 0) { - err(EXIT_FAILURE, "failed to read data_offset"); - } + for (i = 0; i < nr_iters; i++) { - ret = access_region(sock, VFU_PCI_DEV_MIGR_REGION_IDX, false, - offsetof(struct vfio_user_migration_info, data_size), - &data_size, sizeof(data_size)); - if (ret < 0) { - err(EXIT_FAILURE, "failed to read data_size"); - } + migr_iter[i].iov_len = max_iter_size; + migr_iter[i].iov_base = malloc(migr_iter[i].iov_len); - migr_iter[i].iov_len = data_size; - migr_iter[i].iov_base = malloc(data_size); if (migr_iter[i].iov_base == NULL) { err(EXIT_FAILURE, "failed to allocate migration buffer"); } /* XXX read migration data */ - ret = access_region(sock, VFU_PCI_DEV_MIGR_REGION_IDX, false, - data_offset, - (char *)migr_iter[i].iov_base, data_size); + ret = read_migr_data(sock, migr_iter[i].iov_base, migr_iter[i].iov_len); if (ret < 0) { err(EXIT_FAILURE, "failed to read migration data"); } - /* FIXME send migration data to the destination client process */ + migr_iter[i].iov_len = ret; - /* - * XXX read pending_bytes again to indicate to the server that the - * 
migration data have been consumed. - */ - ret = access_region(sock, VFU_PCI_DEV_MIGR_REGION_IDX, false, - offsetof(struct vfio_user_migration_info, pending_bytes), - &pending_bytes, sizeof(pending_bytes)); - if (ret < 0) { - err(EXIT_FAILURE, "failed to read pending_bytes"); + // We know we've finished transferring data when we read 0 bytes. + if (ret == 0) { + break; } } return i; @@ -883,11 +951,12 @@ fake_guest(void *arg) static size_t migrate_from(int sock, size_t *nr_iters, struct iovec **migr_iters, - uint32_t *crcp, size_t bar1_size) + uint32_t *crcp, size_t bar1_size, size_t max_iter_size) { + size_t expected_data; uint32_t device_state; + size_t iters; int ret; - size_t _nr_iters; pthread_t thread; struct fake_guest_data fake_guest_data = { .sock = sock, @@ -902,7 +971,9 @@ migrate_from(int sock, size_t *nr_iters, struct iovec **migr_iters, err(EXIT_FAILURE, "failed to create pthread"); } - *nr_iters = 2; + expected_data = bar1_size; + *nr_iters = (expected_data + max_iter_size - 1) / max_iter_size; + assert(*nr_iters == 12); *migr_iters = malloc(sizeof(struct iovec) * *nr_iters); if (*migr_iters == NULL) { err(EXIT_FAILURE, NULL); @@ -912,16 +983,15 @@ migrate_from(int sock, size_t *nr_iters, struct iovec **migr_iters, * XXX set device state to pre-copy. This is technically optional but any * VMM that cares about performance needs this. */ - device_state = VFIO_DEVICE_STATE_V1_SAVING | VFIO_DEVICE_STATE_V1_RUNNING; - ret = access_region(sock, VFU_PCI_DEV_MIGR_REGION_IDX, true, - offsetof(struct vfio_user_migration_info, device_state), - &device_state, sizeof(device_state)); + device_state = VFIO_USER_DEVICE_STATE_PRE_COPY; + ret = set_migration_state(sock, device_state); if (ret < 0) { err(EXIT_FAILURE, "failed to write to device state"); } - _nr_iters = do_migrate(sock, 1, *migr_iters); - assert(_nr_iters == 1); + iters = do_migrate(sock, *nr_iters, max_iter_size, *migr_iters); + assert(iters == *nr_iters); + printf("client: stopping fake guest thread\n"); fake_guest_data.done = true; __sync_synchronize(); @@ -933,31 +1003,32 @@ migrate_from(int sock, size_t *nr_iters, struct iovec **migr_iters, printf("client: setting device state to stop-and-copy\n"); - device_state = VFIO_DEVICE_STATE_V1_SAVING; - ret = access_region(sock, VFU_PCI_DEV_MIGR_REGION_IDX, true, - offsetof(struct vfio_user_migration_info, device_state), - &device_state, sizeof(device_state)); + device_state = VFIO_USER_DEVICE_STATE_STOP_COPY; + ret = set_migration_state(sock, device_state); if (ret < 0) { err(EXIT_FAILURE, "failed to write to device state"); } - _nr_iters += do_migrate(sock, 1, (*migr_iters) + _nr_iters); - if (_nr_iters != 2) { - errx(EXIT_FAILURE, - "expected 2 iterations instead of %zu while in stop-and-copy state", - _nr_iters); + expected_data = bar1_size + sizeof(time_t); + *nr_iters = (expected_data + max_iter_size - 1) / max_iter_size; + assert(*nr_iters == 13); + free(*migr_iters); + *migr_iters = malloc(sizeof(struct iovec) * *nr_iters); + if (*migr_iters == NULL) { + err(EXIT_FAILURE, NULL); } + iters = do_migrate(sock, *nr_iters, max_iter_size, *migr_iters); + assert(iters == *nr_iters); + /* XXX read device state, migration must have finished now */ - device_state = VFIO_DEVICE_STATE_V1_STOP; - ret = access_region(sock, VFU_PCI_DEV_MIGR_REGION_IDX, true, - offsetof(struct vfio_user_migration_info, device_state), - &device_state, sizeof(device_state)); + device_state = VFIO_USER_DEVICE_STATE_STOP; + ret = set_migration_state(sock, device_state); if (ret < 0) { err(EXIT_FAILURE, "failed 
to write to device state"); } - return _nr_iters; + return iters; } static int @@ -966,11 +1037,11 @@ migrate_to(char *old_sock_path, int *server_max_fds, struct iovec *migr_iters, char *path_to_server, uint32_t src_crc, size_t bar1_size) { - int ret, sock; + ssize_t ret; + int sock; char *sock_path; struct stat sb; - uint32_t device_state = VFIO_DEVICE_STATE_V1_RESUMING; - uint64_t data_offset, data_len; + uint32_t device_state = VFIO_USER_DEVICE_STATE_RESUMING; size_t i; uint32_t dst_crc; char buf[bar1_size]; @@ -1020,57 +1091,26 @@ migrate_to(char *old_sock_path, int *server_max_fds, negotiate(sock, server_max_fds, server_max_data_xfer_size, pgsize); - /* XXX set device state to resuming */ - ret = access_region(sock, VFU_PCI_DEV_MIGR_REGION_IDX, true, - offsetof(struct vfio_user_migration_info, device_state), - &device_state, sizeof(device_state)); + device_state = VFIO_USER_DEVICE_STATE_RESUMING; + ret = set_migration_state(sock, device_state); if (ret < 0) { err(EXIT_FAILURE, "failed to set device state to resuming"); } for (i = 0; i < nr_iters; i++) { - - /* XXX read data offset */ - ret = access_region(sock, VFU_PCI_DEV_MIGR_REGION_IDX, false, - offsetof(struct vfio_user_migration_info, data_offset), - &data_offset, sizeof(data_offset)); - if (ret < 0) { - err(EXIT_FAILURE, "failed to read migration data offset"); - } - /* XXX write migration data */ - - /* - * TODO write half of migration data via regular write and other half via - * memopy map. - */ - printf("client: writing migration device data %#llx-%#llx\n", - (ull_t)data_offset, - (ull_t)(data_offset + migr_iters[i].iov_len - 1)); - ret = access_region(sock, VFU_PCI_DEV_MIGR_REGION_IDX, true, - data_offset, migr_iters[i].iov_base, - migr_iters[i].iov_len); + ret = write_migr_data(sock, migr_iters[i].iov_base, + migr_iters[i].iov_len); if (ret < 0) { err(EXIT_FAILURE, "failed to write device migration data"); } - - /* XXX write data_size */ - data_len = migr_iters[i].iov_len; - ret = access_region(sock, VFU_PCI_DEV_MIGR_REGION_IDX, true, - offsetof(struct vfio_user_migration_info, data_size), - &data_len, sizeof(data_len)); - if (ret < 0) { - err(EXIT_FAILURE, "failed to write migration data size"); - } } - /* XXX set device state to running */ - device_state = VFIO_DEVICE_STATE_V1_RUNNING; - ret = access_region(sock, VFU_PCI_DEV_MIGR_REGION_IDX, true, - offsetof(struct vfio_user_migration_info, device_state), - &device_state, sizeof(device_state)); + /* XXX set device state to stop to finish the transfer */ + device_state = VFIO_USER_DEVICE_STATE_STOP; + ret = set_migration_state(sock, device_state); if (ret < 0) { - err(EXIT_FAILURE, "failed to set device state to running"); + err(EXIT_FAILURE, "failed to set device state to stop"); } /* validate contents of BAR1 */ @@ -1086,6 +1126,13 @@ migrate_to(char *old_sock_path, int *server_max_fds, abort(); } + /* XXX set device state to running */ + device_state = VFIO_USER_DEVICE_STATE_RUNNING; + ret = set_migration_state(sock, device_state); + if (ret < 0) { + err(EXIT_FAILURE, "failed to set device state to running"); + } + return sock; } @@ -1125,7 +1172,6 @@ int main(int argc, char *argv[]) size_t server_max_data_xfer_size; size_t pgsize; int nr_dma_regions; - struct vfio_user_dirty_pages dirty_pages = {0}; int opt; time_t t; char *path_to_server = NULL; @@ -1135,6 +1181,14 @@ int main(int argc, char *argv[]) uint32_t crc; size_t bar1_size = 0x3000; /* FIXME get this value from region info */ + struct vfio_user_device_feature *dirty_pages_feature; + struct 
vfio_user_device_feature_dma_logging_control *dirty_pages_control; + size_t dirty_pages_size = sizeof(*dirty_pages_feature) + + sizeof(*dirty_pages_control); + void *dirty_pages = malloc(dirty_pages_size); + dirty_pages_feature = dirty_pages; + dirty_pages_control = (void *)(dirty_pages_feature + 1); + while ((opt = getopt(argc, argv, "h")) != -1) { switch (opt) { case 'h': @@ -1229,11 +1283,16 @@ int main(int argc, char *argv[]) */ irq_fd = configure_irqs(sock); - dirty_pages.argsz = sizeof(dirty_pages); - dirty_pages.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_START; - ret = tran_sock_msg(sock, 0, VFIO_USER_DIRTY_PAGES, - &dirty_pages, sizeof(dirty_pages), - NULL, NULL, 0); + /* start dirty pages logging */ + dirty_pages_feature->argsz = sizeof(*dirty_pages_feature) + + sizeof(*dirty_pages_control); + dirty_pages_feature->flags = VFIO_DEVICE_FEATURE_DMA_LOGGING_START | + VFIO_DEVICE_FEATURE_SET; + dirty_pages_control->num_ranges = 0; + dirty_pages_control->page_size = sysconf(_SC_PAGESIZE); + + ret = tran_sock_msg(sock, 0, VFIO_USER_DEVICE_FEATURE, dirty_pages, + dirty_pages_size, NULL, dirty_pages, dirty_pages_size); if (ret != 0) { err(EXIT_FAILURE, "failed to start dirty page logging"); } @@ -1270,11 +1329,16 @@ int main(int argc, char *argv[]) get_dirty_bitmap(sock, &dma_regions[i], i < 2); } - dirty_pages.argsz = sizeof(dirty_pages); - dirty_pages.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP; - ret = tran_sock_msg(sock, 0, VFIO_USER_DIRTY_PAGES, - &dirty_pages, sizeof(dirty_pages), - NULL, NULL, 0); + /* stop logging dirty pages */ + dirty_pages_feature->argsz = sizeof(*dirty_pages_feature) + + sizeof(*dirty_pages_control); + dirty_pages_feature->flags = VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP | + VFIO_DEVICE_FEATURE_SET; + dirty_pages_control->num_ranges = 0; + dirty_pages_control->page_size = sysconf(_SC_PAGESIZE); + + ret = tran_sock_msg(sock, 0, VFIO_USER_DEVICE_FEATURE, dirty_pages, + dirty_pages_size, NULL, dirty_pages, dirty_pages_size); if (ret != 0) { err(EXIT_FAILURE, "failed to stop dirty page logging"); } @@ -1316,7 +1380,8 @@ int main(int argc, char *argv[]) err(EXIT_FAILURE, "failed to write to BAR0"); } - nr_iters = migrate_from(sock, &nr_iters, &migr_iters, &crc, bar1_size); + nr_iters = migrate_from(sock, &nr_iters, &migr_iters, &crc, bar1_size, + MIN(server_max_data_xfer_size, CLIENT_MAX_DATA_XFER_SIZE)); /* * Normally the client would now send the device state to the destination @@ -1374,6 +1439,7 @@ int main(int argc, char *argv[]) } free(dma_regions); + free(dirty_pages); return 0; } diff --git a/samples/gpio-pci-idio-16.c b/samples/gpio-pci-idio-16.c index b50f407b..6c4e99b8 100644 --- a/samples/gpio-pci-idio-16.c +++ b/samples/gpio-pci-idio-16.c @@ -77,49 +77,23 @@ migration_device_state_transition(vfu_ctx_t *vfu_ctx, vfu_migr_state_t state) return 0; } -static uint64_t -migration_get_pending_bytes(UNUSED vfu_ctx_t *vfu_ctx) +static ssize_t +migration_read_data(UNUSED vfu_ctx_t *vfu_ctx, void *buf, uint64_t size) { + assert(size == sizeof(pin)); + if (dirty) { + memcpy(buf, &pin, sizeof(pin)); + dirty = false; return sizeof(pin); } - return 0; -} -static int -migration_prepare_data(UNUSED vfu_ctx_t *vfu_ctx, - uint64_t *offset, uint64_t *size) -{ - *offset = 0; - if (size != NULL) { /* null means resuming */ - *size = sizeof(pin); - } return 0; } static ssize_t -migration_read_data(UNUSED vfu_ctx_t *vfu_ctx, void *buf, - uint64_t size, uint64_t offset) +migration_write_data(UNUSED vfu_ctx_t *vfu_ctx, void *buf, uint64_t size) { - assert(offset == 0); - assert(size == 
sizeof(pin)); - memcpy(buf, &pin, sizeof(pin)); - dirty = false; - return 0; -} - -static int -migration_data_written(UNUSED vfu_ctx_t *vfu_ctx, uint64_t count) -{ - assert(count == sizeof(pin)); - return 0; -} - -static ssize_t -migration_write_data(UNUSED vfu_ctx_t *vfu_ctx, void *buf, - uint64_t size, uint64_t offset) -{ - assert(offset == 0); assert(size == sizeof(pin)); memcpy(&pin, buf, sizeof(pin)); return 0; @@ -145,16 +119,10 @@ main(int argc, char *argv[]) int opt; struct sigaction act = { .sa_handler = _sa_handler }; vfu_ctx_t *vfu_ctx; - size_t migr_regs_size = vfu_get_migr_register_area_size(); - size_t migr_data_size = sysconf(_SC_PAGE_SIZE); - size_t migr_size = migr_regs_size + migr_data_size; const vfu_migration_callbacks_t migr_callbacks = { .version = VFU_MIGR_CALLBACKS_VERS, .transition = &migration_device_state_transition, - .get_pending_bytes = &migration_get_pending_bytes, - .prepare_data = &migration_prepare_data, .read_data = &migration_read_data, - .data_written = &migration_data_written, .write_data = &migration_write_data }; @@ -214,13 +182,7 @@ main(int argc, char *argv[]) } if (enable_migr) { - ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_MIGR_REGION_IDX, migr_size, - NULL, VFU_REGION_FLAG_RW, NULL, 0, -1, 0); - if (ret < 0) { - err(EXIT_FAILURE, "failed to setup migration region"); - } - ret = vfu_setup_device_migration_callbacks(vfu_ctx, &migr_callbacks, - migr_regs_size); + ret = vfu_setup_device_migration_callbacks(vfu_ctx, &migr_callbacks); if (ret < 0) { err(EXIT_FAILURE, "failed to setup device migration"); } diff --git a/samples/server.c b/samples/server.c index 565974d2..5edf6746 100644 --- a/samples/server.c +++ b/samples/server.c @@ -60,7 +60,7 @@ struct server_data { size_t bar1_size; struct dma_regions regions[NR_DMA_REGIONS]; struct { - uint64_t pending_bytes; + uint64_t bytes_transferred; vfu_migr_state_t state; } migration; }; @@ -130,10 +130,6 @@ bar1_access(vfu_ctx_t *vfu_ctx, char * const buf, } if (is_write) { - if (server_data->migration.state == VFU_MIGR_STATE_PRE_COPY) { - /* dirty the whole thing */ - server_data->migration.pending_bytes = server_data->bar1_size; - } memcpy(server_data->bar1 + offset, buf, count); } else { memcpy(buf, server_data->bar1, count); @@ -322,19 +318,24 @@ migration_device_state_transition(vfu_ctx_t *vfu_ctx, vfu_migr_state_t state) if (setitimer(ITIMER_REAL, &new, NULL) != 0) { err(EXIT_FAILURE, "failed to disable timer"); } - server_data->migration.pending_bytes = server_data->bar1_size + sizeof(time_t); /* FIXME BAR0 region size */ + server_data->migration.bytes_transferred = 0; break; case VFU_MIGR_STATE_PRE_COPY: - /* TODO must be less than size of data region in migration region */ - server_data->migration.pending_bytes = server_data->bar1_size; + server_data->migration.bytes_transferred = 0; break; case VFU_MIGR_STATE_STOP: /* FIXME should gracefully fail */ - assert(server_data->migration.pending_bytes == 0); + if (server_data->migration.state == VFU_MIGR_STATE_STOP_AND_COPY) { + assert(server_data->migration.bytes_transferred == + server_data->bar1_size + sizeof(time_t)); + } break; case VFU_MIGR_STATE_RESUME: + server_data->migration.bytes_transferred = 0; break; case VFU_MIGR_STATE_RUNNING: + assert(server_data->migration.bytes_transferred == + server_data->bar1_size + sizeof(time_t)); ret = arm_timer(vfu_ctx, server_data->bar0); if (ret < 0) { return ret; @@ -347,125 +348,100 @@ migration_device_state_transition(vfu_ctx_t *vfu_ctx, vfu_migr_state_t state) return 0; } -static uint64_t 
-migration_get_pending_bytes(vfu_ctx_t *vfu_ctx)
-{
-    struct server_data *server_data = vfu_get_private(vfu_ctx);
-    return server_data->migration.pending_bytes;
-}
-
-static int
-migration_prepare_data(vfu_ctx_t *vfu_ctx, uint64_t *offset, uint64_t *size)
-{
-    struct server_data *server_data = vfu_get_private(vfu_ctx);
-
-    *offset = 0;
-    if (size != NULL) {
-        *size = server_data->migration.pending_bytes;
-    }
-    return 0;
-}
-
 static ssize_t
-migration_read_data(vfu_ctx_t *vfu_ctx, void *buf,
-                    uint64_t size, uint64_t offset)
+migration_read_data(vfu_ctx_t *vfu_ctx, void *buf, uint64_t size)
 {
     struct server_data *server_data = vfu_get_private(vfu_ctx);
 
-    if (server_data->migration.state != VFU_MIGR_STATE_PRE_COPY &&
-        server_data->migration.state != VFU_MIGR_STATE_STOP_AND_COPY)
-    {
-        return size;
-    }
-
     /*
-     * For ease of implementation we expect the client to read all migration
-     * data in one go; partial reads are not supported. This is allowed by VFIO
-     * however we don't yet support it. Similarly, when resuming, partial
-     * writes are supported by VFIO, however we don't in this sample.
-     *
      * If in pre-copy state we copy BAR1, if in stop-and-copy state we copy
      * both BAR1 and BAR0. Since we always copy BAR1 in the stop-and-copy state,
      * copying BAR1 in the pre-copy state is pointless. Fixing this requires
      * more complex state tracking which exceeds the scope of this sample.
      */
-    if (offset != 0 || size != server_data->migration.pending_bytes) {
-        errno = EINVAL;
-        return -1;
-    }
+    uint32_t total_to_read = server_data->bar1_size;
 
-    memcpy(buf, server_data->bar1, server_data->bar1_size);
     if (server_data->migration.state == VFU_MIGR_STATE_STOP_AND_COPY) {
-        memcpy(buf + server_data->bar1_size, &server_data->bar0,
-               sizeof(server_data->bar0));
+        total_to_read += sizeof(server_data->bar0);
+    }
+
+    if (server_data->migration.bytes_transferred == total_to_read || size == 0) {
+        vfu_log(vfu_ctx, LOG_DEBUG, "no data left to read");
+        return 0;
+    }
+
+    uint32_t read_start = server_data->migration.bytes_transferred;
+    uint32_t read_end = MIN(read_start + size, total_to_read); /* exclusive */
+    assert(read_end > read_start);
+
+    uint32_t bytes_read = read_end - read_start;
+
+    uint32_t length_in_bar1 = 0;
+    uint32_t length_in_bar0 = 0;
+
+    /* read bar1, if any */
+    if (read_start < server_data->bar1_size) {
+        length_in_bar1 = MIN(bytes_read, server_data->bar1_size - read_start);
+        memcpy(buf, server_data->bar1 + read_start, length_in_bar1);
+        read_start += length_in_bar1;
+    }
+
+    /*
+     * read bar0, if any; the offset into bar0 is in bytes, so cast to char *
+     * to avoid pointer arithmetic scaled by sizeof(bar0)
+     */
+    if (read_end > server_data->bar1_size) {
+        length_in_bar0 = read_end - read_start;
+        read_start -= server_data->bar1_size;
+        memcpy(buf + length_in_bar1, (char *)&server_data->bar0 + read_start,
+               length_in_bar0);
     }
 
-    server_data->migration.pending_bytes = 0;
-    return size;
+    server_data->migration.bytes_transferred += bytes_read;
+
+    return bytes_read;
 }
 
 static ssize_t
-migration_write_data(vfu_ctx_t *vfu_ctx, void *data,
-                     uint64_t size, uint64_t offset)
+migration_write_data(vfu_ctx_t *vfu_ctx, void *data, uint64_t size)
 {
     struct server_data *server_data = vfu_get_private(vfu_ctx);
     char *buf = data;
-    int ret;
 
     assert(server_data != NULL);
     assert(data != NULL);
 
-    if (offset != 0 || size < server_data->bar1_size) {
-        vfu_log(vfu_ctx, LOG_DEBUG, "XXX bad migration data write %#llx-%#llx",
-                (unsigned long long)offset,
-                (unsigned long long)offset + size - 1);
-        errno = EINVAL;
-        return -1;
-    }
+    uint32_t total_to_write = server_data->bar1_size + sizeof(server_data->bar0);
 
-    memcpy(server_data->bar1, buf, server_data->bar1_size);
-    buf += server_data->bar1_size;
-    size -= server_data->bar1_size;
-    if (size == 0) {
+    if (server_data->migration.bytes_transferred == total_to_write || size == 0) {
         return 0;
     }
 
-    if (size != sizeof(server_data->bar0)) {
-        errno = EINVAL;
-        return -1;
-    }
-    memcpy(&server_data->bar0, buf, sizeof(server_data->bar0));
-    ret = bar0_access(vfu_ctx, buf, sizeof(server_data->bar0), 0, true);
-    assert(ret == (int)size); /* FIXME */
-    return 0;
-}
+    uint32_t write_start = server_data->migration.bytes_transferred;
+    uint32_t write_end = MIN(write_start + size, total_to_write); /* exclusive */
+    assert(write_end > write_start);
+    uint32_t bytes_written = write_end - write_start;
 
-static int
-migration_data_written(UNUSED vfu_ctx_t *vfu_ctx, UNUSED uint64_t count)
-{
-    /*
-     * We apply migration state directly in the migration_write_data callback,
-     * so we don't need to do anything here. We would have to apply migration
-     * state in this callback if the migration region was memory mappable, in
-     * which case we wouldn't know when the client wrote migration data.
-     */
+    uint32_t length_in_bar1 = 0;
+    uint32_t length_in_bar0 = 0;
 
-    return 0;
-}
+    /* write to bar1, if any */
+    if (write_start < server_data->bar1_size) {
+        length_in_bar1 = MIN(bytes_written, server_data->bar1_size - write_start);
+        memcpy(server_data->bar1 + write_start, buf, length_in_bar1);
+        write_start += length_in_bar1;
+    }
 
-static size_t
-nr_pages(size_t size)
-{
-    return (size / sysconf(_SC_PAGE_SIZE) +
-            (size % sysconf(_SC_PAGE_SIZE) > 1));
-}
+    /* write to bar0, if any; same byte-offset cast as in the read path */
+    if (write_end > server_data->bar1_size) {
+        length_in_bar0 = write_end - write_start;
+        write_start -= server_data->bar1_size;
+        memcpy((char *)&server_data->bar0 + write_start, buf + length_in_bar1,
+               length_in_bar0);
+    }
 
-static size_t
-page_align(size_t size)
-{
-    return nr_pages(size) * sysconf(_SC_PAGE_SIZE);
+    server_data->migration.bytes_transferred += bytes_written;
+
+    return bytes_written;
 }
 
 int main(int argc, char *argv[])
@@ -476,7 +452,6 @@ int main(int argc, char *argv[])
     int opt;
     struct sigaction act = {.sa_handler = _sa_handler};
     const size_t bar1_size = 0x3000;
-    size_t migr_regs_size, migr_data_size, migr_size;
     struct server_data server_data = {
         .migration = {
             .state = VFU_MIGR_STATE_RUNNING
@@ -488,10 +463,7 @@ int main(int argc, char *argv[])
     const vfu_migration_callbacks_t migr_callbacks = {
         .version = VFU_MIGR_CALLBACKS_VERS,
         .transition = &migration_device_state_transition,
-        .get_pending_bytes = &migration_get_pending_bytes,
-        .prepare_data = &migration_prepare_data,
         .read_data = &migration_read_data,
-        .data_written = &migration_data_written,
         .write_data = &migration_write_data
     };
 
@@ -550,9 +522,6 @@ int main(int argc, char *argv[])
      * are mappable. The client can still mmap the 2nd page, we can't prohibit
      * this under Linux. If we really want to prohibit it we have to use
      * separate files for the same region.
-     *
-     * We choose to use a single file which contains both BAR1 and the migration
-     * registers. They could also be completely different files.
      */
     if ((tmpfd = mkstemp(template)) == -1) {
         err(EXIT_FAILURE, "failed to create backing file");
@@ -562,16 +531,7 @@ int main(int argc, char *argv[])
 
     server_data.bar1_size = bar1_size;
 
-    /*
-     * The migration registers aren't memory mappable, so in order to make the
-     * rest of the migration region memory mappable we must effectively reserve
-     * an entire page.
- */ - migr_regs_size = vfu_get_migr_register_area_size(); - migr_data_size = page_align(bar1_size + sizeof(time_t)); - migr_size = migr_regs_size + migr_data_size; - - if (ftruncate(tmpfd, server_data.bar1_size + migr_size) == -1) { + if (ftruncate(tmpfd, server_data.bar1_size) == -1) { err(EXIT_FAILURE, "failed to truncate backing file"); } server_data.bar1 = mmap(NULL, server_data.bar1_size, PROT_READ | PROT_WRITE, @@ -591,29 +551,8 @@ int main(int argc, char *argv[]) err(EXIT_FAILURE, "failed to setup BAR1 region"); } - /* setup migration */ - - struct iovec migr_mmap_areas[] = { - [0] = { - .iov_base = (void *)migr_regs_size, - .iov_len = migr_data_size - }, - }; - - /* - * The migration region comes after bar1 in the backing file, so offset is - * server_data.bar1_size. - */ - ret = vfu_setup_region(vfu_ctx, VFU_PCI_DEV_MIGR_REGION_IDX, migr_size, - NULL, VFU_REGION_FLAG_RW, migr_mmap_areas, - ARRAY_SIZE(migr_mmap_areas), tmpfd, - server_data.bar1_size); - if (ret < 0) { - err(EXIT_FAILURE, "failed to setup migration region"); - } - - ret = vfu_setup_device_migration_callbacks(vfu_ctx, &migr_callbacks, - migr_regs_size); + ret = vfu_setup_device_migration_callbacks(vfu_ctx, &migr_callbacks); + if (ret < 0) { err(EXIT_FAILURE, "failed to setup device migration"); } diff --git a/test/mocks.c b/test/mocks.c index 2ae14b4a..ce3060fd 100644 --- a/test/mocks.c +++ b/test/mocks.c @@ -199,23 +199,6 @@ should_exec_command(vfu_ctx_t *vfu_ctx, uint16_t cmd) return mock(); } -ssize_t -migration_region_access_registers(vfu_ctx_t *vfu_ctx, char *buf, size_t count, - loff_t pos, bool is_write) -{ - if (!is_patched("migration_region_access_registers")) { - return __real_migration_region_access_registers(vfu_ctx, buf, count, - pos, is_write); - } - check_expected(vfu_ctx); - check_expected(buf); - check_expected(count); - check_expected(pos); - check_expected(is_write); - errno = mock(); - return mock(); -} - ssize_t handle_device_state(vfu_ctx_t *vfu_ctx, struct migration *migr, uint32_t device_state, bool notify) { @@ -232,7 +215,8 @@ handle_device_state(vfu_ctx_t *vfu_ctx, struct migration *migr, } void -migr_state_transition(struct migration *migr, enum migr_iter_state state) +migr_state_transition(struct migration *migr, + enum vfio_user_device_mig_state state) { if (!is_patched("migr_state_transition")) { __real_migr_state_transition(migr, state); diff --git a/test/py/libvfio_user.py b/test/py/libvfio_user.py index a701d1b7..289f10a1 100644 --- a/test/py/libvfio_user.py +++ b/test/py/libvfio_user.py @@ -43,7 +43,6 @@ import struct import syslog import copy -import tempfile import sys from resource import getpagesize from math import log2 @@ -126,12 +125,6 @@ VFIO_DMA_UNMAP_FLAG_ALL = (1 << 1) -VFIO_DEVICE_STATE_V1_STOP = (0) -VFIO_DEVICE_STATE_V1_RUNNING = (1 << 0) -VFIO_DEVICE_STATE_V1_SAVING = (1 << 1) -VFIO_DEVICE_STATE_V1_RESUMING = (1 << 2) -VFIO_DEVICE_STATE_MASK = ((1 << 3) - 1) - # libvfio-user defines @@ -178,8 +171,11 @@ def is_32bit(): VFIO_USER_DMA_READ = 11 VFIO_USER_DMA_WRITE = 12 VFIO_USER_DEVICE_RESET = 13 -VFIO_USER_DIRTY_PAGES = 14 -VFIO_USER_MAX = 15 +VFIO_USER_REGION_WRITE_MULTI = 15 +VFIO_USER_DEVICE_FEATURE = 16 +VFIO_USER_MIG_DATA_READ = 17 +VFIO_USER_MIG_DATA_WRITE = 18 +VFIO_USER_MAX = 19 VFIO_USER_F_TYPE = 0xf VFIO_USER_F_TYPE_COMMAND = 0 @@ -198,8 +194,7 @@ def is_32bit(): VFU_PCI_DEV_ROM_REGION_IDX = 6 VFU_PCI_DEV_CFG_REGION_IDX = 7 VFU_PCI_DEV_VGA_REGION_IDX = 8 -VFU_PCI_DEV_MIGR_REGION_IDX = 9 -VFU_PCI_DEV_NUM_REGIONS = 10 +VFU_PCI_DEV_NUM_REGIONS = 9 
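+# Note: VFU_PCI_DEV_MIGR_REGION_IDX is gone entirely: in migration v2, device
+# state is driven through VFIO_USER_DEVICE_FEATURE and migration data flows
+# over the VFIO_USER_MIG_DATA_READ/WRITE commands defined above, rather than
+# through a dedicated region.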
VFU_REGION_FLAG_READ = 1 VFU_REGION_FLAG_WRITE = 2 @@ -212,14 +207,42 @@ def is_32bit(): VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP = (1 << 0) -VFIO_IOMMU_DIRTY_PAGES_FLAG_START = (1 << 0) -VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP = (1 << 1) -VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP = (1 << 2) +# enum vfio_user_device_mig_state +VFIO_USER_DEVICE_STATE_ERROR = 0 +VFIO_USER_DEVICE_STATE_STOP = 1 +VFIO_USER_DEVICE_STATE_RUNNING = 2 +VFIO_USER_DEVICE_STATE_STOP_COPY = 3 +VFIO_USER_DEVICE_STATE_RESUMING = 4 +VFIO_USER_DEVICE_STATE_RUNNING_P2P = 5 +VFIO_USER_DEVICE_STATE_PRE_COPY = 6 +VFIO_USER_DEVICE_STATE_PRE_COPY_P2P = 7 + +VFIO_DEVICE_FEATURE_MASK = 0xffff +VFIO_DEVICE_FEATURE_GET = (1 << 16) +VFIO_DEVICE_FEATURE_SET = (1 << 17) +VFIO_DEVICE_FEATURE_PROBE = (1 << 18) + +VFIO_DEVICE_FEATURE_MIGRATION = 1 +VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE = 2 +VFIO_DEVICE_FEATURE_DMA_LOGGING_START = 6 +VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP = 7 +VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT = 8 + +VFIO_MIGRATION_STOP_COPY = (1 << 0) +VFIO_MIGRATION_P2P = (1 << 1) +VFIO_MIGRATION_PRE_COPY = (1 << 2) VFIO_USER_IO_FD_TYPE_IOEVENTFD = 0 VFIO_USER_IO_FD_TYPE_IOREGIONFD = 1 VFIO_USER_IO_FD_TYPE_IOEVENTFD_SHADOW = 2 +# enum vfu_migr_state_t +VFU_MIGR_STATE_STOP = 0 +VFU_MIGR_STATE_RUNNING = 1 +VFU_MIGR_STATE_STOP_AND_COPY = 2 +VFU_MIGR_STATE_PRE_COPY = 3 +VFU_MIGR_STATE_RESUME = 4 + # enum vfu_dev_irq_type VFU_DEV_INTX_IRQ = 0 @@ -244,7 +267,7 @@ def is_32bit(): VFU_CAP_FLAG_CALLBACK = (1 << 1) VFU_CAP_FLAG_READONLY = (1 << 2) -VFU_MIGR_CALLBACKS_VERS = 1 +VFU_MIGR_CALLBACKS_VERS = 2 SOCK_PATH = b"/tmp/vfio-user.sock.%d" % os.getpid() @@ -528,14 +551,6 @@ def __copy__(self): return result -class vfio_user_dirty_pages(Structure): - _pack_ = 1 - _fields_ = [ - ("argsz", c.c_uint32), - ("flags", c.c_uint32) - ] - - class vfio_user_bitmap(Structure): _pack_ = 1 _fields_ = [ @@ -554,24 +569,73 @@ class vfio_user_bitmap_range(Structure): transition_cb_t = c.CFUNCTYPE(c.c_int, c.c_void_p, c.c_int, use_errno=True) -get_pending_bytes_cb_t = c.CFUNCTYPE(c.c_uint64, c.c_void_p) -prepare_data_cb_t = c.CFUNCTYPE(c.c_void_p, c.POINTER(c.c_uint64), - c.POINTER(c.c_uint64)) -read_data_cb_t = c.CFUNCTYPE(c.c_ssize_t, c.c_void_p, c.c_void_p, - c.c_uint64, c.c_uint64) -write_data_cb_t = c.CFUNCTYPE(c.c_ssize_t, c.c_void_p, c.c_uint64) -data_written_cb_t = c.CFUNCTYPE(c.c_int, c.c_void_p, c.c_uint64) +read_data_cb_t = c.CFUNCTYPE(c.c_ssize_t, c.c_void_p, c.c_void_p, c.c_uint64) +write_data_cb_t = c.CFUNCTYPE(c.c_ssize_t, c.c_void_p, c.c_void_p, c.c_uint64) class vfu_migration_callbacks_t(Structure): _fields_ = [ ("version", c.c_int), ("transition", transition_cb_t), - ("get_pending_bytes", get_pending_bytes_cb_t), - ("prepare_data", prepare_data_cb_t), ("read_data", read_data_cb_t), ("write_data", write_data_cb_t), - ("data_written", data_written_cb_t), + ] + + +class vfio_user_device_feature(Structure): + _pack_ = 1 + _fields_ = [ + ("argsz", c.c_uint32), + ("flags", c.c_uint32) + ] + + +class vfio_user_device_feature_migration(Structure): + _pack_ = 1 + _fields_ = [ + ("flags", c.c_uint64) + ] + + +class vfio_user_device_feature_mig_state(Structure): + _pack_ = 1 + _fields_ = [ + ("device_state", c.c_uint32), + ("data_fd", c.c_uint32), + ] + + +class vfio_user_device_feature_dma_logging_control(Structure): + _pack_ = 1 + _fields_ = [ + ("page_size", c.c_uint64), + ("num_ranges", c.c_uint32), + ("reserved", c.c_uint32), + ] + + +class vfio_user_device_feature_dma_logging_range(Structure): + _pack_ = 1 + _fields_ = [ + ("iova", c.c_uint64), + ("length", 
c.c_uint64),
+    ]
+
+
+class vfio_user_device_feature_dma_logging_report(Structure):
+    _pack_ = 1
+    _fields_ = [
+        ("iova", c.c_uint64),
+        ("length", c.c_uint64),
+        ("page_size", c.c_uint64)
+    ]
+
+
+class vfio_user_mig_data(Structure):
+    _pack_ = 1
+    _fields_ = [
+        ("argsz", c.c_uint32),
+        ("size", c.c_uint32)
+    ]
 
 
@@ -590,17 +654,6 @@ def __str__(self):
             hex(self.offset), self.writeable)
 
 
-class vfio_user_migration_info(Structure):
-    _pack_ = 1
-    _fields_ = [
-        ("device_state", c.c_uint32),
-        ("reserved", c.c_uint32),
-        ("pending_bytes", c.c_uint64),
-        ("data_offset", c.c_uint64),
-        ("data_size", c.c_uint64),
-    ]
-
-
 #
 # Util functions
 #
@@ -644,7 +697,7 @@ class vfio_user_migration_info(Structure):
 lib.vfu_setup_device_dma.argtypes = (c.c_void_p, vfu_dma_register_cb_t,
                                      vfu_dma_unregister_cb_t)
 lib.vfu_setup_device_migration_callbacks.argtypes = (c.c_void_p,
-    c.POINTER(vfu_migration_callbacks_t), c.c_uint64)
+    c.POINTER(vfu_migration_callbacks_t))
 lib.dma_sg_size.restype = (c.c_size_t)
 lib.vfu_addr_to_sgl.argtypes = (c.c_void_p, c.c_void_p, c.c_size_t,
                                 c.POINTER(dma_sg_t), c.c_size_t, c.c_int)
@@ -1019,18 +1072,6 @@ def prepare_ctx_for_dma(dma_register=__dma_register,
     ret = vfu_setup_device_reset_cb(ctx, reset)
     assert ret == 0
 
-    f = tempfile.TemporaryFile()
-    migr_region_size = 2 << PAGE_SHIFT
-    f.truncate(migr_region_size)
-
-    mmap_areas = [(PAGE_SIZE, PAGE_SIZE)]
-
-    ret = vfu_setup_region(ctx, index=VFU_PCI_DEV_MIGR_REGION_IDX,
-                           size=migr_region_size,
-                           flags=VFU_REGION_FLAG_RW, mmap_areas=mmap_areas,
-                           fd=f.fileno())
-    assert ret == 0
-
     if migration_callbacks:
         ret = vfu_setup_device_migration_callbacks(ctx)
         assert ret == 0
@@ -1040,6 +1081,18 @@ def prepare_ctx_for_dma(dma_register=__dma_register,
 
     return ctx
 
+
+def transition_to_state(ctx, sock, state, expect=0, rsp=True, busy=False):
+    feature = vfio_user_device_feature(
+        argsz=len(vfio_user_device_feature()) +
+        len(vfio_user_device_feature_mig_state()),
+        flags=VFIO_DEVICE_FEATURE_SET | VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE
+    )
+    payload = vfio_user_device_feature_mig_state(device_state=state)
+    msg(ctx, sock, VFIO_USER_DEVICE_FEATURE, bytes(feature) + bytes(payload),
+        expect=expect, rsp=rsp, busy=busy)
+
+
 #
 # Library wrappers
 #
@@ -1235,24 +1288,6 @@ def __migr_trans_cb(ctx, state):
     return migr_trans_cb(ctx, state)
 
 
-def migr_get_pending_bytes_cb(ctx):
-    pass
-
-
-@get_pending_bytes_cb_t
-def __migr_get_pending_bytes_cb(ctx):
-    return migr_get_pending_bytes_cb(ctx)
-
-
-def migr_prepare_data_cb(ctx, offset, size):
-    pass
-
-
-@prepare_data_cb_t
-def __migr_prepare_data_cb(ctx, offset, size):
-    return migr_prepare_data_cb(ctx, offset, size)
-
-
-def migr_read_data_cb(ctx, buf, count, offset):
+def migr_read_data_cb(ctx, buf, count):
     pass
 
 
@@ -1271,29 +1306,17 @@ def __migr_write_data_cb(ctx, buf, count, offset):
-    return migr_write_data_cb(ctx, buf, count, offset)
+    return migr_write_data_cb(ctx, buf, count)
 
 
-def migr_data_written_cb(ctx, count):
-    pass
-
-
-@data_written_cb_t
-def __migr_data_written_cb(ctx, count):
-    return migr_data_written_cb(ctx, count)
-
-
-def vfu_setup_device_migration_callbacks(ctx, cbs=None, offset=PAGE_SIZE):
+def vfu_setup_device_migration_callbacks(ctx, cbs=None):
     assert ctx is not None
 
     if not cbs:
         cbs = vfu_migration_callbacks_t()
         cbs.version = VFU_MIGR_CALLBACKS_VERS
         cbs.transition = __migr_trans_cb
-        cbs.get_pending_bytes = __migr_get_pending_bytes_cb
-        cbs.prepare_data = __migr_prepare_data_cb
         cbs.read_data = __migr_read_data_cb
         cbs.write_data = __migr_write_data_cb
-        cbs.data_written = __migr_data_written_cb
 
-    return lib.vfu_setup_device_migration_callbacks(ctx, cbs, offset)
+    return 
lib.vfu_setup_device_migration_callbacks(ctx, cbs) def dma_sg_size(): @@ -1355,4 +1378,30 @@ def fds_are_same(fd1: int, fd2: int) -> bool: return s1.st_dev == s2.st_dev and s1.st_ino == s2.st_ino +def get_bitmap_size(size: int, pgsize: int) -> int: + """ + Returns the size, in bytes, of the bitmap that represents the given range + with the given page size. + """ + + nr_pages = (size // pgsize) + (1 if size % pgsize != 0 else 0) + return ((nr_pages + 63) & ~63) // 8 + + +get_errno_loc = libc.__errno_location +get_errno_loc.restype = c.POINTER(c.c_int) + + +def set_real_errno(errno: int): + """ + ctypes's errno is an internal value that only updates the real value when + the foreign function call returns. In callbacks, however, this doesn't + happen, so `c.set_errno` doesn't propagate in time. In this case we need to + manually set the real errno. + """ + + c.set_errno(errno) # set internal errno so `c.get_errno` gives right value + get_errno_loc()[0] = errno # set real errno + + # ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: # diff --git a/test/py/test_device_get_region_info.py b/test/py/test_device_get_region_info.py index f847cb47..3b7c32da 100644 --- a/test/py/test_device_get_region_info.py +++ b/test/py/test_device_get_region_info.py @@ -78,14 +78,6 @@ def test_device_get_region_info_setup(): mmap_areas=mmap_areas, fd=f.fileno(), offset=0x0) assert ret == 0 - f = tempfile.TemporaryFile() - f.truncate(migr_region_size) - - ret = vfu_setup_region(ctx, index=VFU_PCI_DEV_MIGR_REGION_IDX, - size=migr_region_size, flags=VFU_REGION_FLAG_RW, - mmap_areas=migr_mmap_areas, fd=f.fileno()) - assert ret == 0 - ret = vfu_realize_ctx(ctx) assert ret == 0 @@ -207,44 +199,6 @@ def test_device_get_region_info_caps(): client.disconnect(ctx) -def test_device_get_region_info_migr(): - global client - - client = connect_client(ctx) - - payload = vfio_region_info(argsz=80, flags=0, - index=VFU_PCI_DEV_MIGR_REGION_IDX, cap_offset=0, - size=0, offset=0) - payload = bytes(payload) + b'\0' * (80 - 32) - - result = msg(ctx, client.sock, VFIO_USER_DEVICE_GET_REGION_INFO, payload) - - info, result = vfio_region_info.pop_from_buffer(result) - mcap, result = vfio_region_info_cap_type.pop_from_buffer(result) - cap, result = vfio_region_info_cap_sparse_mmap.pop_from_buffer(result) - area, result = vfio_region_sparse_mmap_area.pop_from_buffer(result) - - assert info.argsz == 80 - assert info.cap_offset == 32 - - assert mcap.id == VFIO_REGION_INFO_CAP_TYPE - assert mcap.version == 1 - assert mcap.next == 48 - assert mcap.type == VFIO_REGION_TYPE_MIGRATION - assert mcap.subtype == VFIO_REGION_SUBTYPE_MIGRATION - - assert cap.id == VFIO_REGION_INFO_CAP_SPARSE_MMAP - assert cap.version == 1 - assert cap.next == 0 - assert cap.nr_areas == len(migr_mmap_areas) == 1 - - assert area.offset == migr_mmap_areas[0][0] - assert area.size == migr_mmap_areas[0][1] - - # skip reading the SCM_RIGHTS - client.disconnect(ctx) - - def test_device_get_region_info_cleanup(): vfu_destroy_ctx(ctx) diff --git a/test/py/test_device_get_region_info_zero_size.py b/test/py/test_device_get_region_info_zero_size.py index 146e812f..a5691912 100644 --- a/test/py/test_device_get_region_info_zero_size.py +++ b/test/py/test_device_get_region_info_zero_size.py @@ -52,27 +52,26 @@ def test_device_get_region_info_zero_sized_region(): global client - for index in [VFU_PCI_DEV_BAR1_REGION_IDX, VFU_PCI_DEV_MIGR_REGION_IDX]: - payload = vfio_region_info(argsz=argsz, flags=0, - index=index, cap_offset=0, - size=0, offset=0) - - hdr = 
vfio_user_header(VFIO_USER_DEVICE_GET_REGION_INFO, - size=len(payload)) - client.sock.send(hdr + payload) - vfu_run_ctx(ctx) - result = get_reply(client.sock) - - assert len(result) == argsz - - info, _ = vfio_region_info.pop_from_buffer(result) - - assert info.argsz == argsz - assert info.flags == 0 - assert info.index == index - assert info.cap_offset == 0 - assert info.size == 0 - assert info.offset == 0 + payload = vfio_region_info(argsz=argsz, flags=0, + index=VFU_PCI_DEV_BAR1_REGION_IDX, cap_offset=0, + size=0, offset=0) + + hdr = vfio_user_header(VFIO_USER_DEVICE_GET_REGION_INFO, + size=len(payload)) + client.sock.send(hdr + payload) + vfu_run_ctx(ctx) + result = get_reply(client.sock) + + assert len(result) == argsz + + info, _ = vfio_region_info.pop_from_buffer(result) + + assert info.argsz == argsz + assert info.flags == 0 + assert info.index == VFU_PCI_DEV_BAR1_REGION_IDX + assert info.cap_offset == 0 + assert info.size == 0 + assert info.offset == 0 vfu_destroy_ctx(ctx) diff --git a/test/py/test_dirty_pages.py b/test/py/test_dirty_pages.py index f3e42198..5ff0f843 100644 --- a/test/py/test_dirty_pages.py +++ b/test/py/test_dirty_pages.py @@ -34,6 +34,7 @@ import tempfile ctx = None +client = None quiesce_errno = 0 @@ -69,16 +70,6 @@ def test_dirty_pages_setup(): ret = vfu_setup_device_dma(ctx, dma_register, dma_unregister) assert ret == 0 - f = tempfile.TemporaryFile() - f.truncate(2 << PAGE_SHIFT) - - mmap_areas = [(PAGE_SIZE, PAGE_SIZE)] - - ret = vfu_setup_region(ctx, index=VFU_PCI_DEV_MIGR_REGION_IDX, - size=2 << PAGE_SHIFT, flags=VFU_REGION_FLAG_RW, - mmap_areas=mmap_areas, fd=f.fileno()) - assert ret == 0 - ret = vfu_realize_ctx(ctx) assert ret == 0 @@ -100,59 +91,51 @@ def test_dirty_pages_setup(): msg(ctx, client.sock, VFIO_USER_DMA_MAP, payload) -def test_dirty_pages_short_write(): - payload = struct.pack("I", 8) - - msg(ctx, client.sock, VFIO_USER_DIRTY_PAGES, payload, - expect=errno.EINVAL) - - -def test_dirty_pages_bad_argsz(): - payload = vfio_user_dirty_pages(argsz=4, - flags=VFIO_IOMMU_DIRTY_PAGES_FLAG_START) - - msg(ctx, client.sock, VFIO_USER_DIRTY_PAGES, payload, - expect=errno.EINVAL) - - -def test_dirty_pages_start_no_migration(): - payload = vfio_user_dirty_pages(argsz=len(vfio_user_dirty_pages()), - flags=VFIO_IOMMU_DIRTY_PAGES_FLAG_START) - - msg(ctx, client.sock, VFIO_USER_DIRTY_PAGES, payload, - expect=errno.ENOTSUP) +def test_setup_migration(): + ret = vfu_setup_device_migration_callbacks(ctx) + assert ret == 0 -def test_setup_migr_region(): - ret = vfu_setup_device_migration_callbacks(ctx, offset=PAGE_SIZE) - assert ret == 0 +def start_logging(addr=None, length=None, page_size=PAGE_SIZE, expect=0): + """ + Start logging dirty writes. + If a region and page size are specified, they will be sent to the server to + start logging. Otherwise, all regions will be logged and the default page + size will be used. -def test_dirty_pages_start_bad_flags(): - # - # This is a little cheeky, after vfu_realize_ctx(), but it works at the - # moment. - # - payload = vfio_user_dirty_pages(argsz=len(vfio_user_dirty_pages()), - flags=(VFIO_IOMMU_DIRTY_PAGES_FLAG_START | - VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP)) + Note: in the current implementation, all regions are logged whether or not + you specify a region, as the additional constraint of only logging a + certain region is considered an optimisation and is not yet implemented. 
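+
+    On the wire this is a single VFIO_USER_DEVICE_FEATURE message: a
+    vfio_user_device_feature header with DMA_LOGGING_START | SET, followed by
+    a dma_logging_control payload and zero or more dma_logging_range entries,
+    e.g. start_logging(addr=0x10 << PAGE_SHIFT, length=0x10 << PAGE_SHIFT).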
+ """ - msg(ctx, client.sock, VFIO_USER_DIRTY_PAGES, payload, - expect=errno.EINVAL) + if addr is not None: + ranges = vfio_user_device_feature_dma_logging_range( + iova=addr, + length=length + ) + num_ranges = 1 + else: + ranges = bytearray() + num_ranges = 0 - payload = vfio_user_dirty_pages(argsz=len(vfio_user_dirty_pages()), - flags=(VFIO_IOMMU_DIRTY_PAGES_FLAG_START | - VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP)) + feature = vfio_user_device_feature( + argsz=len(vfio_user_device_feature()) + + len(vfio_user_device_feature_dma_logging_control()) + + len(ranges), + flags=VFIO_DEVICE_FEATURE_DMA_LOGGING_START | VFIO_DEVICE_FEATURE_SET) - msg(ctx, client.sock, VFIO_USER_DIRTY_PAGES, payload, - expect=errno.EINVAL) + payload = vfio_user_device_feature_dma_logging_control( + page_size=page_size, + num_ranges=num_ranges, + reserved=0) + msg(ctx, client.sock, VFIO_USER_DEVICE_FEATURE, + bytes(feature) + bytes(payload) + bytes(ranges), expect=expect) -def start_logging(): - payload = vfio_user_dirty_pages(argsz=len(vfio_user_dirty_pages()), - flags=VFIO_IOMMU_DIRTY_PAGES_FLAG_START) - msg(ctx, client.sock, VFIO_USER_DIRTY_PAGES, payload) +def test_dirty_pages_start_zero_pgsize(): + start_logging(page_size=0, expect=errno.EINVAL) def test_dirty_pages_start(): @@ -161,157 +144,65 @@ def test_dirty_pages_start(): start_logging() -def test_dirty_pages_get_short_read(): - payload = vfio_user_dirty_pages(argsz=len(vfio_user_dirty_pages()), - flags=VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP) - - msg(ctx, client.sock, VFIO_USER_DIRTY_PAGES, payload, - expect=errno.EINVAL) - - -# -# This should in fact work; update when it does. -# -def test_dirty_pages_get_sub_range(): - argsz = len(vfio_user_dirty_pages()) + len(vfio_user_bitmap_range()) + 8 - dirty_pages = vfio_user_dirty_pages(argsz=argsz, - flags=VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP) - bitmap = vfio_user_bitmap(pgsize=PAGE_SIZE, size=8) - br = vfio_user_bitmap_range(iova=0x11 << PAGE_SHIFT, size=PAGE_SIZE, - bitmap=bitmap) - - payload = bytes(dirty_pages) + bytes(br) - - msg(ctx, client.sock, VFIO_USER_DIRTY_PAGES, payload, - expect=errno.ENOTSUP) +def test_dirty_pages_start_different_pgsize(): + """ + Once we've started logging with page size PAGE_SIZE, any request to start + logging at a different page size should be rejected. + """ + start_logging(page_size=PAGE_SIZE >> 1, expect=errno.EINVAL) + start_logging(page_size=PAGE_SIZE << 1, expect=errno.EINVAL) -def test_dirty_pages_get_bad_page_size(): - argsz = len(vfio_user_dirty_pages()) + len(vfio_user_bitmap_range()) + 8 - dirty_pages = vfio_user_dirty_pages(argsz=argsz, - flags=VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP) - bitmap = vfio_user_bitmap(pgsize=2 << PAGE_SHIFT, size=8) - br = vfio_user_bitmap_range(iova=0x10 << PAGE_SHIFT, - size=0x10 << PAGE_SHIFT, bitmap=bitmap) - payload = bytes(dirty_pages) + bytes(br) +def get_dirty_page_bitmap(addr=0x10 << PAGE_SHIFT, length=0x10 << PAGE_SHIFT, + page_size=PAGE_SIZE, expect=0): + """ + Get the dirty page bitmap from the server for the given region and page + size as a 64-bit integer. This function only works for bitmaps that fit + within a 64-bit integer because that's what it returns. 
+ """ - msg(ctx, client.sock, VFIO_USER_DIRTY_PAGES, payload, - expect=errno.EINVAL) + bitmap_size = get_bitmap_size(length, page_size) + assert bitmap_size == 8 -def test_dirty_pages_get_bad_bitmap_size(): - argsz = len(vfio_user_dirty_pages()) + len(vfio_user_bitmap_range()) + 8 - dirty_pages = vfio_user_dirty_pages(argsz=argsz, - flags=VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP) - bitmap = vfio_user_bitmap(pgsize=PAGE_SIZE, size=1) - br = vfio_user_bitmap_range(iova=0x10 << PAGE_SHIFT, - size=0x10 << PAGE_SHIFT, bitmap=bitmap) + argsz = len(vfio_user_device_feature()) + \ + len(vfio_user_device_feature_dma_logging_report()) + \ + bitmap_size - payload = bytes(dirty_pages) + bytes(br) + feature = vfio_user_device_feature( + argsz=argsz, + flags=VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT | VFIO_DEVICE_FEATURE_GET + ) - msg(ctx, client.sock, VFIO_USER_DIRTY_PAGES, payload, - expect=errno.EINVAL) + report = vfio_user_device_feature_dma_logging_report( + iova=addr, + length=length, + page_size=page_size + ) + payload = bytes(feature) + bytes(report) -def test_dirty_pages_get_bad_argsz(): - dirty_pages = vfio_user_dirty_pages(argsz=SERVER_MAX_DATA_XFER_SIZE + 8, - flags=VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP) - bitmap = vfio_user_bitmap(pgsize=PAGE_SIZE, - size=SERVER_MAX_DATA_XFER_SIZE + 8) - br = vfio_user_bitmap_range(iova=0x10 << PAGE_SHIFT, - size=0x10 << PAGE_SHIFT, bitmap=bitmap) + result = msg(ctx, client.sock, VFIO_USER_DEVICE_FEATURE, payload, + expect=expect) - payload = bytes(dirty_pages) + bytes(br) - - msg(ctx, client.sock, VFIO_USER_DIRTY_PAGES, payload, - expect=errno.EINVAL) - - -def test_dirty_pages_get_short_reply(): - dirty_pages = vfio_user_dirty_pages(argsz=len(vfio_user_dirty_pages()), - flags=VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP) - bitmap = vfio_user_bitmap(pgsize=PAGE_SIZE, size=8) - br = vfio_user_bitmap_range(iova=0x10 << PAGE_SHIFT, - size=0x10 << PAGE_SHIFT, bitmap=bitmap) - - payload = bytes(dirty_pages) + bytes(br) - - result = msg(ctx, client.sock, VFIO_USER_DIRTY_PAGES, payload) - - assert len(result) == len(vfio_user_dirty_pages()) - - dirty_pages, _ = vfio_user_dirty_pages.pop_from_buffer(result) - - argsz = len(vfio_user_dirty_pages()) + len(vfio_user_bitmap_range()) + 8 - - assert dirty_pages.argsz == argsz - assert dirty_pages.flags == VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP - - -def test_get_dirty_page_bitmap_unmapped(): - argsz = len(vfio_user_dirty_pages()) + len(vfio_user_bitmap_range()) + 8 - - dirty_pages = vfio_user_dirty_pages(argsz=argsz, - flags=VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP) - bitmap = vfio_user_bitmap(pgsize=PAGE_SIZE, size=8) - br = vfio_user_bitmap_range(iova=0x40 << PAGE_SHIFT, - size=0x10 << PAGE_SHIFT, bitmap=bitmap) - - payload = bytes(dirty_pages) + bytes(br) - - msg(ctx, client.sock, VFIO_USER_DIRTY_PAGES, payload, - expect=errno.EINVAL) - - -def test_dirty_pages_get_unmodified(): - argsz = len(vfio_user_dirty_pages()) + len(vfio_user_bitmap_range()) + 8 - - dirty_pages = vfio_user_dirty_pages(argsz=argsz, - flags=VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP) - bitmap = vfio_user_bitmap(pgsize=PAGE_SIZE, size=8) - br = vfio_user_bitmap_range(iova=0x10 << PAGE_SHIFT, - size=0x10 << PAGE_SHIFT, bitmap=bitmap) - - payload = bytes(dirty_pages) + bytes(br) - - result = msg(ctx, client.sock, VFIO_USER_DIRTY_PAGES, payload) + if expect != 0: + return assert len(result) == argsz - dirty_pages, result = vfio_user_dirty_pages.pop_from_buffer(result) - - assert dirty_pages.argsz == argsz - assert dirty_pages.flags == 
VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP - - br, result = vfio_user_bitmap_range.pop_from_buffer(result) - - assert br.iova == 0x10 << PAGE_SHIFT - assert br.size == 0x10 << PAGE_SHIFT + _, result = vfio_user_device_feature.pop_from_buffer(result) + _, result = \ + vfio_user_device_feature_dma_logging_report.pop_from_buffer(result) - assert br.bitmap.pgsize == PAGE_SIZE - assert br.bitmap.size == 8 + assert len(result) == bitmap_size + return struct.unpack("Q", result)[0] -def get_dirty_page_bitmap(): - argsz = len(vfio_user_dirty_pages()) + len(vfio_user_bitmap_range()) + 8 - - dirty_pages = vfio_user_dirty_pages(argsz=argsz, - flags=VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP) - bitmap = vfio_user_bitmap(pgsize=PAGE_SIZE, size=8) - br = vfio_user_bitmap_range(iova=0x10 << PAGE_SHIFT, - size=0x10 << PAGE_SHIFT, bitmap=bitmap) - - payload = bytes(dirty_pages) + bytes(br) - - result = msg(ctx, client.sock, VFIO_USER_DIRTY_PAGES, payload) - - _, result = vfio_user_dirty_pages.pop_from_buffer(result) - _, result = vfio_user_bitmap_range.pop_from_buffer(result) - - assert len(result) == 8 - return struct.unpack("Q", result)[0] +def test_dirty_pages_get_unmodified(): + bitmap = get_dirty_page_bitmap() + assert bitmap == 0 sg3 = None @@ -374,6 +265,27 @@ def test_dirty_pages_get_modified(): bitmap = get_dirty_page_bitmap() assert bitmap == 0b0000001111000001 + # check dirty bitmap is correctly extended when we give a smaller page size + vfu_sgl_put(ctx, sg1, iovec1) + vfu_sgl_put(ctx, sg4, iovec4) + bitmap = get_dirty_page_bitmap(page_size=PAGE_SIZE >> 1) + assert bitmap == 0b00000000000011111111000000000011 + + # check dirty bitmap is correctly shortened when we give a larger page size + vfu_sgl_put(ctx, sg1, iovec1) + vfu_sgl_put(ctx, sg4, iovec4) + bitmap = get_dirty_page_bitmap(page_size=PAGE_SIZE << 1) + assert bitmap == 0b00011001 + + # check dirty bitmap is correctly shortened when we give a page size that + # is so large that one bit corresponds to multiple bytes in the raw bitmap + vfu_sgl_put(ctx, sg1, iovec1) + vfu_sgl_put(ctx, sg4, iovec4) + bitmap = get_dirty_page_bitmap(page_size=PAGE_SIZE << 4) + assert bitmap == 0b1 + bitmap = get_dirty_page_bitmap(page_size=PAGE_SIZE << 4) + assert bitmap == 0b0 + # after another two puts, should just be one dirty page vfu_sgl_put(ctx, sg2, iovec2) vfu_sgl_put(ctx, sg3, iovec3) @@ -427,72 +339,76 @@ def test_dirty_pages_get_modified(): assert bitmap == 0b010000000000000000001100 -def test_dirty_pages_stop(): - # FIXME we have a memory leak as we don't free dirty bitmaps when - # destroying the context. 
- payload = vfio_user_dirty_pages(argsz=len(vfio_user_dirty_pages()), - flags=VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP) - - msg(ctx, client.sock, VFIO_USER_DIRTY_PAGES, payload) +def test_dirty_pages_invalid_arguments(): + # Failed to translate + get_dirty_page_bitmap(addr=0xdeadbeef, expect=errno.ENOENT) + # Does not exactly match a region (libvfio-user limitation) + get_dirty_page_bitmap(addr=(0x10 << PAGE_SHIFT) + 1, + length=(0x20 << PAGE_SHIFT) - 1, + expect=errno.ENOTSUP) -def test_dirty_pages_start_with_quiesce(): - global quiesce_errno + # Invalid requested bitmap size + get_dirty_page_bitmap(page_size=1 << 24, expect=errno.EINVAL) - quiesce_errno = errno.EBUSY + # Region not mapped + get_dirty_page_bitmap(addr=0x40 << PAGE_SHIFT, expect=errno.EINVAL) - payload = vfio_user_dirty_pages(argsz=len(vfio_user_dirty_pages()), - flags=VFIO_IOMMU_DIRTY_PAGES_FLAG_START) - msg(ctx, client.sock, VFIO_USER_DIRTY_PAGES, payload, rsp=False, busy=True) +def stop_logging(addr=None, length=None): + if addr is not None: + ranges = vfio_user_device_feature_dma_logging_range( + iova=addr, + length=length + ) + else: + ranges = [] - ret = vfu_device_quiesced(ctx, 0) - assert ret == 0 + feature = vfio_user_device_feature( + argsz=len(vfio_user_device_feature()) + + len(vfio_user_device_feature_dma_logging_control()) + + len(ranges), + flags=VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP | VFIO_DEVICE_FEATURE_SET) - # now should be able to get the reply - get_reply(client.sock, expect=0) + payload = vfio_user_device_feature_dma_logging_control( + page_size=PAGE_SIZE, + num_ranges=(1 if addr is not None else 0), + reserved=0) - quiesce_errno = 0 + msg(ctx, client.sock, VFIO_USER_DEVICE_FEATURE, + bytes(feature) + bytes(payload) + bytes(ranges)) -def test_dirty_pages_bitmap_with_quiesce(): - global quiesce_errno - - quiesce_errno = errno.EBUSY +def test_dirty_pages_stop(): + stop_logging() - ret, sg1 = vfu_addr_to_sgl(ctx, dma_addr=0x10 << PAGE_SHIFT, - length=PAGE_SIZE) - assert ret == 1 - iovec1 = iovec_t() - ret = vfu_sgl_get(ctx, sg1, iovec1) - assert ret == 0 - vfu_sgl_put(ctx, sg1, iovec1) - bitmap = get_dirty_page_bitmap() - assert bitmap == 0b0000000000000001 +def test_dirty_pages_cleanup(): + client.disconnect(ctx) + vfu_destroy_ctx(ctx) -def test_dirty_pages_stop_with_quiesce(): - global quiesce_errno +def test_dirty_pages_uninitialised_dma(): + global ctx, client - quiesce_errno = errno.EBUSY + ctx = vfu_create_ctx(flags=LIBVFIO_USER_FLAG_ATTACH_NB) + assert ctx is not None - payload = vfio_user_dirty_pages(argsz=len(vfio_user_dirty_pages()), - flags=VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP) + ret = vfu_pci_init(ctx) + assert ret == 0 - msg(ctx, client.sock, VFIO_USER_DIRTY_PAGES, payload, rsp=False, busy=True) + vfu_setup_device_quiesce_cb(ctx, quiesce_cb=quiesce_cb) - ret = vfu_device_quiesced(ctx, 0) + ret = vfu_realize_ctx(ctx) assert ret == 0 - # now should be able to get the reply - get_reply(client.sock, expect=0) - - quiesce_errno = 0 + client = connect_client(ctx) + start_logging(expect=errno.EINVAL) + get_dirty_page_bitmap(expect=errno.EINVAL) -def test_dirty_pages_cleanup(): client.disconnect(ctx) + vfu_destroy_ctx(ctx) # ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: diff --git a/test/py/test_dma_unmap.py b/test/py/test_dma_unmap.py index a1fa94b3..b21e072c 100644 --- a/test/py/test_dma_unmap.py +++ b/test/py/test_dma_unmap.py @@ -113,26 +113,6 @@ def test_dma_unmap_dirty_not_tracking(): expect=errno.EINVAL) -def test_dma_unmap_dirty_not_mapped(): - - setup_dma_regions([(PAGE_SIZE, PAGE_SIZE)]) - 
vfu_setup_device_migration_callbacks(ctx, offset=PAGE_SIZE) - payload = vfio_user_dirty_pages(argsz=len(vfio_user_dirty_pages()), - flags=VFIO_IOMMU_DIRTY_PAGES_FLAG_START) - - msg(ctx, client.sock, VFIO_USER_DIRTY_PAGES, payload) - - argsz = len(vfio_user_dma_unmap()) + len(vfio_user_bitmap()) + 8 - unmap = vfio_user_dma_unmap(argsz=argsz, - flags=VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP, addr=PAGE_SIZE, - size=PAGE_SIZE) - bitmap = vfio_user_bitmap(pgsize=PAGE_SIZE, size=8) - payload = bytes(unmap) + bytes(bitmap) + bytes(8) - - msg(ctx, client.sock, VFIO_USER_DMA_UNMAP, payload, - expect=errno.EINVAL) - - def test_dma_unmap_invalid_flags(): setup_dma_regions() diff --git a/test/py/test_migration.py b/test/py/test_migration.py index a6327d82..d423119e 100644 --- a/test/py/test_migration.py +++ b/test/py/test_migration.py @@ -1,7 +1,8 @@ # -# Copyright (c) 2021 Nutanix Inc. All rights reserved. +# Copyright (c) 2023 Nutanix Inc. All rights reserved. # # Authors: Thanos Makatos +# William Henderson # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: @@ -28,25 +29,143 @@ # from libvfio_user import * -import ctypes as c +from collections import deque +import ctypes import errno -from unittest.mock import patch ctx = None client = None +current_state = None # the current migration state on the server +path = [] # the server transition path (each transition appends the new state) -def setup_function(function): +read_data = None +write_data = None +callbacks_errno = 0 + + +STATES = { + VFIO_USER_DEVICE_STATE_STOP, + VFIO_USER_DEVICE_STATE_RUNNING, + VFIO_USER_DEVICE_STATE_STOP_COPY, + VFIO_USER_DEVICE_STATE_RESUMING, + VFIO_USER_DEVICE_STATE_PRE_COPY +} + + +UNREACHABLE_STATES = { + VFIO_USER_DEVICE_STATE_ERROR, + VFIO_USER_DEVICE_STATE_PRE_COPY_P2P, + VFIO_USER_DEVICE_STATE_RUNNING_P2P +} + + +VFU_TO_VFIO_MIGR_STATE = { + VFU_MIGR_STATE_STOP: VFIO_USER_DEVICE_STATE_STOP, + VFU_MIGR_STATE_RUNNING: VFIO_USER_DEVICE_STATE_RUNNING, + VFU_MIGR_STATE_STOP_AND_COPY: VFIO_USER_DEVICE_STATE_STOP_COPY, + VFU_MIGR_STATE_RESUME: VFIO_USER_DEVICE_STATE_RESUMING, + VFU_MIGR_STATE_PRE_COPY: VFIO_USER_DEVICE_STATE_PRE_COPY +} + + +# Set a very small maximum transfer size for later tests. 
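+# Any payload larger than this has to be split across several
+# VFIO_USER_MIG_DATA_READ/WRITE messages, which the chunked-transfer tests
+# below rely on.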
+MAX_DATA_XFER_SIZE = 4
+
+
+@transition_cb_t
+def migr_trans_cb(_ctx, state):
+    global current_state, path
+
+    if callbacks_errno != 0:
+        set_real_errno(callbacks_errno)
+        return -1
+
+    if state in VFU_TO_VFIO_MIGR_STATE:
+        state = VFU_TO_VFIO_MIGR_STATE[state]
+    else:
+        assert False
+
+    current_state = state
+
+    path.append(state)
+
+    return 0
+
+
+@read_data_cb_t
+def migr_read_data_cb(_ctx, buf, count):
+    global read_data
+
+    if callbacks_errno != 0:
+        set_real_errno(callbacks_errno)
+        return -1
+
+    length = min(count, len(read_data))
+    ctypes.memmove(buf, read_data, length)
+    read_data = None
+
+    return length
+
+
+@write_data_cb_t
+def migr_write_data_cb(_ctx, buf, count):
+    global write_data
+
+    if callbacks_errno != 0:
+        set_real_errno(callbacks_errno)
+        return -1
+
+    # Copy the written data out of the C buffer; string_at() avoids
+    # memmove()ing into an immutable bytes object.
+    write_data = ctypes.string_at(buf, count)
+
+    return count
+
+
+def setup_fail_callbacks(err):
+    global callbacks_errno
+    callbacks_errno = err
+
+
+def teardown_fail_callbacks():
+    global callbacks_errno
+    callbacks_errno = 0
+    ctypes.set_errno(0)
+
+
+def teardown_function(function):
+    teardown_fail_callbacks()
+
+
+def transition_to_migr_state(state, expect=0, rsp=True, busy=False):
+    return transition_to_state(ctx, client.sock, state, expect, rsp, busy)
+
+
+def mig_data_payload(data):
+    argsz = len(vfio_user_mig_data()) + len(data)
+    return vfio_user_mig_data(
+        argsz=argsz,
+        size=len(data)
+    )
+
+
+def test_migration_setup():
     global ctx, client
 
     ctx = vfu_create_ctx(flags=LIBVFIO_USER_FLAG_ATTACH_NB)
     assert ctx is not None
 
-    ret = vfu_setup_region(ctx, index=VFU_PCI_DEV_MIGR_REGION_IDX,
-                           size=2 << PAGE_SHIFT, flags=VFU_REGION_FLAG_RW)
-    assert ret == 0
+    cbs = vfu_migration_callbacks_t()
+    cbs.version = 1  # old callbacks version
+    cbs.transition = migr_trans_cb
+    cbs.read_data = migr_read_data_cb
+    cbs.write_data = migr_write_data_cb
 
-    ret = vfu_setup_device_migration_callbacks(ctx)
+    ret = vfu_setup_device_migration_callbacks(ctx, cbs)
+    assert ret < 0, "do not allow old callbacks version"
+
+    cbs.version = VFU_MIGR_CALLBACKS_VERS  # new callbacks version
+    ret = vfu_setup_device_migration_callbacks(ctx, cbs)
     assert ret == 0
 
     vfu_setup_device_quiesce_cb(ctx)
@@ -54,113 +173,409 @@ def setup_function(function):
     ret = vfu_realize_ctx(ctx)
     assert ret == 0
 
-    client = connect_client(ctx)
-
+    caps = {
+        "capabilities": {
+            "max_data_xfer_size": MAX_DATA_XFER_SIZE,
+        }
+    }
 
-def teardown_function(function):
-    global ctx
-    vfu_destroy_ctx(ctx)
+    client = connect_client(ctx, caps)
 
 
-@patch('libvfio_user.quiesce_cb')
-@patch('libvfio_user.migr_trans_cb')
-def test_migration_bad_access(mock_trans, mock_quiesce):
+def server_transition_track_path(a, b, expectA=0, expectB=0):
     """
-    Tests that attempting to access the migration state register in an
-    non-aligned manner fails.
-
-    This test is important because we tell whether we need to quiesce by
-    checking for a register-sized access, otherwise we'll change migration
-    state without having quiesced.
+    Carry out the state transition from a to b on the server, keeping track of
+    and returning the transition path taken.
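+
+    The path is recorded via migr_trans_cb(), which appends each state
+    entered to the global `path` list. If the transition into the initial
+    state `a` is expected to fail (expectA != 0), None is returned.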
""" - global ctx, client - data = VFIO_DEVICE_STATE_V1_SAVING.to_bytes(c.sizeof(c.c_int), 'little') - write_region(ctx, client.sock, VFU_PCI_DEV_MIGR_REGION_IDX, offset=0, - count=len(data)-1, data=data, expect=errno.EINVAL) + global path - mock_trans.assert_not_called() + if current_state == VFIO_USER_DEVICE_STATE_STOP_COPY and \ + a == VFIO_USER_DEVICE_STATE_PRE_COPY: + # The transition STOP_COPY -> PRE_COPY is explicitly blocked so we + # advance one state to get around this in order to set up the test. + transition_to_migr_state(VFIO_USER_DEVICE_STATE_STOP) + transition_to_migr_state(a, expect=expectA) -@patch('libvfio_user.quiesce_cb') -@patch('libvfio_user.migr_trans_cb', return_value=0) -def test_migration_trans_sync(mock_trans, mock_quiesce): - """ - Tests transitioning to the saving state. - """ + if expectA != 0: + return None - global ctx, client + path = [] - data = VFIO_DEVICE_STATE_V1_SAVING.to_bytes(c.sizeof(c.c_int), 'little') - write_region(ctx, client.sock, VFU_PCI_DEV_MIGR_REGION_IDX, offset=0, - count=len(data), data=data) + transition_to_migr_state(b, expect=expectB) - ret = vfu_run_ctx(ctx) - assert ret == 0 + return path.copy() -@patch('libvfio_user.migr_trans_cb', side_effect=fail_with_errno(errno.EPERM)) -def test_migration_trans_sync_err(mock_trans): +def test_migration_shortest_state_transition_paths(): """ - Tests the device returning an error when the migration state is written to. + The spec dictates that complex state transitions are to be implemented as + combinations of the defined direct transitions, with the path selected + according to the following rules: + + - Select the shortest path. + - The path cannot have saving group states as interior arcs, only start/end + states. + + This test implements a breadth-first search to ensure that the paths taken + by the implementation correctly follow these rules. """ - global ctx, client + # allowed direct transitions (edges) + E = { + VFIO_USER_DEVICE_STATE_ERROR: set(), + VFIO_USER_DEVICE_STATE_STOP: { + VFIO_USER_DEVICE_STATE_RUNNING, + VFIO_USER_DEVICE_STATE_STOP_COPY, + VFIO_USER_DEVICE_STATE_RESUMING + }, + VFIO_USER_DEVICE_STATE_RUNNING: { + VFIO_USER_DEVICE_STATE_STOP, + VFIO_USER_DEVICE_STATE_PRE_COPY + }, + VFIO_USER_DEVICE_STATE_STOP_COPY: {VFIO_USER_DEVICE_STATE_STOP}, + VFIO_USER_DEVICE_STATE_RESUMING: {VFIO_USER_DEVICE_STATE_STOP}, + VFIO_USER_DEVICE_STATE_RUNNING_P2P: set(), + VFIO_USER_DEVICE_STATE_PRE_COPY: { + VFIO_USER_DEVICE_STATE_RUNNING, + VFIO_USER_DEVICE_STATE_STOP_COPY + }, + VFIO_USER_DEVICE_STATE_PRE_COPY_P2P: set() + } + + # states (vertices) + V = E.keys() + + # "saving states" which cannot be internal arcs + saving_states = {VFIO_USER_DEVICE_STATE_PRE_COPY, + VFIO_USER_DEVICE_STATE_STOP_COPY} + + # Consider each vertex in turn to be the start state, that is, the state + # we are transitioning from. + for source in V: + # The previous node in the shortest path for each node, e.g. for + # shortest path `source -> node -> target`, `back[node] == source`. + back = {v: None for v in V} + queue = deque([(source, None)]) + + # Use BFS to calculate the shortest path from the start state to every + # other state, following the rule that no intermediate states can be + # saving states. + while len(queue) > 0: + (curr, prev) = queue.popleft() + back[curr] = prev + + # Intermediate states cannot be saving states, so if our current + # node is not the start state and it is a saving state, it is only + # allowed to be an end state so we don't explore its neighbours. 
+            if curr != source and curr in saving_states:
+                continue
+
+            for nxt in E[curr]:
+                if back[nxt] is None:
+                    queue.append((nxt, curr))
+
+        # Iterate over the states
+        for target in V:
+            if source == VFIO_USER_DEVICE_STATE_STOP_COPY \
+                    and target == VFIO_USER_DEVICE_STATE_PRE_COPY:
+                # test for this transition being blocked in a separate test
+                continue
+
+            # If BFS found a path to that state, follow the backpointers to
+            # calculate the path, and check that it's equal to the path taken
+            # by the server.
+            if back[target] is not None:
+                seq = deque([])
+                curr = target
+                while curr != source:
+                    seq.appendleft(curr)
+                    curr = back[curr]
+
+                server_seq = server_transition_track_path(source, target)
+
+                assert len(seq) == len(server_seq)
+                assert all(seq[i] == server_seq[i] for i in range(len(seq)))
+
+            # If BFS couldn't find a path to that state, check that the server
+            # doesn't allow that transition either.
+            else:
+                # If the start state is an unreachable state, we won't be able
+                # to transition into it in order to try to calculate a path on
+                # the server, so we expect that transition to fail.
+                expectA = errno.EINVAL if source in UNREACHABLE_STATES else 0
+
+                # No matter what, we expect transitioning to the target state
+                # to fail.
+                server_transition_track_path(source, target, expectA=expectA,
+                                             expectB=errno.EINVAL)
+
+
+def test_migration_stop_copy_to_pre_copy_rejected():
+    transition_to_migr_state(VFIO_USER_DEVICE_STATE_STOP_COPY)
+    transition_to_migr_state(VFIO_USER_DEVICE_STATE_PRE_COPY,
+                             expect=errno.EINVAL)
+
+
+def test_migration_nonexistent_state():
+    transition_to_migr_state(0xabcd, expect=errno.EINVAL)
+
+
+def test_migration_failed_callback():
+    setup_fail_callbacks(0xbeef)
+    transition_to_migr_state(VFIO_USER_DEVICE_STATE_RUNNING, expect=0xbeef)
+    assert ctypes.get_errno() == 0xbeef
+    teardown_fail_callbacks()
+
+
+def test_migration_get_state():
+    transition_to_migr_state(VFIO_USER_DEVICE_STATE_RUNNING)
+
+    feature = vfio_user_device_feature(
+        argsz=len(vfio_user_device_feature()) +
+        len(vfio_user_device_feature_mig_state()),
+        flags=VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE
+    )
+
+    result = msg(ctx, client.sock, VFIO_USER_DEVICE_FEATURE, feature)
+    _, result = vfio_user_device_feature.pop_from_buffer(result)
+    state, _ = vfio_user_device_feature_mig_state.pop_from_buffer(result)
+    assert state.device_state == VFIO_USER_DEVICE_STATE_RUNNING
 
-    data = VFIO_DEVICE_STATE_V1_SAVING.to_bytes(c.sizeof(c.c_int), 'little')
-    write_region(ctx, client.sock, VFU_PCI_DEV_MIGR_REGION_IDX, offset=0,
-                 count=len(data), data=data, expect=errno.EPERM)
 
-    ret = vfu_run_ctx(ctx)
-    assert ret == 0
+def test_handle_mig_data_read():
+    global read_data
+
+    transition_to_migr_state(VFIO_USER_DEVICE_STATE_RUNNING)
+
+    data = bytes([0, 1, 2, 3])
+    payload = mig_data_payload(data)
+
+    VALID_STATES = {VFIO_USER_DEVICE_STATE_PRE_COPY,
+                    VFIO_USER_DEVICE_STATE_STOP_COPY}
+
+    for state in STATES:
+        transition_to_migr_state(state)
+        read_data = data
+        expect = 0 if state in VALID_STATES else errno.EINVAL
+        result = msg(ctx, client.sock, VFIO_USER_MIG_DATA_READ, payload,
+                     expect=expect)
+
+        if state in VALID_STATES:
+            assert len(result) == len(payload) + len(data)
+            assert result[len(vfio_user_mig_data()):] == data
 
-@patch('libvfio_user.quiesce_cb', side_effect=fail_with_errno(errno.EBUSY))
-@patch('libvfio_user.migr_trans_cb', return_value=0)
-def test_migration_trans_async(mock_trans, mock_quiesce):
+def test_handle_mig_data_read_too_long():
     """
-    Tests transitioning to the saving state where the device is initially busy
-    quiescing.
+    When we set up the tests at the top of this file, we specify a maximum
+    data transfer size of 4 bytes. Here we check that a transfer of more
+    bytes than that fails.
     """
-    global ctx, client
-    mock_quiesce
+    global read_data
 
-    data = VFIO_DEVICE_STATE_V1_SAVING.to_bytes(c.sizeof(c.c_int), 'little')
-    write_region(ctx, client.sock, VFU_PCI_DEV_MIGR_REGION_IDX, offset=0,
-                 count=len(data), data=data, rsp=False,
-                 busy=True)
+    transition_to_migr_state(VFIO_USER_DEVICE_STATE_RUNNING)
+    transition_to_migr_state(VFIO_USER_DEVICE_STATE_PRE_COPY)
 
-    ret = vfu_device_quiesced(ctx, 0)
-    assert ret == 0
+    # Create a read payload one byte longer than the max.
+    read_data = bytes([i for i in range(MAX_DATA_XFER_SIZE + 1)])
+    payload = mig_data_payload(read_data)
 
-    get_reply(client.sock)
+    msg(ctx, client.sock, VFIO_USER_MIG_DATA_READ, payload,
+        expect=errno.EINVAL)
+
+
+def test_handle_mig_data_read_failed_callback():
+    transition_to_migr_state(VFIO_USER_DEVICE_STATE_PRE_COPY)
+
+    read_data = bytes([1, 2, 3, 4])
+    payload = mig_data_payload(read_data)
+
+    setup_fail_callbacks(0xbeef)
+
+    msg(ctx, client.sock, VFIO_USER_MIG_DATA_READ, payload, expect=0xbeef)
+    assert ctypes.get_errno() == 0xbeef
+
+
+def test_handle_mig_data_read_short_write():
+    data = bytes([1, 2, 3, 4])
+    payload = bytes(mig_data_payload(data))
+
+    # don't send the last byte
+    msg(ctx, client.sock, VFIO_USER_MIG_DATA_READ, payload[:-1],
+        expect=errno.EINVAL)
+
+
+def test_handle_mig_data_write():
+    data = bytes([1, 2, 3, 4])
+    payload = mig_data_payload(data)
+
+    transition_to_migr_state(VFIO_USER_DEVICE_STATE_RESUMING)
+    msg(ctx, client.sock, VFIO_USER_MIG_DATA_WRITE, bytes(payload) + data)
+    assert write_data == data
 
-    ret = vfu_run_ctx(ctx)
-    assert ret == 0
 
+def test_handle_mig_data_write_invalid_state():
+    data = bytes([1, 2, 3, 4])
+    payload = mig_data_payload(data)
 
-@patch('libvfio_user.quiesce_cb', side_effect=fail_with_errno(errno.EBUSY))
-@patch('libvfio_user.migr_trans_cb', side_effect=fail_with_errno(errno.ENOTTY))
-def test_migration_trans_async_err(mock_trans, mock_quiesce):
+    transition_to_migr_state(VFIO_USER_DEVICE_STATE_RUNNING)
+    msg(ctx, client.sock, VFIO_USER_MIG_DATA_WRITE, bytes(payload) + data,
+        expect=errno.EINVAL)
+
+
+def test_handle_mig_data_write_too_long():
     """
-    Tests writing to the migration state register, the device not being able to
-    immediately quiesce, and then finally the device failing to transition to
-    the new migration state.
+    When we set up the tests at the top of this file, we specify a maximum
+    data transfer size of 4 bytes. Here we check that a transfer of more
+    bytes than that fails.
     """
-    global ctx, client
 
+    # Create a write payload one byte longer than the max.
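+    # With MAX_DATA_XFER_SIZE == 4 this is a 5-byte transfer, which the
+    # server should reject.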
+    data = bytes([i for i in range(MAX_DATA_XFER_SIZE + 1)])
+    payload = mig_data_payload(data)
 
-    data = VFIO_DEVICE_STATE_V1_RUNNING.to_bytes(c.sizeof(c.c_int), 'little')
-    write_region(ctx, client.sock, VFU_PCI_DEV_MIGR_REGION_IDX, offset=0,
-                 count=len(data), data=data, rsp=False,
-                 busy=True)
+    transition_to_migr_state(VFIO_USER_DEVICE_STATE_RESUMING)
+    msg(ctx, client.sock, VFIO_USER_MIG_DATA_WRITE, bytes(payload) + data,
+        expect=errno.EINVAL)
 
-    ret = vfu_device_quiesced(ctx, 0)
-    assert ret == 0
 
-    print("waiting for reply")
-    get_reply(client.sock, errno.ENOTTY)
-    print("received reply")
+def test_handle_mig_data_write_failed_callback():
+    transition_to_migr_state(VFIO_USER_DEVICE_STATE_RESUMING)
+
+    data = bytes([1, 2, 3, 4])
+    payload = mig_data_payload(data)
+
+    setup_fail_callbacks(0xbeef)
+
+    msg(ctx, client.sock, VFIO_USER_MIG_DATA_WRITE, bytes(payload) + data,
+        expect=0xbeef)
+    assert ctypes.get_errno() == 0xbeef
+
+
+def test_handle_mig_data_write_short_write():
+    data = bytes([1, 2, 3, 4])
+    payload = mig_data_payload(data)
+
+    msg(ctx, client.sock, VFIO_USER_MIG_DATA_WRITE, payload,
+        expect=errno.EINVAL)
+
+
+def test_device_feature_migration_get():
+    payload = vfio_user_device_feature(
+        argsz=len(vfio_user_device_feature()) +
+        len(vfio_user_device_feature_migration()),
+        flags=VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_MIGRATION
+    )
+
+    result = msg(ctx, client.sock, VFIO_USER_DEVICE_FEATURE, payload)
+    _, result = vfio_user_device_feature.pop_from_buffer(result)
+    flags, _ = vfio_user_device_feature_migration.pop_from_buffer(result)
+    flags = flags.flags
+
+    assert flags == VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_PRE_COPY
+
+
+def test_device_feature_short_write():
+    payload = vfio_user_device_feature(
+        argsz=len(vfio_user_device_feature()) +
+        len(vfio_user_device_feature_migration()),
+        flags=VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_MIGRATION
+    )
+
+    payload = bytes(payload)
+
+    # don't send the last byte
+    msg(ctx, client.sock, VFIO_USER_DEVICE_FEATURE, payload[:-1],
+        expect=errno.EINVAL)
+
+
+def test_device_feature_unsupported_operation():
+    payload = vfio_user_device_feature(
+        argsz=len(vfio_user_device_feature()) +
+        len(vfio_user_device_feature_migration()),
+        flags=VFIO_DEVICE_FEATURE_SET | VFIO_DEVICE_FEATURE_MIGRATION
+    )
+
+    msg(ctx, client.sock, VFIO_USER_DEVICE_FEATURE, payload,
+        expect=errno.EINVAL)
+
+
+def test_device_feature_bad_argsz_probe():
+    payload = vfio_user_device_feature(
+        argsz=2,
+        flags=VFIO_DEVICE_FEATURE_PROBE | VFIO_DEVICE_FEATURE_MIGRATION
+    )
+
+    msg(ctx, client.sock, VFIO_USER_DEVICE_FEATURE, payload,
+        expect=errno.EINVAL)
+
+
+def test_device_feature_bad_argsz_get_migration():
+    payload = vfio_user_device_feature(
+        argsz=len(vfio_user_device_feature()),
+        flags=VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_MIGRATION
+    )
+
+    msg(ctx, client.sock, VFIO_USER_DEVICE_FEATURE, payload,
+        expect=errno.EINVAL)
+
+
+def test_device_feature_bad_argsz_get_dma():
+    argsz = len(vfio_user_device_feature()) + \
+        len(vfio_user_device_feature_dma_logging_report()) + \
+        get_bitmap_size(0x20 << PAGE_SHIFT, PAGE_SIZE)
+
+    feature = vfio_user_device_feature(
+        argsz=argsz - 1,  # not big enough
+        flags=VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT | VFIO_DEVICE_FEATURE_GET
+    )
+
+    report = vfio_user_device_feature_dma_logging_report(
+        iova=0x10 << PAGE_SHIFT,
+        length=0x20 << PAGE_SHIFT,
+        page_size=PAGE_SIZE
+    )
+
+    msg(ctx, client.sock, VFIO_USER_DEVICE_FEATURE, bytes(feature) +
+        bytes(report), expect=errno.EINVAL)
+
+
+def 
test_device_feature_bad_argsz_set(): + feature = vfio_user_device_feature( + argsz=len(vfio_user_device_feature()), # no space for state data + flags=VFIO_DEVICE_FEATURE_SET | VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE + ) + payload = vfio_user_device_feature_mig_state( + device_state=VFIO_USER_DEVICE_STATE_RUNNING + ) + msg(ctx, client.sock, VFIO_USER_DEVICE_FEATURE, bytes(feature) + + bytes(payload), expect=errno.EINVAL) + + +def test_device_feature_probe(): + payload = vfio_user_device_feature( + argsz=len(vfio_user_device_feature()), + flags=VFIO_DEVICE_FEATURE_PROBE | VFIO_DEVICE_FEATURE_MIGRATION + ) + + result = msg(ctx, client.sock, VFIO_USER_DEVICE_FEATURE, payload) + assert bytes(payload) == result + + payload = vfio_user_device_feature( + argsz=len(vfio_user_device_feature()), + flags=VFIO_DEVICE_FEATURE_PROBE | VFIO_DEVICE_FEATURE_SET | + VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_MIGRATION + ) + + msg(ctx, client.sock, VFIO_USER_DEVICE_FEATURE, payload, + expect=errno.EINVAL) + + +def test_migration_cleanup(): + client.disconnect(ctx) + vfu_destroy_ctx(ctx) # ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: # diff --git a/test/py/test_quiesce.py b/test/py/test_quiesce.py index 3f728270..3e1dbca3 100644 --- a/test/py/test_quiesce.py +++ b/test/py/test_quiesce.py @@ -31,9 +31,10 @@ import errno from unittest import mock from unittest.mock import patch - +import tempfile ctx = None +client = None def setup_function(function): @@ -197,32 +198,28 @@ def test_allowed_funcs_in_quiesced_dma_unregister_busy(mock_quiesce, @patch('libvfio_user.migr_trans_cb', side_effect=_side_effect) @patch('libvfio_user.quiesce_cb') -def test_allowed_funcs_in_quiesed_migration(mock_quiesce, +def test_allowed_funcs_in_quiesced_migration(mock_quiesce, mock_trans): global ctx, client _map_dma_region(ctx, client.sock) - data = VFIO_DEVICE_STATE_V1_SAVING.to_bytes(c.sizeof(c.c_int), 'little') - write_region(ctx, client.sock, VFU_PCI_DEV_MIGR_REGION_IDX, offset=0, - count=len(data), data=data) - mock_trans.assert_called_once_with(ctx, VFIO_DEVICE_STATE_V1_SAVING) + transition_to_state(ctx, client.sock, VFIO_USER_DEVICE_STATE_STOP) + mock_trans.assert_called_once_with(ctx, VFU_MIGR_STATE_STOP) @patch('libvfio_user.migr_trans_cb', side_effect=_side_effect) @patch('libvfio_user.quiesce_cb') -def test_allowed_funcs_in_quiesed_migration_busy(mock_quiesce, +def test_allowed_funcs_in_quiesced_migration_busy(mock_quiesce, mock_trans): global ctx, client _map_dma_region(ctx, client.sock) mock_quiesce.side_effect = fail_with_errno(errno.EBUSY) - data = VFIO_DEVICE_STATE_V1_STOP.to_bytes(c.sizeof(c.c_int), 'little') - write_region(ctx, client.sock, VFU_PCI_DEV_MIGR_REGION_IDX, offset=0, - count=len(data), data=data, rsp=False, - busy=True) + transition_to_state(ctx, client.sock, VFIO_USER_DEVICE_STATE_STOP, + busy=True) ret = vfu_device_quiesced(ctx, 0) assert ret == 0 - mock_trans.assert_called_once_with(ctx, VFIO_DEVICE_STATE_V1_STOP) + mock_trans.assert_called_once_with(ctx, VFU_MIGR_STATE_STOP) @patch('libvfio_user.reset_cb', side_effect=_side_effect) diff --git a/test/py/test_request_errors.py b/test/py/test_request_errors.py index c25a7158..1f89e910 100644 --- a/test/py/test_request_errors.py +++ b/test/py/test_request_errors.py @@ -54,10 +54,6 @@ def setup_function(function): ret = vfu_setup_device_reset_cb(ctx) assert ret == 0 - ret = vfu_setup_region(ctx, index=VFU_PCI_DEV_MIGR_REGION_IDX, - size=2 << PAGE_SHIFT, flags=VFU_REGION_FLAG_RW) - assert ret == 0 - ret = vfu_setup_device_migration_callbacks(ctx) 
assert ret == 0
 
 
@@ -189,24 +185,21 @@ def test_disconnected_socket_quiesce_busy(mock_quiesce):
 
 @patch('libvfio_user.reset_cb')
 @patch('libvfio_user.quiesce_cb', side_effect=fail_with_errno(errno.EBUSY))
-@patch('libvfio_user.migr_get_pending_bytes_cb')
-def test_reply_fail_quiesce_busy(mock_get_pending_bytes, mock_quiesce,
+@patch('libvfio_user.migr_trans_cb')
+def test_reply_fail_quiesce_busy(mock_migr_trans_cb, mock_quiesce,
                                  mock_reset):
     """Tests failing to reply and the quiesce callback returning EBUSY."""
 
     global ctx, client
 
-    def get_pending_bytes_side_effect(ctx):
+    def migr_trans_cb_side_effect(ctx, state):
         client.sock.close()
         return 0
 
-    mock_get_pending_bytes.side_effect = get_pending_bytes_side_effect
-
-    # read the get_pending_bytes register, it should close the socket causing
-    # the reply to fail
-    read_region(ctx, client.sock, VFU_PCI_DEV_MIGR_REGION_IDX,
-                vfio_user_migration_info.pending_bytes.offset,
-                vfio_user_migration_info.pending_bytes.size, rsp=False,
-                busy=True)
+    mock_migr_trans_cb.side_effect = migr_trans_cb_side_effect
+
+    # change the state; this should close the socket, causing the reply
+    # to fail
+    transition_to_state(ctx, client.sock, VFIO_USER_DEVICE_STATE_STOP_COPY,
+                        rsp=False, busy=True)
 
     # vfu_run_ctx will try to reset the context and to do that it needs to
     # quiesce the device first
diff --git a/test/py/test_setup_region.py b/test/py/test_setup_region.py
index 05e64574..f266ed26 100644
--- a/test/py/test_setup_region.py
+++ b/test/py/test_setup_region.py
@@ -111,30 +111,6 @@ def test_setup_region_bad_pci():
     assert c.get_errno() == errno.EINVAL
 
 
-def test_setup_region_bad_migr():
-    ret = vfu_setup_region(ctx, index=VFU_PCI_DEV_MIGR_REGION_IDX, size=512,
-                           flags=(VFU_REGION_FLAG_RW | VFU_REGION_FLAG_MEM))
-    assert ret == -1
-    assert c.get_errno() == errno.EINVAL
-
-    f = tempfile.TemporaryFile()
-    f.truncate(0x2000)
-
-    ret = vfu_setup_region(ctx, index=VFU_PCI_DEV_MIGR_REGION_IDX, size=0x2000,
-                           flags=(VFU_REGION_FLAG_RW | VFU_REGION_FLAG_MEM),
-                           fd=f.fileno())
-    assert ret == -1
-    assert c.get_errno() == errno.EINVAL
-
-    mmap_areas = [(0x0, 0x1000), (0x1000, 0x1000)]
-
-    ret = vfu_setup_region(ctx, index=VFU_PCI_DEV_MIGR_REGION_IDX, size=0x2000,
-                           flags=(VFU_REGION_FLAG_RW | VFU_REGION_FLAG_MEM),
-                           mmap_areas=mmap_areas, fd=f.fileno())
-    assert ret == -1
-    assert c.get_errno() == errno.EINVAL
-
-
 def test_setup_region_cfg_always_cb_nocb():
     ret = vfu_setup_region(ctx, index=VFU_PCI_DEV_CFG_REGION_IDX,
                            size=PCI_CFG_SPACE_EXP_SIZE, cb=None,
diff --git a/test/unit-tests.c b/test/unit-tests.c
index 310eb238..fba7225a 100644
--- a/test/unit-tests.c
+++ b/test/unit-tests.c
@@ -398,182 +398,6 @@ typedef struct {
     int conn_fd;
 } tran_sock_t;
 
-static void
-test_migration_state_transitions(void **state UNUSED)
-{
-    bool (*f)(uint32_t, uint32_t) = vfio_migr_state_transition_is_valid;
-    uint32_t i, j;
-
-    /* from stopped (000b): all transitions are invalid except to running */
-    assert_true(f(0, 0));
-    assert_true(f(0, 1));
-    for (i = 2; i < 8; i++) {
-        assert_false(f(0, i));
-    }
-
-    /* from running (001b) */
-    assert_true(f(1, 0));
-    assert_true(f(1, 1));
-    assert_true(f(1, 2));
-    assert_true(f(1, 3));
-    assert_true(f(1, 4));
-    assert_false(f(1, 5));
-    assert_true(f(1, 6));
-    assert_false(f(1, 7));
-
-    /* from stop-and-copy (010b) */
-    assert_true(f(2, 0));
-    assert_true(f(2, 1));
-    assert_true(f(2, 2));
-    assert_false(f(2, 3));
-    assert_false(f(2, 4));
-    assert_false(f(2, 5));
-    assert_true(f(2, 6));
-    assert_false(f(2, 7));
-
-    /* from pre-copy (011b) */
-    assert_true(f(3, 
0)); - assert_true(f(3, 1)); - assert_true(f(3, 2)); - assert_false(f(3, 3)); - assert_false(f(3, 4)); - assert_false(f(3, 5)); - assert_true(f(3, 6)); - assert_false(f(3, 7)); - - /* from resuming (100b) */ - assert_false(f(4, 0)); - assert_true(f(4, 1)); - assert_false(f(4, 2)); - assert_false(f(4, 3)); - assert_true(f(4, 4)); - assert_false(f(4, 5)); - assert_true(f(4, 6)); - assert_false(f(4, 7)); - - /* - * Transitioning to any other state from the remaining 3 states - * (101b - invalid, 110b - error, 111b - invalid) is invalid. - * Transitioning from the error state to the stopped state is possible but - * that requires a device reset, so we don't consider it a valid state - * transition. - */ - for (i = 5; i < 8; i++) { - for (j = 0; j < 8; j++) { - assert_false(f(i, j)); - } - } -} - -static struct test_setup_migr_reg_dat { - vfu_ctx_t *v; - size_t rs; /* migration registers size */ - size_t ds; /* migration data size */ - size_t s; /* migration region size*/ - const vfu_migration_callbacks_t c; -} migr_reg_data = { - .c = { - .version = VFU_MIGR_CALLBACKS_VERS, - .transition = (void *)0x1, - .get_pending_bytes = (void *)0x2, - .prepare_data = (void *)0x3, - .read_data = (void *)0x4, - .write_data = (void *)0x5, - .data_written = (void *)0x6 - } -}; - -static int -setup_test_setup_migration_region(void **state) -{ - struct test_setup_migr_reg_dat *p = &migr_reg_data; - p->v = vfu_create_ctx(VFU_TRANS_SOCK, "test", 0, NULL, - VFU_DEV_TYPE_PCI); - if (p->v == NULL) { - return -1; - } - p->rs = ROUND_UP(sizeof(struct vfio_user_migration_info), - sysconf(_SC_PAGE_SIZE)); - p->ds = sysconf(_SC_PAGE_SIZE); - p->s = p->rs + p->ds; - *state = p; - return setup(state); -} - -static vfu_ctx_t * -get_vfu_ctx(void **state) -{ - return (*((struct test_setup_migr_reg_dat **)(state)))->v; -} - -static int -teardown_test_setup_migration_region(void **state) -{ - struct test_setup_migr_reg_dat *p = *state; - vfu_destroy_ctx(p->v); - return 0; -} - -static void -test_setup_migration_region_size_ok(void **state) -{ - vfu_ctx_t *v = get_vfu_ctx(state); - int r = vfu_setup_region(v, VFU_PCI_DEV_MIGR_REGION_IDX, - vfu_get_migr_register_area_size(), NULL, - VFU_REGION_FLAG_READ | VFU_REGION_FLAG_WRITE, NULL, 0, -1, 0); - assert_int_equal(0, r); -} - -static void -test_setup_migration_region_sparsely_mappable_valid(void **state) -{ - struct test_setup_migr_reg_dat *p = *state; - struct iovec mmap_areas[] = { - [0] = { - .iov_base = (void *)p->rs, - .iov_len = p->ds - } - }; - int r = vfu_setup_region(p->v, VFU_PCI_DEV_MIGR_REGION_IDX, p->s, NULL, - VFU_REGION_FLAG_READ | VFU_REGION_FLAG_WRITE, mmap_areas, 1, - 0xdeadbeef, 0); - assert_int_equal(0, r); -} - -static void -test_setup_migration_callbacks_without_migration_region(void **state) -{ - struct test_setup_migr_reg_dat *p = *state; - assert_int_equal(-1, vfu_setup_device_migration_callbacks(p->v, &p->c, 0)); - assert_int_equal(EINVAL, errno); -} - -static void -test_setup_migration_callbacks_bad_data_offset(void **state) -{ - struct test_setup_migr_reg_dat *p = *state; - int r = vfu_setup_region(p->v, VFU_PCI_DEV_MIGR_REGION_IDX, p->s, NULL, - VFU_REGION_FLAG_READ | VFU_REGION_FLAG_WRITE, NULL, 0, -1, 0); - assert_int_equal(0, r); - r = vfu_setup_device_migration_callbacks(p->v, &p->c, - vfu_get_migr_register_area_size() - 1); - assert_int_equal(-1, r); -} - -static void -test_setup_migration_callbacks(void **state) -{ - struct test_setup_migr_reg_dat *p = *state; - int r = vfu_setup_region(p->v, VFU_PCI_DEV_MIGR_REGION_IDX, p->s, NULL, - 
VFU_REGION_FLAG_READ | VFU_REGION_FLAG_WRITE, NULL, 0, -1, 0); - assert_int_equal(0, r); - r = vfu_setup_device_migration_callbacks(p->v, &p->c, - vfu_get_migr_register_area_size()); - assert_int_equal(0, r); - assert_non_null(p->v->migration); - /* FIXME can't validate p->v->migration because it's a private strcut, need to move it out of lib/migration.c */ -} - static void test_device_is_stopped_and_copying(UNUSED void **state) { @@ -583,19 +407,16 @@ test_device_is_stopped_and_copying(UNUSED void **state) size_t i; struct migration migration; vfu_ctx.migration = &migration; - for (i = 0; i < ARRAY_SIZE(migr_states); i++) { - if (migr_states[i].name == NULL) { - continue; - } - migration.info.device_state = i; + for (i = 0; i < VFIO_USER_DEVICE_NUM_STATES; i++) { + migration.state = i; bool r = device_is_stopped_and_copying(vfu_ctx.migration); - if (i == VFIO_DEVICE_STATE_V1_SAVING) { + if (i == VFIO_USER_DEVICE_STATE_STOP_COPY) { assert_true(r); } else { assert_false(r); } r = device_is_stopped(vfu_ctx.migration); - if (i == VFIO_DEVICE_STATE_V1_STOP) { + if (i == VFIO_USER_DEVICE_STATE_STOP) { assert_true(r); } else { assert_false(r); @@ -611,8 +432,10 @@ test_cmd_allowed_when_stopped_and_copying(UNUSED void **state) for (i = 0; i < VFIO_USER_MAX; i++) { bool r = cmd_allowed_when_stopped_and_copying(i); - if (i == VFIO_USER_REGION_READ || i == VFIO_USER_REGION_WRITE || - i == VFIO_USER_DIRTY_PAGES) { + if (i == VFIO_USER_REGION_READ || + i == VFIO_USER_REGION_WRITE || + i == VFIO_USER_DEVICE_FEATURE || + i == VFIO_USER_MIG_DATA_READ) { assert_true(r); } else { assert_false(r); @@ -623,7 +446,7 @@ test_cmd_allowed_when_stopped_and_copying(UNUSED void **state) static void test_should_exec_command(UNUSED void **state) { - struct migration migration = { { 0 } }; + struct migration migration = { 0 }; vfu_ctx.migration = &migration; @@ -675,22 +498,6 @@ main(void) cmocka_unit_test_setup(test_dma_controller_remove_region_unmapped, setup), cmocka_unit_test_setup(test_dma_addr_to_sgl, setup), cmocka_unit_test_setup(test_vfu_setup_device_dma, setup), - cmocka_unit_test_setup(test_migration_state_transitions, setup), - cmocka_unit_test_setup_teardown(test_setup_migration_region_size_ok, - setup_test_setup_migration_region, - teardown_test_setup_migration_region), - cmocka_unit_test_setup_teardown(test_setup_migration_region_sparsely_mappable_valid, - setup_test_setup_migration_region, - teardown_test_setup_migration_region), - cmocka_unit_test_setup_teardown(test_setup_migration_callbacks_without_migration_region, - setup_test_setup_migration_region, - teardown_test_setup_migration_region), - cmocka_unit_test_setup_teardown(test_setup_migration_callbacks_bad_data_offset, - setup_test_setup_migration_region, - teardown_test_setup_migration_region), - cmocka_unit_test_setup_teardown(test_setup_migration_callbacks, - setup_test_setup_migration_region, - teardown_test_setup_migration_region), cmocka_unit_test_setup(test_device_is_stopped_and_copying, setup), cmocka_unit_test_setup(test_cmd_allowed_when_stopped_and_copying, setup), cmocka_unit_test_setup(test_should_exec_command, setup),