# The current Travis default is a VM based 16.04 Xenial on GCE
# Additional builds with specific requirements for a full VM need to
# be added as additional matrix: entries later on
+os: linux
dist: xenial
language: c
compiler:
- if command -v ccache ; then ccache --show-stats ; fi
-matrix:
+jobs:
include:
- name: "GCC static (user)"
env:
- CONFIG="--target-list=x86_64-softmmu"
- CACHE_NAME="${TRAVIS_BRANCH}-linux-gcc-default"
language: python
- python:
- - "3.5"
+ python: 3.5
- name: "GCC Python 3.6 (x86_64-softmmu)"
- CONFIG="--target-list=x86_64-softmmu"
- CACHE_NAME="${TRAVIS_BRANCH}-linux-gcc-default"
language: python
- python:
- - "3.6"
+ python: 3.6
# Acceptance (Functional) tests
- name: "GCC check-tcg (some-softmmu)"
env:
- CONFIG="--enable-debug-tcg --target-list=xtensa-softmmu,arm-softmmu,aarch64-softmmu,alpha-softmmu"
- - TEST_BUILD_CMD="make -j${JOBS} build-tcg"
+ - TEST_BUILD_CMD="make build-tcg"
- TEST_CMD="make check-tcg"
- CACHE_NAME="${TRAVIS_BRANCH}-linux-gcc-debug-tcg"
- name: "GCC plugins check-tcg (some-softmmu)"
env:
- CONFIG="--enable-plugins --enable-debug-tcg --target-list=xtensa-softmmu,arm-softmmu,aarch64-softmmu,alpha-softmmu"
- - TEST_BUILD_CMD="make -j${JOBS} build-tcg"
+ - TEST_BUILD_CMD="make build-tcg"
- TEST_CMD="make check-tcg"
- CACHE_NAME="${TRAVIS_BRANCH}-linux-gcc-debug-tcg"
env:
- TEST_CMD="make check check-tcg V=1"
- CONFIG="--disable-containers --target-list=${MAIN_SOFTMMU_TARGETS},s390x-linux-user"
+ script:
+ - ( cd ${SRC_DIR} ; git submodule update --init roms/SLOF )
+ - BUILD_RC=0 && make -j${JOBS} || BUILD_RC=$?
+ - |
+ if [ "$BUILD_RC" -eq 0 ] ; then
+ mv pc-bios/s390-ccw/*.img pc-bios/ ;
+ ${TEST_CMD} ;
+ else
+ $(exit $BUILD_RC);
+ fi
# Release builds
# The make-release script expect a QEMU version, so our tag must start with a 'v'.
F: hw/s390x/ipl.*
F: pc-bios/s390-ccw/
F: pc-bios/s390-ccw.img
-F: docs/devel/s390-dasd-ipl.txt
+F: docs/devel/s390-dasd-ipl.rst
T: git https://github.com/borntraeger/qemu.git s390-next
L: qemu-s390x@nongnu.org
F: include/hw/s390x/ap-device.h
F: include/hw/s390x/ap-bridge.h
F: hw/vfio/ap.c
-F: docs/vfio-ap.txt
+F: docs/system/vfio-ap.rst
L: qemu-s390x@nongnu.org
vhost
F: include/hw/virtio/virtio-input.h
F: contrib/vhost-user-input/*
+virtio-iommu
+M: Eric Auger <eric.auger@redhat.com>
+S: Maintained
+F: hw/virtio/virtio-iommu*.c
+F: include/hw/virtio/virtio-iommu.h
+
virtio-serial
M: Laurent Vivier <lvivier@redhat.com>
R: Amit Shah <amit@kernel.org>
static int nbd_client_connect(BlockDriverState *bs, Error **errp);
+/*
+ * Release the resources referenced from @s: the TLS credentials object
+ * reference, the socket address, and the export, tlscredsid and
+ * x_dirty_bitmap strings.
+ *
+ * Every pointer is reset to NULL after it is released, so that calling
+ * this helper again on the same state cannot unref or free anything twice.
+ */
+static void nbd_clear_bdrvstate(BDRVNBDState *s)
+{
+    object_unref(OBJECT(s->tlscreds));
+    s->tlscreds = NULL;
+    qapi_free_SocketAddress(s->saddr);
+    s->saddr = NULL;
+    g_free(s->export);
+    s->export = NULL;
+    g_free(s->tlscredsid);
+    s->tlscredsid = NULL;
+    g_free(s->x_dirty_bitmap);
+    s->x_dirty_bitmap = NULL;
+}
+
static void nbd_channel_error(BDRVNBDState *s, int ret)
{
if (ret == -EIO) {
goto out;
}
- p = uri->path ? uri->path : "/";
- p += strspn(p, "/");
+ p = uri->path ? uri->path : "";
+ if (p[0] == '/') {
+ p++;
+ }
if (p[0]) {
qdict_put_str(options, "export", p);
}
error:
if (ret < 0) {
- object_unref(OBJECT(s->tlscreds));
- qapi_free_SocketAddress(s->saddr);
- g_free(s->export);
- g_free(s->tlscredsid);
- g_free(s->x_dirty_bitmap);
+ nbd_clear_bdrvstate(s);
}
qemu_opts_del(opts);
return ret;
ret = nbd_client_connect(bs, errp);
if (ret < 0) {
+ nbd_clear_bdrvstate(s);
return ret;
}
/* successfully connected */
BDRVNBDState *s = bs->opaque;
nbd_client_close(bs);
-
- object_unref(OBJECT(s->tlscreds));
- qapi_free_SocketAddress(s->saddr);
- g_free(s->export);
- g_free(s->tlscredsid);
- g_free(s->x_dirty_bitmap);
+ nbd_clear_bdrvstate(s);
}
static int64_t nbd_getlength(BlockDriverState *bs)
src->gfd.events = cond;
g_source_add_poll(gsrc, &src->gfd);
- id = g_source_attach(gsrc, NULL);
+ id = g_source_attach(gsrc, g_main_context_get_thread_default());
g_assert(id);
- g_source_unref(gsrc);
return gsrc;
}
}
}
+/*
+ * Destroy @src and drop one reference to it.
+ * A NULL @src is accepted and ignored.
+ */
+void vug_source_destroy(GSource *src)
+{
+    if (src) {
+        g_source_destroy(src);
+        g_source_unref(src);
+    }
+}
+
bool
vug_init(VugDev *dev, uint16_t max_queues, int socket,
vu_panic_cb panic, const VuDevIface *iface)
}
dev->fdmap = g_hash_table_new_full(NULL, NULL, NULL,
- (GDestroyNotify) g_source_destroy);
+ (GDestroyNotify) vug_source_destroy);
dev->src = vug_source_new(dev, socket, G_IO_IN, vug_watch, NULL);
g_assert(dev);
g_hash_table_unref(dev->fdmap);
- g_source_unref(dev->src);
+ vug_source_destroy(dev->src);
}
GSource *vug_source_new(VugDev *dev, int fd, GIOCondition cond,
vu_watch_cb vu_cb, gpointer data);
+void vug_source_destroy(GSource *src);
#endif /* LIBVHOST_USER_GLIB_H */
REQ(VHOST_USER_GET_INFLIGHT_FD),
REQ(VHOST_USER_SET_INFLIGHT_FD),
REQ(VHOST_USER_GPU_SET_SOCKET),
+ REQ(VHOST_USER_VRING_KICK),
REQ(VHOST_USER_MAX),
};
#undef REQ
dev->panic(dev, buf);
free(buf);
- /* FIXME: find a way to call virtio_error? */
+ /*
+ * FIXME:
+ * find a way to call virtio_error, or perhaps close the connection?
+ */
}
/* Translate guest physical address to our virtual address. */
vu_check_queue_msg_file(VuDev *dev, VhostUserMsg *vmsg)
{
int index = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
+ bool nofd = vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK;
if (index >= dev->max_queues) {
vmsg_close_fds(vmsg);
return false;
}
- if (vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK ||
- vmsg->fd_num != 1) {
+ if (nofd) {
+ vmsg_close_fds(vmsg);
+ return true;
+ }
+
+ if (vmsg->fd_num != 1) {
vmsg_close_fds(vmsg);
vu_panic(dev, "Invalid fds in request: %d", vmsg->request);
return false;
vu_set_vring_kick_exec(VuDev *dev, VhostUserMsg *vmsg)
{
int index = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
+ bool nofd = vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK;
DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64);
dev->vq[index].kick_fd = -1;
}
- dev->vq[index].kick_fd = vmsg->fds[0];
- DPRINT("Got kick_fd: %d for vq: %d\n", vmsg->fds[0], index);
+ dev->vq[index].kick_fd = nofd ? -1 : vmsg->fds[0];
+ DPRINT("Got kick_fd: %d for vq: %d\n", dev->vq[index].kick_fd, index);
dev->vq[index].started = true;
if (dev->iface->queue_set_started) {
vu_set_vring_call_exec(VuDev *dev, VhostUserMsg *vmsg)
{
int index = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
+ bool nofd = vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK;
DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64);
dev->vq[index].call_fd = -1;
}
- dev->vq[index].call_fd = vmsg->fds[0];
+ dev->vq[index].call_fd = nofd ? -1 : vmsg->fds[0];
/* in case of I/O hang after reconnecting */
- if (eventfd_write(vmsg->fds[0], 1)) {
+ if (dev->vq[index].call_fd != -1 && eventfd_write(vmsg->fds[0], 1)) {
return -1;
}
- DPRINT("Got call_fd: %d for vq: %d\n", vmsg->fds[0], index);
+ DPRINT("Got call_fd: %d for vq: %d\n", dev->vq[index].call_fd, index);
return false;
}
vu_set_vring_err_exec(VuDev *dev, VhostUserMsg *vmsg)
{
int index = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
+ bool nofd = vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK;
DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64);
dev->vq[index].err_fd = -1;
}
- dev->vq[index].err_fd = vmsg->fds[0];
+ dev->vq[index].err_fd = nofd ? -1 : vmsg->fds[0];
return false;
}
static bool
vu_get_protocol_features_exec(VuDev *dev, VhostUserMsg *vmsg)
{
+ /*
+ * Note that we support, but intentionally do not set,
+ * VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS. This means that
+ * a device implementation can return it in its callback
+ * (get_protocol_features) if it wants to use this for
+ * simulation, but it is otherwise not desirable (if even
+ * implemented by the master.)
+ */
uint64_t features = 1ULL << VHOST_USER_PROTOCOL_F_MQ |
1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD |
1ULL << VHOST_USER_PROTOCOL_F_SLAVE_REQ |
1ULL << VHOST_USER_PROTOCOL_F_HOST_NOTIFIER |
- 1ULL << VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD;
+ 1ULL << VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD |
+ 1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK;
if (have_userfault()) {
features |= 1ULL << VHOST_USER_PROTOCOL_F_PAGEFAULT;
dev->protocol_features = vmsg->payload.u64;
+ if (vu_has_protocol_feature(dev,
+ VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS) &&
+ (!vu_has_protocol_feature(dev, VHOST_USER_PROTOCOL_F_SLAVE_REQ) ||
+ !vu_has_protocol_feature(dev, VHOST_USER_PROTOCOL_F_REPLY_ACK))) {
+ /*
+ * The use case for using messages for kick/call is simulation, to make
+ * the kick and call synchronous. To actually get that behaviour, both
+ * of the other features are required.
+ * Theoretically, one could use only kick messages, or do them without
+ * having F_REPLY_ACK, but too many (possibly pending) messages on the
+ * socket will eventually cause the master to hang, to avoid this in
+ * scenarios where not desired enforce that the settings are in a way
+ * that actually enables the simulation case.
+ */
+ vu_panic(dev,
+ "F_IN_BAND_NOTIFICATIONS requires F_SLAVE_REQ && F_REPLY_ACK");
+ return false;
+ }
+
if (dev->iface->set_protocol_features) {
dev->iface->set_protocol_features(dev, features);
}
return false;
}
+/*
+ * Handle an in-band VHOST_USER_VRING_KICK message.
+ *
+ * The queue index is taken from the vring state payload.  An index that
+ * is not below dev->max_queues panics the device.  Otherwise the queue
+ * is marked started if it was not already (calling the device's
+ * queue_set_started callback, when one is provided), and then the
+ * queue's handler, if any, is invoked.
+ *
+ * Always returns false; the caller treats the return value as
+ * "a reply was requested by the handler".
+ */
+static bool
+vu_handle_vring_kick(VuDev *dev, VhostUserMsg *vmsg)
+{
+    unsigned int index = vmsg->payload.state.index;
+
+    if (index >= dev->max_queues) {
+        vu_panic(dev, "Invalid queue index: %u", index);
+        return false;
+    }
+
+    /* index is unsigned, so print it with %u rather than %d */
+    DPRINT("Got kick message: handler:%p idx:%u\n",
+           dev->vq[index].handler, index);
+
+    if (!dev->vq[index].started) {
+        dev->vq[index].started = true;
+
+        if (dev->iface->queue_set_started) {
+            dev->iface->queue_set_started(dev, index, true);
+        }
+    }
+
+    if (dev->vq[index].handler) {
+        dev->vq[index].handler(dev, index);
+    }
+
+    return false;
+}
+
static bool
vu_process_message(VuDev *dev, VhostUserMsg *vmsg)
{
return vu_get_inflight_fd(dev, vmsg);
case VHOST_USER_SET_INFLIGHT_FD:
return vu_set_inflight_fd(dev, vmsg);
+ case VHOST_USER_VRING_KICK:
+ return vu_handle_vring_kick(dev, vmsg);
default:
vmsg_close_fds(vmsg);
vu_panic(dev, "Unhandled request: %d", vmsg->request);
{
VhostUserMsg vmsg = { 0, };
int reply_requested;
- bool success = false;
+ bool need_reply, success = false;
if (!vu_message_read(dev, dev->sock, &vmsg)) {
goto end;
}
+ need_reply = vmsg.flags & VHOST_USER_NEED_REPLY_MASK;
+
reply_requested = vu_process_message(dev, &vmsg);
+ if (!reply_requested && need_reply) {
+ vmsg_set_reply_u64(&vmsg, 0);
+ reply_requested = 1;
+ }
+
if (!reply_requested) {
success = true;
goto end;
return !v || vring_need_event(vring_get_used_event(vq), new, old);
}
-void
-vu_queue_notify(VuDev *dev, VuVirtq *vq)
+static void _vu_queue_notify(VuDev *dev, VuVirtq *vq, bool sync)
{ /* Notify about @vq: send an in-band VHOST_USER_SLAVE_VRING_CALL when there is no call fd and the features allow it, otherwise write the call eventfd. */
if (unlikely(dev->broken) ||
unlikely(!vq->vring.avail)) {
return;
}
+ if (vq->call_fd < 0 && /* no call eventfd set: try the in-band message path */
+ vu_has_protocol_feature(dev,
+ VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS) &&
+ vu_has_protocol_feature(dev, VHOST_USER_PROTOCOL_F_SLAVE_REQ)) {
+ VhostUserMsg vmsg = {
+ .request = VHOST_USER_SLAVE_VRING_CALL,
+ .flags = VHOST_USER_VERSION,
+ .size = sizeof(vmsg.payload.state),
+ .payload.state = {
+ .index = vq - dev->vq, /* queue number = offset of vq within dev->vq */
+ },
+ };
+ bool ack = sync && /* a reply is only requested for the sync variant with REPLY_ACK negotiated */
+ vu_has_protocol_feature(dev,
+ VHOST_USER_PROTOCOL_F_REPLY_ACK);
+
+ if (ack) {
+ vmsg.flags |= VHOST_USER_NEED_REPLY_MASK;
+ }
+
+ vu_message_write(dev, dev->slave_fd, &vmsg); /* NOTE(review): return value is not checked */
+ if (ack) {
+ vu_message_read(dev, dev->slave_fd, &vmsg); /* presumably waits for the reply; result is not checked - TODO confirm */
+ }
+ return; /* in-band path taken: skip the eventfd write below */
+ }
+
if (eventfd_write(vq->call_fd, 1) < 0) { /* eventfd path: signal the call fd, panic on failure */
vu_panic(dev, "Error writing eventfd: %s", strerror(errno));
}
}
+void vu_queue_notify(VuDev *dev, VuVirtq *vq) /* notify without requesting a reply (sync = false) */
+{
+ _vu_queue_notify(dev, vq, false);
+}
+
+void vu_queue_notify_sync(VuDev *dev, VuVirtq *vq) /* notify with sync = true: the in-band path requests a reply when REPLY_ACK is negotiated */
+{
+ _vu_queue_notify(dev, vq, true);
+}
+
static inline void
vring_used_flags_set_bit(VuVirtq *vq, int mask)
{
VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD = 10,
VHOST_USER_PROTOCOL_F_HOST_NOTIFIER = 11,
VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD = 12,
+ VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS = 14,
VHOST_USER_PROTOCOL_F_MAX
};
VHOST_USER_GET_INFLIGHT_FD = 31,
VHOST_USER_SET_INFLIGHT_FD = 32,
VHOST_USER_GPU_SET_SOCKET = 33,
+ VHOST_USER_VRING_KICK = 35,
VHOST_USER_MAX
} VhostUserRequest;
VHOST_USER_SLAVE_IOTLB_MSG = 1,
VHOST_USER_SLAVE_CONFIG_CHANGE_MSG = 2,
VHOST_USER_SLAVE_VRING_HOST_NOTIFIER_MSG = 3,
+ VHOST_USER_SLAVE_VRING_CALL = 4,
+ VHOST_USER_SLAVE_VRING_ERR = 5,
VHOST_USER_SLAVE_MAX
} VhostUserSlaveRequest;
*/
void vu_queue_notify(VuDev *dev, VuVirtq *vq);
+/**
+ * vu_queue_notify_sync:
+ * @dev: a VuDev context
+ * @vq: a VuVirtq queue
+ *
+ * Request to notify the queue via callfd (skipped if unnecessary),
+ * or via an in-band message that requests a reply when possible.
+ */
+void vu_queue_notify_sync(VuDev *dev, VuVirtq *vq);
+
/**
* vu_queue_pop:
* @dev: a VuDev context
}
if (!started && vi->evsrc) {
- g_source_destroy(vi->evsrc);
+ vug_source_destroy(vi->evsrc);
vi->evsrc = NULL;
}
}
vug_deinit(&vi.dev);
- if (vi.evsrc) {
- g_source_unref(vi.evsrc);
- }
+ vug_source_destroy(vi.evsrc);
g_array_free(vi.config, TRUE);
g_free(vi.queue);
return 0;
tcg-plugins
bitops
reset
+ s390-dasd-ipl
--- /dev/null
+Booting from real channel-attached devices on s390x
+===================================================
+
+s390 hardware IPL
+-----------------
+
+The s390 hardware IPL process consists of the following steps.
+
+1. A READ IPL ccw is constructed in memory location ``0x0``.
+ This ccw, by definition, reads the IPL1 record which is located on the disk
+ at cylinder 0 track 0 record 1. Note that the chain flag is on in this ccw
+ so when it is complete another ccw will be fetched and executed from memory
+ location ``0x08``.
+
+2. Execute the Read IPL ccw at ``0x00``, thereby reading IPL1 data into ``0x00``.
+ IPL1 data is 24 bytes in length and consists of the following pieces of
+ information: ``[psw][read ccw][tic ccw]``. When the machine executes the Read
+ IPL ccw it causes the 24 bytes of IPL1 to be read into memory starting at
+ location ``0x0``. Then the ccw program at ``0x08`` which consists of a read
+ ccw and a tic ccw is automatically executed because of the chain flag from
+ the original READ IPL ccw. The read ccw will read the IPL2 data into memory
+ and the TIC (Transfer In Channel) will transfer control to the channel
+ program contained in the IPL2 data. The TIC channel command is the
+ equivalent of a branch/jump/goto instruction for channel programs.
+
+ NOTE: The ccws in IPL1 are defined by the architecture to be format 0.
+
+3. Execute IPL2.
+ The TIC ccw instruction at the end of the IPL1 channel program will begin
+ the execution of the IPL2 channel program. IPL2 is stage-2 of the boot
+ process and will contain a larger channel program than IPL1. The point of
+ IPL2 is to find and load either the operating system or a small program that
+ loads the operating system from disk. At the end of this step all or some of
+ the real operating system is loaded into memory and we are ready to hand
+ control over to the guest operating system. At this point the guest
+ operating system is entirely responsible for loading any more data it might
+ need to function.
+
+ NOTE: The IPL2 channel program might read data into memory
+ location ``0x0`` thereby overwriting the IPL1 psw and channel program. This is ok
+ as long as the data placed in location ``0x0`` contains a psw whose instruction
+ address points to the guest operating system code to execute at the end of
+ the IPL/boot process.
+
+ NOTE: The ccws in IPL2 are defined by the architecture to be format 0.
+
+4. Start executing the guest operating system.
+ The psw that was loaded into memory location ``0x0`` as part of the ipl process
+ should contain the needed flags for the operating system we have loaded. The
+ psw's instruction address will point to the location in memory where we want
+ to start executing the operating system. This psw is loaded (via LPSW
+ instruction) causing control to be passed to the operating system code.
+
+In a non-virtualized environment this process, handled entirely by the hardware,
+is kicked off by the user initiating a "Load" procedure from the hardware
+management console. This "Load" procedure crafts a special "Read IPL" ccw in
+memory location ``0x0`` that reads IPL1. It then executes this ccw thereby kicking
+off the reading of IPL1 data. Since the channel program from IPL1 will be
+written immediately after the special "Read IPL" ccw, the IPL1 channel program
+will be executed immediately (the special read ccw has the chaining bit turned
+on). The TIC at the end of the IPL1 channel program will cause the IPL2 channel
+program to be executed automatically. After this sequence completes the "Load"
+procedure then loads the psw from ``0x0``.
+
+How this all pertains to QEMU (and the kernel)
+----------------------------------------------
+
+In theory we should merely have to do the following to IPL/boot a guest
+operating system from a DASD device:
+
+1. Place a "Read IPL" ccw into memory location ``0x0`` with chaining bit on.
+2. Execute channel program at ``0x0``.
+3. LPSW ``0x0``.
+
+However, our emulation of the machine's channel program logic within the kernel
+is missing one key feature that is required for this process to work:
+non-prefetch of ccw data.
+
+When we start a channel program we pass the channel subsystem parameters via an
+ORB (Operation Request Block). One of those parameters is a prefetch bit. If the
+bit is on then the vfio-ccw kernel driver is allowed to read the entire channel
+program from guest memory before it starts executing it. This means that any
+channel commands that read additional channel commands will not work as expected
+because the newly read commands will only exist in guest memory and NOT within
+the kernel's channel subsystem memory. The kernel vfio-ccw driver currently
+requires this bit to be on for all channel programs. This is a problem because
+the IPL process consists of transferring control from the "Read IPL" ccw
+immediately to the IPL1 channel program that was read by "Read IPL".
+
+Not being able to turn off prefetch will also prevent the TIC at the end of the
+IPL1 channel program from transferring control to the IPL2 channel program.
+
+Lastly, in some cases (the zipl bootloader for example) the IPL2 program also
+transfers control to another channel program segment immediately after reading
+it from the disk. So we need to be able to handle this case.
+
+What QEMU does
+--------------
+
+Since we are forced to live with prefetch we cannot use the very simple IPL
+procedure we defined in the preceding section. So we compensate by doing the
+following.
+
+1. Place "Read IPL" ccw into memory location ``0x0``, but turn off chaining bit.
+2. Execute "Read IPL" at ``0x0``.
+
+ So now IPL1's psw is at ``0x0`` and IPL1's channel program is at ``0x08``.
+
+3. Write a custom channel program that will seek to the IPL2 record and then
+ execute the READ and TIC ccws from IPL1. Normally the seek is not required
+ because after reading the IPL1 record the disk is automatically positioned
+ to read the very next record which will be IPL2. But since we are not reading
+ both IPL1 and IPL2 as part of the same channel program we must manually set
+ the position.
+
+4. Grab the target address of the TIC instruction from the IPL1 channel program.
+ This address is where the IPL2 channel program starts.
+
+ Now IPL2 is loaded into memory somewhere, and we know the address.
+
+5. Execute the IPL2 channel program at the address obtained in step #4.
+
+ Because this channel program can be dynamic, we must use a special algorithm
+ that detects a READ immediately followed by a TIC and breaks the ccw chain
+ by turning off the chain bit in the READ ccw. When control is returned from
+ the kernel/hardware to the QEMU bios code we immediately issue another start
+ subchannel to execute the remaining TIC instruction. This causes the entire
+ channel program (starting from the TIC) and all needed data to be refetched
+ thereby stepping around the limitation that would otherwise prevent this
+ channel program from executing properly.
+
+ Now the operating system code is loaded somewhere in guest memory and the psw
+ in memory location ``0x0`` will point to entry code for the guest operating
+ system.
+
+6. LPSW ``0x0``
+
+ LPSW transfers control to the guest operating system and we're done.
+++ /dev/null
-*****************************
-***** s390 hardware IPL *****
-*****************************
-
-The s390 hardware IPL process consists of the following steps.
-
-1. A READ IPL ccw is constructed in memory location 0x0.
- This ccw, by definition, reads the IPL1 record which is located on the disk
- at cylinder 0 track 0 record 1. Note that the chain flag is on in this ccw
- so when it is complete another ccw will be fetched and executed from memory
- location 0x08.
-
-2. Execute the Read IPL ccw at 0x00, thereby reading IPL1 data into 0x00.
- IPL1 data is 24 bytes in length and consists of the following pieces of
- information: [psw][read ccw][tic ccw]. When the machine executes the Read
- IPL ccw it read the 24-bytes of IPL1 to be read into memory starting at
- location 0x0. Then the ccw program at 0x08 which consists of a read
- ccw and a tic ccw is automatically executed because of the chain flag from
- the original READ IPL ccw. The read ccw will read the IPL2 data into memory
- and the TIC (Transfer In Channel) will transfer control to the channel
- program contained in the IPL2 data. The TIC channel command is the
- equivalent of a branch/jump/goto instruction for channel programs.
- NOTE: The ccws in IPL1 are defined by the architecture to be format 0.
-
-3. Execute IPL2.
- The TIC ccw instruction at the end of the IPL1 channel program will begin
- the execution of the IPL2 channel program. IPL2 is stage-2 of the boot
- process and will contain a larger channel program than IPL1. The point of
- IPL2 is to find and load either the operating system or a small program that
- loads the operating system from disk. At the end of this step all or some of
- the real operating system is loaded into memory and we are ready to hand
- control over to the guest operating system. At this point the guest
- operating system is entirely responsible for loading any more data it might
- need to function. NOTE: The IPL2 channel program might read data into memory
- location 0 thereby overwriting the IPL1 psw and channel program. This is ok
- as long as the data placed in location 0 contains a psw whose instruction
- address points to the guest operating system code to execute at the end of
- the IPL/boot process.
- NOTE: The ccws in IPL2 are defined by the architecture to be format 0.
-
-4. Start executing the guest operating system.
- The psw that was loaded into memory location 0 as part of the ipl process
- should contain the needed flags for the operating system we have loaded. The
- psw's instruction address will point to the location in memory where we want
- to start executing the operating system. This psw is loaded (via LPSW
- instruction) causing control to be passed to the operating system code.
-
-In a non-virtualized environment this process, handled entirely by the hardware,
-is kicked off by the user initiating a "Load" procedure from the hardware
-management console. This "Load" procedure crafts a special "Read IPL" ccw in
-memory location 0x0 that reads IPL1. It then executes this ccw thereby kicking
-off the reading of IPL1 data. Since the channel program from IPL1 will be
-written immediately after the special "Read IPL" ccw, the IPL1 channel program
-will be executed immediately (the special read ccw has the chaining bit turned
-on). The TIC at the end of the IPL1 channel program will cause the IPL2 channel
-program to be executed automatically. After this sequence completes the "Load"
-procedure then loads the psw from 0x0.
-
-**********************************************************
-***** How this all pertains to QEMU (and the kernel) *****
-**********************************************************
-
-In theory we should merely have to do the following to IPL/boot a guest
-operating system from a DASD device:
-
-1. Place a "Read IPL" ccw into memory location 0x0 with chaining bit on.
-2. Execute channel program at 0x0.
-3. LPSW 0x0.
-
-However, our emulation of the machine's channel program logic within the kernel
-is missing one key feature that is required for this process to work:
-non-prefetch of ccw data.
-
-When we start a channel program we pass the channel subsystem parameters via an
-ORB (Operation Request Block). One of those parameters is a prefetch bit. If the
-bit is on then the vfio-ccw kernel driver is allowed to read the entire channel
-program from guest memory before it starts executing it. This means that any
-channel commands that read additional channel commands will not work as expected
-because the newly read commands will only exist in guest memory and NOT within
-the kernel's channel subsystem memory. The kernel vfio-ccw driver currently
-requires this bit to be on for all channel programs. This is a problem because
-the IPL process consists of transferring control from the "Read IPL" ccw
-immediately to the IPL1 channel program that was read by "Read IPL".
-
-Not being able to turn off prefetch will also prevent the TIC at the end of the
-IPL1 channel program from transferring control to the IPL2 channel program.
-
-Lastly, in some cases (the zipl bootloader for example) the IPL2 program also
-transfers control to another channel program segment immediately after reading
-it from the disk. So we need to be able to handle this case.
-
-**************************
-***** What QEMU does *****
-**************************
-
-Since we are forced to live with prefetch we cannot use the very simple IPL
-procedure we defined in the preceding section. So we compensate by doing the
-following.
-
-1. Place "Read IPL" ccw into memory location 0x0, but turn off chaining bit.
-2. Execute "Read IPL" at 0x0.
-
- So now IPL1's psw is at 0x0 and IPL1's channel program is at 0x08.
-
-4. Write a custom channel program that will seek to the IPL2 record and then
- execute the READ and TIC ccws from IPL1. Normally the seek is not required
- because after reading the IPL1 record the disk is automatically positioned
- to read the very next record which will be IPL2. But since we are not reading
- both IPL1 and IPL2 as part of the same channel program we must manually set
- the position.
-
-5. Grab the target address of the TIC instruction from the IPL1 channel program.
- This address is where the IPL2 channel program starts.
-
- Now IPL2 is loaded into memory somewhere, and we know the address.
-
-6. Execute the IPL2 channel program at the address obtained in step #5.
-
- Because this channel program can be dynamic, we must use a special algorithm
- that detects a READ immediately followed by a TIC and breaks the ccw chain
- by turning off the chain bit in the READ ccw. When control is returned from
- the kernel/hardware to the QEMU bios code we immediately issue another start
- subchannel to execute the remaining TIC instruction. This causes the entire
- channel program (starting from the TIC) and all needed data to be refetched
- thereby stepping around the limitation that would otherwise prevent this
- channel program from executing properly.
-
- Now the operating system code is loaded somewhere in guest memory and the psw
- in memory location 0x0 will point to entry code for the guest operating
- system.
-
-7. LPSW 0x0.
- LPSW transfers control to the guest operating system and we're done.
conceptions such as translation time and translation blocks the
details are opaque to plugins. The plugin is able to query select
details of instructions and system configuration only through the
-exported *qemu_plugin* functions. The types used to describe
-instructions and events are opaque to the plugins themselves.
+exported *qemu_plugin* functions.
+
+Query Handle Lifetime
+---------------------
+
+Each callback provides an opaque anonymous information handle which
+can usually be further queried to find out information about a
+translation, instruction or operation. The handles themselves are only
+valid during the lifetime of the callback so it is important that any
+information that is needed is extracted during the callback and saved
+by the plugin.
Usage
=====
Vhost-user Protocol
===================
:Copyright: 2014 Virtual Open Systems Sarl.
+:Copyright: 2019 Intel Corporation
:Licence: This work is licensed under the terms of the GNU GPL,
version 2 or later. See the COPYING file in the top-level
directory.
reply it will close the connection. An optional reconnection mechanism
can be implemented.
+If *slave* detects some error such as incompatible features, it may also
+close the connection. This should only happen in exceptional circumstances.
+
Any protocol extensions are gated by protocol feature bits, which
allows full backwards compatibility on both master and slave. As
older slaves don't support negotiating protocol features, a feature
Client must start ring upon receiving a kick (that is, detecting that
file descriptor is readable) on the descriptor specified by
-``VHOST_USER_SET_VRING_KICK``, and stop ring upon receiving
+``VHOST_USER_SET_VRING_KICK`` or receiving the in-band message
+``VHOST_USER_VRING_KICK`` if negotiated, and stop ring upon receiving
``VHOST_USER_GET_VRING_BASE``.
While processing the rings (whether they are enabled or not), client
#. Resubmit inflight ``DescStatePacked`` entries in order of their
counter value
+In-band notifications
+---------------------
+
+In some limited situations (e.g. for simulation) it is desirable to
+have the kick, call and error (if used) signals done via in-band
+messages instead of asynchronous eventfd notifications. This can be
+done by negotiating the ``VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS``
+protocol feature.
+
+Note that due to the fact that too many messages on the sockets can
+cause the sending application(s) to block, it is not advised to use
+this feature unless absolutely necessary. It is also considered an
+error to negotiate this feature without also negotiating
+``VHOST_USER_PROTOCOL_F_SLAVE_REQ`` and ``VHOST_USER_PROTOCOL_F_REPLY_ACK``;
+the former is necessary for getting a message channel from the slave
+to the master, while the latter needs to be used with the in-band
+notification messages to block until they are processed, both to avoid
+blocking later and for proper processing (at least in the simulation
+use case.) As it has no other way of signalling this error, the slave
+should close the connection as a response to a
+``VHOST_USER_SET_PROTOCOL_FEATURES`` message that sets the in-band
+notifications feature flag without the other two.
+
Protocol features
-----------------
.. code:: c
- #define VHOST_USER_PROTOCOL_F_MQ 0
- #define VHOST_USER_PROTOCOL_F_LOG_SHMFD 1
- #define VHOST_USER_PROTOCOL_F_RARP 2
- #define VHOST_USER_PROTOCOL_F_REPLY_ACK 3
- #define VHOST_USER_PROTOCOL_F_MTU 4
- #define VHOST_USER_PROTOCOL_F_SLAVE_REQ 5
- #define VHOST_USER_PROTOCOL_F_CROSS_ENDIAN 6
- #define VHOST_USER_PROTOCOL_F_CRYPTO_SESSION 7
- #define VHOST_USER_PROTOCOL_F_PAGEFAULT 8
- #define VHOST_USER_PROTOCOL_F_CONFIG 9
- #define VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD 10
- #define VHOST_USER_PROTOCOL_F_HOST_NOTIFIER 11
- #define VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD 12
- #define VHOST_USER_PROTOCOL_F_RESET_DEVICE 13
+ #define VHOST_USER_PROTOCOL_F_MQ 0
+ #define VHOST_USER_PROTOCOL_F_LOG_SHMFD 1
+ #define VHOST_USER_PROTOCOL_F_RARP 2
+ #define VHOST_USER_PROTOCOL_F_REPLY_ACK 3
+ #define VHOST_USER_PROTOCOL_F_MTU 4
+ #define VHOST_USER_PROTOCOL_F_SLAVE_REQ 5
+ #define VHOST_USER_PROTOCOL_F_CROSS_ENDIAN 6
+ #define VHOST_USER_PROTOCOL_F_CRYPTO_SESSION 7
+ #define VHOST_USER_PROTOCOL_F_PAGEFAULT 8
+ #define VHOST_USER_PROTOCOL_F_CONFIG 9
+ #define VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD 10
+ #define VHOST_USER_PROTOCOL_F_HOST_NOTIFIER 11
+ #define VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD 12
+ #define VHOST_USER_PROTOCOL_F_RESET_DEVICE 13
+ #define VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS 14
Master message types
--------------------
Bits (0-7) of the payload contain the vring index. Bit 8 is the
invalid FD flag. This flag is set when there is no file descriptor
in the ancillary data. This signals that polling should be used
- instead of waiting for a kick.
+ instead of waiting for the kick. Note that if the protocol feature
+ ``VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS`` has been negotiated
+ this message isn't necessary as the ring is also started on the
+ ``VHOST_USER_VRING_KICK`` message, it may however still be used to
+ set an event file descriptor (which will be preferred over the
+ message) or to enable polling.
``VHOST_USER_SET_VRING_CALL``
:id: 13
Bits (0-7) of the payload contain the vring index. Bit 8 is the
invalid FD flag. This flag is set when there is no file descriptor
in the ancillary data. This signals that polling will be used
- instead of waiting for the call.
+ instead of waiting for the call. Note that if the protocol features
+ ``VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS`` and
+ ``VHOST_USER_PROTOCOL_F_SLAVE_REQ`` have been negotiated this message
+ isn't necessary as the ``VHOST_USER_SLAVE_VRING_CALL`` message can be
+ used, it may however still be used to set an event file descriptor
+ or to enable polling.
``VHOST_USER_SET_VRING_ERR``
:id: 14
Bits (0-7) of the payload contain the vring index. Bit 8 is the
invalid FD flag. This flag is set when there is no file descriptor
- in the ancillary data.
+ in the ancillary data. Note that if the protocol features
+ ``VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS`` and
+ ``VHOST_USER_PROTOCOL_F_SLAVE_REQ`` have been negotiated this message
+ isn't necessary as the ``VHOST_USER_SLAVE_VRING_ERR`` message can be
+ used, it may however still be used to set an event file descriptor
+ (which will be preferred over the message).
``VHOST_USER_GET_QUEUE_NUM``
:id: 17
Only valid if the ``VHOST_USER_PROTOCOL_F_RESET_DEVICE`` protocol
feature is set by the backend.
+``VHOST_USER_VRING_KICK``
+ :id: 35
+ :equivalent ioctl: N/A
+ :slave payload: N/A
+ :master payload: vring state description
+
+ When the ``VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS`` protocol
+ feature has been successfully negotiated, this message may be
+ submitted by the master to indicate that a buffer was added to
+ the vring instead of signalling it using the vring's kick file
+ descriptor or having the slave rely on polling.
+
+ The state.num field is currently reserved and must be set to 0.
+
Slave message types
-------------------
``VHOST_USER_PROTOCOL_F_HOST_NOTIFIER`` protocol feature has been
successfully negotiated.
+``VHOST_USER_SLAVE_VRING_CALL``
+ :id: 4
+ :equivalent ioctl: N/A
+ :slave payload: vring state description
+ :master payload: N/A
+
+ When the ``VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS`` protocol
+ feature has been successfully negotiated, this message may be
+ submitted by the slave to indicate that a buffer was used from
+ the vring instead of signalling this using the vring's call file
+ descriptor or having the master rely on polling.
+
+ The state.num field is currently reserved and must be set to 0.
+
+``VHOST_USER_SLAVE_VRING_ERR``
+ :id: 5
+ :equivalent ioctl: N/A
+ :slave payload: vring state description
+ :master payload: N/A
+
+ When the ``VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS`` protocol
+ feature has been successfully negotiated, this message may be
+ submitted by the slave to indicate that an error occurred on the
+ specific vring, instead of signalling the error file descriptor
+ set by the master via ``VHOST_USER_SET_VRING_ERR``.
+
+ The state.num field is currently reserved and must be set to 0.
+
.. _reply_ack:
VHOST_USER_PROTOCOL_F_REPLY_ACK
register in QEMU
2: following writes to 'Command data' register set OST status
register in QEMU
+ 3: following reads from 'Command data' and 'Command data 2' return
+ architecture specific CPU ID value for currently selected CPU.
other values: reserved
[0x6-0x7] reserved
[0x8] Command data: (DWORD access)
:maxdepth: 2
qemu-block-drivers
+ vfio-ap
--- /dev/null
+Adjunct Processor (AP) Device
+=============================
+
+.. contents::
+
+Introduction
+------------
+
+The IBM Adjunct Processor (AP) Cryptographic Facility is comprised
+of three AP instructions and from 1 to 256 PCIe cryptographic adapter cards.
+These AP devices provide cryptographic functions to all CPUs assigned to a
+linux system running in an IBM Z system LPAR.
+
+On s390x, AP adapter cards are exposed via the AP bus. This document
+describes how those cards may be made available to KVM guests using the
+VFIO mediated device framework.
+
+AP Architectural Overview
+-------------------------
+
+In order to understand the terminology used in the rest of this document, let's
+start with some definitions:
+
+* AP adapter
+
+ An AP adapter is an IBM Z adapter card that can perform cryptographic
+ functions. There can be from 0 to 256 adapters assigned to an LPAR depending
+ on the machine model. Adapters assigned to the LPAR in which a linux host is
+ running will be available to the linux host. Each adapter is identified by a
+ number from 0 to 255; however, the maximum adapter number allowed is
+ determined by machine model. When installed, an AP adapter is accessed by
+ AP instructions executed by any CPU.
+
+* AP domain
+
+ An adapter is partitioned into domains. Each domain can be thought of as
+ a set of hardware registers for processing AP instructions. An adapter can
+ hold up to 256 domains; however, the maximum domain number allowed is
+ determined by machine model. Each domain is identified by a number from 0 to
+ 255. Domains can be further classified into two types:
+
+ * Usage domains are domains that can be accessed directly to process AP
+ commands
+
+ * Control domains are domains that are accessed indirectly by AP
+ commands sent to a usage domain to control or change the domain; for
+ example, to set a secure private key for the domain.
+
+* AP Queue
+
+ An AP queue is the means by which an AP command-request message is sent to an
+ AP usage domain inside a specific AP. An AP queue is identified by a tuple
+ comprised of an AP adapter ID (APID) and an AP queue index (APQI). The
+ APQI corresponds to a given usage domain number within the adapter. This tuple
+ forms an AP Queue Number (APQN) uniquely identifying an AP queue. AP
+ instructions include a field containing the APQN to identify the AP queue to
+ which the AP command-request message is to be sent for processing.
+
+* AP Instructions:
+
+ There are three AP instructions:
+
+ * NQAP: to enqueue an AP command-request message to a queue
+ * DQAP: to dequeue an AP command-reply message from a queue
+ * PQAP: to administer the queues
+
+ AP instructions identify the domain that is targeted to process the AP
+ command; this must be one of the usage domains. An AP command may modify a
+ domain that is not one of the usage domains, but the modified domain
+ must be one of the control domains.
+
+Start Interpretive Execution (SIE) Instruction
+----------------------------------------------
+
+A KVM guest is started by executing the Start Interpretive Execution (SIE)
+instruction. The SIE state description is a control block that contains the
+state information for a KVM guest and is supplied as input to the SIE
+instruction. The SIE state description contains a satellite control block called
+the Crypto Control Block (CRYCB). The CRYCB contains three fields to identify
+the adapters, usage domains and control domains assigned to the KVM guest:
+
+* The AP Mask (APM) field is a bit mask that identifies the AP adapters assigned
+ to the KVM guest. Each bit in the mask, from left to right, corresponds to
+ an APID from 0-255. If a bit is set, the corresponding adapter is valid for
+ use by the KVM guest.
+
+* The AP Queue Mask (AQM) field is a bit mask identifying the AP usage domains
+ assigned to the KVM guest. Each bit in the mask, from left to right,
+ corresponds to an AP queue index (APQI) from 0-255. If a bit is set, the
+ corresponding queue is valid for use by the KVM guest.
+
+* The AP Domain Mask (ADM) field is a bit mask that identifies the AP control domains
+ assigned to the KVM guest. The ADM bit mask controls which domains can be
+ changed by an AP command-request message sent to a usage domain from the
+ guest. Each bit in the mask, from left to right, corresponds to a domain from
+ 0-255. If a bit is set, the corresponding domain can be modified by an AP
+ command-request message sent to a usage domain.
+
+If you recall from the description of an AP Queue, AP instructions include
+an APQN to identify the AP adapter and AP queue to which an AP command-request
+message is to be sent (NQAP and PQAP instructions), or from which a
+command-reply message is to be received (DQAP instruction). The validity of an
+APQN is defined by the matrix calculated from the APM and AQM; it is the
+cross product of all assigned adapter numbers (APM) with all assigned queue
+indexes (AQM). For example, if adapters 1 and 2 and usage domains 5 and 6 are
+assigned to a guest, the APQNs (1,5), (1,6), (2,5) and (2,6) will be valid for
+the guest.
+
+The APQNs can provide secure key functionality - i.e., a private key is stored
+on the adapter card for each of its domains - so each APQN must be assigned to
+at most one guest or the linux host.
+
+Example 1: Valid configuration
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
++----------+--------+--------+
+| | Guest1 | Guest2 |
++==========+========+========+
+| adapters | 1, 2 | 1, 2 |
++----------+--------+--------+
+| domains | 5, 6 | 7 |
++----------+--------+--------+
+
+This is valid because both guests have a unique set of APQNs:
+
+* Guest1 has APQNs (1,5), (1,6), (2,5) and (2,6);
+* Guest2 has APQNs (1,7) and (2,7).
+
+Example 2: Valid configuration
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
++----------+--------+--------+
+| | Guest1 | Guest2 |
++==========+========+========+
+| adapters | 1, 2 | 3, 4 |
++----------+--------+--------+
+| domains | 5, 6 | 5, 6 |
++----------+--------+--------+
+
+This is also valid because both guests have a unique set of APQNs:
+
+* Guest1 has APQNs (1,5), (1,6), (2,5), (2,6);
+* Guest2 has APQNs (3,5), (3,6), (4,5), (4,6)
+
+Example 3: Invalid configuration
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
++----------+--------+--------+
+| | Guest1 | Guest2 |
++==========+========+========+
+| adapters | 1, 2 | 1 |
++----------+--------+--------+
+| domains | 5, 6 | 6, 7 |
++----------+--------+--------+
+
+This is an invalid configuration because both guests have access to
+APQN (1,6).
+
+AP Matrix Configuration on Linux Host
+-------------------------------------
+
+A linux system is a guest of the LPAR in which it is running and has access to
+the AP resources configured for the LPAR. The LPAR's AP matrix is
+configured via its Activation Profile which can be edited on the HMC. When the
+linux system is started, the AP bus will detect the AP devices assigned to the
+LPAR and create the following in sysfs::
+
+ /sys/bus/ap
+ ... [devices]
+ ...... xx.yyyy
+ ...... ...
+ ...... cardxx
+ ...... ...
+
+Where:
+
+``cardxx``
+ is AP adapter number xx (in hex)
+
+``xx.yyyy``
+ is an APQN with xx specifying the APID and yyyy specifying the APQI
+
+For example, if AP adapters 5 and 6 and domains 4, 71 (0x47), 171 (0xab) and
+255 (0xff) are configured for the LPAR, the sysfs representation on the linux
+host system would look like this::
+
+ /sys/bus/ap
+ ... [devices]
+ ...... 05.0004
+ ...... 05.0047
+ ...... 05.00ab
+ ...... 05.00ff
+ ...... 06.0004
+ ...... 06.0047
+ ...... 06.00ab
+ ...... 06.00ff
+ ...... card05
+ ...... card06
+
+A set of default device drivers are also created to control each type of AP
+device that can be assigned to the LPAR on which a linux host is running::
+
+ /sys/bus/ap
+ ... [drivers]
+ ...... [cex2acard] for Crypto Express 2/3 accelerator cards
+ ...... [cex2aqueue] for AP queues served by Crypto Express 2/3
+ accelerator cards
+ ...... [cex4card] for Crypto Express 4/5/6 accelerator and coprocessor
+ cards
+ ...... [cex4queue] for AP queues served by Crypto Express 4/5/6
+ accelerator and coprocessor cards
+ ...... [pcixcccard] for Crypto Express 2/3 coprocessor cards
+ ...... [pcixccqueue] for AP queues served by Crypto Express 2/3
+ coprocessor cards
+
+Binding AP devices to device drivers
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+There are two sysfs files that specify bitmasks marking a subset of the APQN
+range as 'usable by the default AP queue device drivers' or 'not usable by the
+default device drivers' and thus available for use by the alternate device
+driver(s). The sysfs locations of the masks are::
+
+ /sys/bus/ap/apmask
+ /sys/bus/ap/aqmask
+
+The ``apmask`` is a 256-bit mask that identifies a set of AP adapter IDs
+(APID). Each bit in the mask, from left to right (i.e., from most significant
+to least significant bit in big endian order), corresponds to an APID from
+0-255. If a bit is set, the APID is marked as usable only by the default AP
+queue device drivers; otherwise, the APID is usable by the vfio_ap
+device driver.
+
+The ``aqmask`` is a 256-bit mask that identifies a set of AP queue indexes
+(APQI). Each bit in the mask, from left to right (i.e., from most significant
+to least significant bit in big endian order), corresponds to an APQI from
+0-255. If a bit is set, the APQI is marked as usable only by the default AP
+queue device drivers; otherwise, the APQI is usable by the vfio_ap device
+driver.
+
+Take, for example, the following mask::
+
+ 0x7dffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
+
+It indicates:
+
+ 1, 2, 3, 4, 5, and 7-255 belong to the default drivers' pool, and 0 and 6
+ belong to the vfio_ap device driver's pool.
+
+The APQN of each AP queue device assigned to the linux host is checked by the
+AP bus against the set of APQNs derived from the cross product of APIDs
+and APQIs marked as usable only by the default AP queue device drivers. If a
+match is detected, only the default AP queue device drivers will be probed;
+otherwise, the vfio_ap device driver will be probed.
+
+By default, the two masks are set to reserve all APQNs for use by the default
+AP queue device drivers. There are two ways the default masks can be changed:
+
+ 1. The sysfs mask files can be edited by echoing a string into the
+ respective sysfs mask file in one of two formats:
+
+ * An absolute hex string starting with 0x - like "0x12345678" - sets
+ the mask. If the given string is shorter than the mask, it is padded
+ with 0s on the right; for example, specifying a mask value of 0x41 is
+ the same as specifying::
+
+ 0x4100000000000000000000000000000000000000000000000000000000000000
+
+ Keep in mind that the mask reads from left to right (i.e., most
+ significant to least significant bit in big endian order), so the mask
+ above identifies device numbers 1 and 7 (``01000001``).
+
+ If the string is longer than the mask, the operation is terminated with
+ an error (EINVAL).
+
+ * Individual bits in the mask can be switched on and off by specifying
+ each bit number to be switched in a comma separated list. Each bit
+ number string must be prepended with a plus (``+``) or minus (``-``) to indicate
+ the corresponding bit is to be switched on (``+``) or off (``-``). Some
+ valid values are::
+
+ "+0" switches bit 0 on
+ "-13" switches bit 13 off
+ "+0x41" switches bit 65 on
+ "-0xff" switches bit 255 off
+
+ The following example::
+
+ +0,-6,+0x47,-0xf0
+
+ Switches bits 0 and 71 (0x47) on
+ Switches bits 6 and 240 (0xf0) off
+
+ Note that the bits not specified in the list remain as they were before
+ the operation.
+
+ 2. The masks can also be changed at boot time via parameters on the kernel
+ command line like this::
+
+ ap.apmask=0xffff ap.aqmask=0x40
+
+ This would create the following masks:
+
+ apmask::
+
+ 0xffff000000000000000000000000000000000000000000000000000000000000
+
+ aqmask::
+
+ 0x4000000000000000000000000000000000000000000000000000000000000000
+
+ Resulting in these two pools::
+
+ default drivers pool: adapter 0-15, domain 1
+ alternate drivers pool: adapter 16-255, domains 0, 2-255
+
+Configuring an AP matrix for a linux guest
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The sysfs interfaces for configuring an AP matrix for a guest are built on the
+VFIO mediated device framework. To configure an AP matrix for a guest, a
+mediated matrix device must first be created for the ``/sys/devices/vfio_ap/matrix``
+device. When the vfio_ap device driver is loaded, it registers with the VFIO
+mediated device framework. When the driver registers, the sysfs interfaces for
+creating mediated matrix devices are created::
+
+ /sys/devices
+ ... [vfio_ap]
+ ......[matrix]
+ ......... [mdev_supported_types]
+ ............ [vfio_ap-passthrough]
+ ............... create
+ ............... [devices]
+
+A mediated AP matrix device is created by writing a UUID to the attribute file
+named ``create``, for example::
+
+ uuidgen > create
+
+or
+
+::
+
+ echo $uuid > create
+
+When a mediated AP matrix device is created, a sysfs directory named after
+the UUID is created in the ``devices`` subdirectory::
+
+ /sys/devices
+ ... [vfio_ap]
+ ......[matrix]
+ ......... [mdev_supported_types]
+ ............ [vfio_ap-passthrough]
+ ............... create
+ ............... [devices]
+ .................. [$uuid]
+
+There will also be three sets of attribute files created in the mediated
+matrix device's sysfs directory to configure an AP matrix for the
+KVM guest::
+
+ /sys/devices
+ ... [vfio_ap]
+ ......[matrix]
+ ......... [mdev_supported_types]
+ ............ [vfio_ap-passthrough]
+ ............... create
+ ............... [devices]
+ .................. [$uuid]
+ ..................... assign_adapter
+ ..................... assign_control_domain
+ ..................... assign_domain
+ ..................... matrix
+ ..................... unassign_adapter
+ ..................... unassign_control_domain
+ ..................... unassign_domain
+
+``assign_adapter``
+ To assign an AP adapter to the mediated matrix device, its APID is written
+ to the ``assign_adapter`` file. This may be done multiple times to assign more
+ than one adapter. The APID may be specified using conventional semantics
+ as a decimal, hexadecimal, or octal number. For example, to assign adapters
+ 4, 5 and 16 to a mediated matrix device in decimal, hexadecimal and octal
+ respectively::
+
+ echo 4 > assign_adapter
+ echo 0x5 > assign_adapter
+ echo 020 > assign_adapter
+
+ In order to successfully assign an adapter:
+
+ * The adapter number specified must represent a value from 0 up to the
+ maximum adapter number allowed by the machine model. If an adapter number
+ higher than the maximum is specified, the operation will terminate with
+ an error (ENODEV).
+
+ * All APQNs that can be derived from the adapter ID being assigned and the
+ IDs of the previously assigned domains must be bound to the vfio_ap device
+ driver. If no domains have yet been assigned, then there must be at least
+ one APQN with the specified APID bound to the vfio_ap driver. If no such
+ APQNs are bound to the driver, the operation will terminate with an
+ error (EADDRNOTAVAIL).
+
+ * No APQN that can be derived from the adapter ID and the IDs of the
+ previously assigned domains can be assigned to another mediated matrix
+ device. If an APQN is assigned to another mediated matrix device, the
+ operation will terminate with an error (EADDRINUSE).
+
+``unassign_adapter``
+ To unassign an AP adapter, its APID is written to the ``unassign_adapter``
+ file. This may also be done multiple times to unassign more than one adapter.
+
+``assign_domain``
+ To assign a usage domain, the domain number is written into the
+ ``assign_domain`` file. This may be done multiple times to assign more than one
+ usage domain. The domain number is specified using conventional semantics as
+ a decimal, hexadecimal, or octal number. For example, to assign usage domains
+ 4, 8, and 71 to a mediated matrix device in decimal, hexadecimal and octal
+ respectively::
+
+ echo 4 > assign_domain
+ echo 0x8 > assign_domain
+ echo 0107 > assign_domain
+
+ In order to successfully assign a domain:
+
+ * The domain number specified must represent a value from 0 up to the
+ maximum domain number allowed by the machine model. If a domain number
+ higher than the maximum is specified, the operation will terminate with
+ an error (ENODEV).
+
+ * All APQNs that can be derived from the domain ID being assigned and the IDs
+ of the previously assigned adapters must be bound to the vfio_ap device
+ driver. If no adapters have yet been assigned, then there must be at least
+ one APQN with the specified APQI bound to the vfio_ap driver. If no such
+ APQNs are bound to the driver, the operation will terminate with an
+ error (EADDRNOTAVAIL).
+
+ * No APQN that can be derived from the domain ID being assigned and the IDs
+ of the previously assigned adapters can be assigned to another mediated
+ matrix device. If an APQN is assigned to another mediated matrix device,
+ the operation will terminate with an error (EADDRINUSE).
+
+``unassign_domain``
+ To unassign a usage domain, the domain number is written into the
+ ``unassign_domain`` file. This may be done multiple times to unassign more than
+ one usage domain.
+
+``assign_control_domain``
+ To assign a control domain, the domain number is written into the
+ ``assign_control_domain`` file. This may be done multiple times to
+ assign more than one control domain. The domain number may be specified using
+ conventional semantics as a decimal, hexadecimal, or octal number. For
+ example, to assign control domains 4, 8, and 71 to a mediated matrix device
+ in decimal, hexadecimal and octal respectively::
+
+ echo 4 > assign_control_domain
+ echo 0x8 > assign_control_domain
+ echo 0107 > assign_control_domain
+
+ In order to successfully assign a control domain, the domain number
+ specified must represent a value from 0 up to the maximum domain number
+ allowed by the machine model. If a control domain number higher than the
+ maximum is specified, the operation will terminate with an error (ENODEV).
+
+``unassign_control_domain``
+ To unassign a control domain, the domain number is written into the
+ ``unassign_control_domain`` file. This may be done multiple times to unassign more than
+ one control domain.
+
+Notes: No changes to the AP matrix will be allowed while a guest using
+the mediated matrix device is running. Attempts to assign an adapter,
+domain or control domain will be rejected and an error (EBUSY) returned.
+
+Starting a Linux Guest Configured with an AP Matrix
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+To provide a mediated matrix device for use by a guest, the following option
+must be specified on the QEMU command line::
+
+ -device vfio-ap,sysfsdev=$path-to-mdev
+
+The sysfsdev parameter specifies the path to the mediated matrix device.
+There are a number of ways to specify this path::
+
+ /sys/devices/vfio_ap/matrix/$uuid
+ /sys/bus/mdev/devices/$uuid
+ /sys/bus/mdev/drivers/vfio_mdev/$uuid
+ /sys/devices/vfio_ap/matrix/mdev_supported_types/vfio_ap-passthrough/devices/$uuid
+
+When the linux guest is started, the guest will open the mediated
+matrix device's file descriptor to get information about the mediated matrix
+device. The ``vfio_ap`` device driver will update the APM, AQM, and ADM fields in
+the guest's CRYCB with the adapter, usage domain and control domains assigned
+via the mediated matrix device's sysfs attribute files. Programs running on the
+linux guest will then:
+
+1. Have direct access to the APQNs derived from the cross product of the AP
+ adapter numbers (APID) and queue indexes (APQI) specified in the APM and AQM
+ fields of the guest's CRYCB respectively. These APQNs identify the AP queues
+ that are valid for use by the guest; meaning, AP commands can be sent by the
+ guest to any of these queues for processing.
+
+2. Have authorization to process AP commands to change a control domain
+ identified in the ADM field of the guest's CRYCB. The AP command must be sent
+ to a valid APQN (see 1 above).
+
+CPU model features:
+
+Three CPU model features are available for controlling guest access to AP
+facilities:
+
+1. AP facilities feature
+
+ The AP facilities feature indicates that AP facilities are installed on the
+ guest. This feature will be exposed for use only if the AP facilities
+ are installed on the host system. The feature is s390-specific and is
+ represented as a parameter of the -cpu option on the QEMU command line::
+
+ qemu-system-s390x -cpu $model,ap=on|off
+
+ Where:
+
+ ``$model``
+ is the CPU model defined for the guest (defaults to the model of
+ the host system if not specified).
+
+ ``ap=on|off``
+ indicates whether AP facilities are installed (on) or not
+ (off). The default for CPU models zEC12 or newer
+ is ``ap=on``. AP facilities must be installed on the guest if a
+ vfio-ap device (``-device vfio-ap,sysfsdev=$path``) is configured
+ for the guest, or the guest will fail to start.
+
+2. Query Configuration Information (QCI) facility
+
+ The QCI facility is used by the AP bus running on the guest to query the
+ configuration of the AP facilities. This facility will be available
+ only if the QCI facility is installed on the host system. The feature is
+ s390-specific and is represented as a parameter of the -cpu option on the
+ QEMU command line::
+
+ qemu-system-s390x -cpu $model,apqci=on|off
+
+ Where:
+
+ ``$model``
+ is the CPU model defined for the guest
+
+ ``apqci=on|off``
+ indicates whether the QCI facility is installed (on) or
+ not (off). The default for CPU models zEC12 or newer
+ is ``apqci=on``; for older models, QCI will not be installed.
+
+ If QCI is installed (``apqci=on``) but AP facilities are not
+ (``ap=off``), an error message will be logged, but the guest
+ will be allowed to start. It makes no sense to have QCI
+ installed if the AP facilities are not; this is considered
+ an invalid configuration.
+
+ If the QCI facility is not installed, APQNs with an APQI
+ greater than 15 will not be detected by the AP bus
+ running on the guest.
+
+3. Adjunct Processor Facilities Test (APFT) facility
+
+ The APFT facility is used by the AP bus running on the guest to test the
+ AP facilities available for a given AP queue. This facility will be available
+ only if the APFT facility is installed on the host system. The feature is
+ s390-specific and is represented as a parameter of the -cpu option on the
+ QEMU command line::
+
+ qemu-system-s390x -cpu $model,apft=on|off
+
+ Where:
+
+ ``$model``
+ is the CPU model defined for the guest (defaults to the model of
+ the host system if not specified).
+
+ ``apft=on|off``
+ indicates whether the APFT facility is installed (on) or
+ not (off). The default for CPU models zEC12 and
+ newer is ``apft=on``; for older models, APFT will not be
+ installed.
+
+ If APFT is installed (``apft=on``) but AP facilities are not
+ (``ap=off``), an error message will be logged, but the guest
+ will be allowed to start. It makes no sense to have APFT
+ installed if the AP facilities are not; this is considered
+ an invalid configuration.
+
+ It also makes no sense to turn APFT off because the AP bus
+ running on the guest will not detect CEX4 and newer devices
+ without it. Since only CEX4 and newer devices are supported
+ for guest usage, no AP devices can be made accessible to a
+ guest started without APFT installed.
+
+Hot plug a vfio-ap device into a running guest
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Only one vfio-ap device can be attached to the virtual machine's ap-bus, so a
+vfio-ap device can be hot plugged if and only if no vfio-ap device is attached
+to the bus already, whether via the QEMU command line or a prior hot plug
+action.
+
+To hot plug a vfio-ap device, use the QEMU ``device_add`` command::
+
+ (qemu) device_add vfio-ap,sysfsdev="$path-to-mdev"
+
+Where the ``$path-to-mdev`` value specifies the absolute path to a mediated
+device to which AP resources to be used by the guest have been assigned.
+
+Note that on Linux guests, the AP devices will be created in the
+``/sys/bus/ap/devices`` directory when the AP bus subsequently performs its periodic
+scan, so there may be a short delay before the AP devices are accessible on the
+guest.
+
+The command will fail if:
+
+* A vfio-ap device has already been attached to the virtual machine's ap-bus.
+
+* The CPU model features for controlling guest access to AP facilities are not
+ enabled (see 'CPU model features' subsection in the previous section).
+
+Hot unplug a vfio-ap device from a running guest
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+A vfio-ap device can be unplugged from a running KVM guest if a vfio-ap device
+has been attached to the virtual machine's ap-bus via the QEMU command line
+or a prior hot plug action.
+
+To hot unplug a vfio-ap device, use the QEMU ``device_del`` command::
+
+ (qemu) device_del vfio-ap,sysfsdev="$path-to-mdev"
+
+Where ``$path-to-mdev`` is the same as the path specified when the vfio-ap
+device was attached to the virtual machine's ap-bus.
+
+On a Linux guest, the AP devices will be removed from the ``/sys/bus/ap/devices``
+directory on the guest when the AP bus subsequently performs its periodic scan,
+so there may be a short delay before the AP devices are no longer accessible by
+the guest.
+
+The command will fail if the ``$path-to-mdev`` specified on the ``device_del`` command
+does not match the value specified when the vfio-ap device was attached to
+the virtual machine's ap-bus.
+
+Example: Configure AP Matrices for Three Linux Guests
+-----------------------------------------------------
+
+Let's now provide an example to illustrate how KVM guests may be given
+access to AP facilities. For this example, we will show how to configure
+three guests such that executing the lszcrypt command on the guests would
+look like this:
+
+Guest1::
+
+ CARD.DOMAIN TYPE MODE
+ ------------------------------
+ 05 CEX5C CCA-Coproc
+ 05.0004 CEX5C CCA-Coproc
+ 05.00ab CEX5C CCA-Coproc
+ 06 CEX5A Accelerator
+ 06.0004 CEX5A Accelerator
+ 06.00ab CEX5A Accelerator
+
+Guest2::
+
+ CARD.DOMAIN TYPE MODE
+ ------------------------------
+ 05 CEX5A Accelerator
+ 05.0047 CEX5A Accelerator
+ 05.00ff CEX5A Accelerator
+
+Guest3::
+
+ CARD.DOMAIN TYPE MODE
+ ------------------------------
+ 06 CEX5A Accelerator
+ 06.0047 CEX5A Accelerator
+ 06.00ff CEX5A Accelerator
+
+These are the steps:
+
+1. Install the vfio_ap module on the linux host. The dependency chain for the
+ vfio_ap module is:
+
+ * iommu
+ * s390
+ * zcrypt
+ * vfio
+ * vfio_mdev
+ * vfio_mdev_device
+ * KVM
+
+ To build the vfio_ap module, the kernel build must be configured with the
+ following Kconfig elements selected:
+
+ * IOMMU_SUPPORT
+ * S390
+ * ZCRYPT
+ * S390_AP_IOMMU
+ * VFIO
+ * VFIO_MDEV
+ * VFIO_MDEV_DEVICE
+ * KVM
+
+ If using make menuconfig select the following to build the vfio_ap module::
+
+ -> Device Drivers
+ -> IOMMU Hardware Support
+ select S390 AP IOMMU Support
+ -> VFIO Non-Privileged userspace driver framework
+ -> Mediated device driver framework
+ -> VFIO driver for Mediated devices
+ -> I/O subsystem
+ -> VFIO support for AP devices
+
+2. Secure the AP queues to be used by the three guests so that the host can not
+ access them. To secure the AP queues 05.0004, 05.0047, 05.00ab, 05.00ff,
+ 06.0004, 06.0047, 06.00ab, and 06.00ff for use by the vfio_ap device driver,
+ the corresponding APQNs must be removed from the default queue drivers pool
+ as follows::
+
+ echo -5,-6 > /sys/bus/ap/apmask
+
+ echo -4,-0x47,-0xab,-0xff > /sys/bus/ap/aqmask
+
+ This will result in AP queues 05.0004, 05.0047, 05.00ab, 05.00ff, 06.0004,
+ 06.0047, 06.00ab, and 06.00ff getting bound to the vfio_ap device driver. The
+ sysfs directory for the vfio_ap device driver will now contain symbolic links
+ to the AP queue devices bound to it::
+
+ /sys/bus/ap
+ ... [drivers]
+ ...... [vfio_ap]
+ ......... [05.0004]
+ ......... [05.0047]
+ ......... [05.00ab]
+ ......... [05.00ff]
+ ......... [06.0004]
+ ......... [06.0047]
+ ......... [06.00ab]
+ ......... [06.00ff]
+
+ Keep in mind that only type 10 and newer adapters (i.e., CEX4 and later)
+ can be bound to the vfio_ap device driver. Older devices are deliberately
+ not supported: they will go out of service in the relatively near future
+ and there are few older systems on which to test them, so supporting them
+ would needlessly complicate the design and implementation.
+
+ The administrator, therefore, must take care to secure only AP queues that
+ can be bound to the vfio_ap device driver. The device type for a given AP
+ queue device can be read from the parent card's sysfs directory. For example,
+ to see the hardware type of the queue 05.0004::
+
+ cat /sys/bus/ap/devices/card05/hwtype
+
+ The hwtype must be 10 or higher (CEX4 or newer) in order to be bound to the
+ vfio_ap device driver.
+
+3. Create the mediated devices needed to configure the AP matrixes for the
+ three guests and to provide an interface to the vfio_ap driver for
+ use by the guests::
+
+ /sys/devices/vfio_ap/matrix/
+ ... [mdev_supported_types]
+ ...... [vfio_ap-passthrough] (passthrough mediated matrix device type)
+ ......... create
+ ......... [devices]
+
+ To create the mediated devices for the three guests::
+
+ uuidgen > create
+ uuidgen > create
+ uuidgen > create
+
+ or
+
+ ::
+
+ echo $uuid1 > create
+ echo $uuid2 > create
+ echo $uuid3 > create
+
+ This will create three mediated devices in the [devices] subdirectory, each
+ named after the UUID used to create it. We'll call them $uuid1, $uuid2 and
+ $uuid3; this is the sysfs directory structure after creation::
+
+ /sys/devices/vfio_ap/matrix/
+ ... [mdev_supported_types]
+ ...... [vfio_ap-passthrough]
+ ......... [devices]
+ ............ [$uuid1]
+ ............... assign_adapter
+ ............... assign_control_domain
+ ............... assign_domain
+ ............... matrix
+ ............... unassign_adapter
+ ............... unassign_control_domain
+ ............... unassign_domain
+
+ ............ [$uuid2]
+ ............... assign_adapter
+ ............... assign_control_domain
+ ............... assign_domain
+ ............... matrix
+ ............... unassign_adapter
+ ............... unassign_control_domain
+ ............... unassign_domain
+
+ ............ [$uuid3]
+ ............... assign_adapter
+ ............... assign_control_domain
+ ............... assign_domain
+ ............... matrix
+ ............... unassign_adapter
+ ............... unassign_control_domain
+ ............... unassign_domain
+
+4. The administrator now needs to configure the matrixes for the mediated
+ devices $uuid1 (for Guest1), $uuid2 (for Guest2) and $uuid3 (for Guest3).
+
+ This is how the matrix is configured for Guest1::
+
+ echo 5 > assign_adapter
+ echo 6 > assign_adapter
+ echo 4 > assign_domain
+ echo 0xab > assign_domain
+
+ Control domains can similarly be assigned using the assign_control_domain
+ sysfs file.
+
+ If a mistake is made configuring an adapter, domain or control domain,
+ you can use the ``unassign_xxx`` interfaces to unassign the adapter, domain or
+ control domain.
+
+ To display the matrix configuration for Guest1::
+
+ cat matrix
+
+ The output will display the APQNs in the format ``xx.yyyy``, where xx is
+ the adapter number and yyyy is the domain number. The output for Guest1
+ will look like this::
+
+ 05.0004
+ 05.00ab
+ 06.0004
+ 06.00ab
+
+ This is how the matrix is configured for Guest2::
+
+ echo 5 > assign_adapter
+ echo 0x47 > assign_domain
+ echo 0xff > assign_domain
+
+ This is how the matrix is configured for Guest3::
+
+ echo 6 > assign_adapter
+ echo 0x47 > assign_domain
+ echo 0xff > assign_domain
+
+5. Start Guest1::
+
+ /usr/bin/qemu-system-s390x ... -cpu host,ap=on,apqci=on,apft=on -device vfio-ap,sysfsdev=/sys/devices/vfio_ap/matrix/$uuid1 ...
+
+6. Start Guest2::
+
+ /usr/bin/qemu-system-s390x ... -cpu host,ap=on,apqci=on,apft=on -device vfio-ap,sysfsdev=/sys/devices/vfio_ap/matrix/$uuid2 ...
+
+7. Start Guest3::
+
+ /usr/bin/qemu-system-s390x ... -cpu host,ap=on,apqci=on,apft=on -device vfio-ap,sysfsdev=/sys/devices/vfio_ap/matrix/$uuid3 ...
+
+When the guest is shut down, the mediated matrix devices may be removed.
+
+Using our example again, to remove the mediated matrix device $uuid1::
+
+ /sys/devices/vfio_ap/matrix/
+ ... [mdev_supported_types]
+ ...... [vfio_ap-passthrough]
+ ......... [devices]
+ ............ [$uuid1]
+ ............... remove
+
+
+ echo 1 > remove
+
+This will remove all of the mdev matrix device's sysfs structures including
+the mdev device itself. To recreate and reconfigure the mdev matrix device,
+all of the steps starting with step 3 will have to be performed again. Note
+that the remove will fail if a guest using the mdev is still running.
+
+It is not necessary to remove an mdev matrix device, but one may want to
+remove it if no guest will use it during the remaining lifetime of the linux
+host. If the mdev matrix device is removed, one may want to also reconfigure
+the pool of adapters and queues reserved for use by the default drivers.
+
+Limitations
+-----------
+
+* The KVM/kernel interfaces do not provide a way to prevent restoring an APQN
+ to the default drivers pool of a queue that is still assigned to a mediated
+ device in use by a guest. It is incumbent upon the administrator to
+ ensure there is no mediated device in use by a guest to which the APQN is
+ assigned lest the host be given access to the private data of the AP queue
+ device, such as a private key configured specifically for the guest.
+
+* Dynamically assigning AP resources to or unassigning AP resources from a
+ mediated matrix device - see `Configuring an AP matrix for a linux guest`_
+ section above - while a running guest is using it is currently not supported.
+
+* Live guest migration is not supported for guests using AP devices. If a guest
+ is using AP devices, the vfio-ap device configured for the guest must be
+ unplugged before migrating the guest (see `Hot unplug a vfio-ap device from a
+ running guest`_ section above.)
+++ /dev/null
-Adjunct Processor (AP) Device
-=============================
-
-Contents:
-=========
-* Introduction
-* AP Architectural Overview
-* Start Interpretive Execution (SIE) Instruction
-* AP Matrix Configuration on Linux Host
-* Starting a Linux Guest Configured with an AP Matrix
-* Example: Configure AP Matrices for Three Linux Guests
-
-Introduction:
-============
-The IBM Adjunct Processor (AP) Cryptographic Facility is comprised
-of three AP instructions and from 1 to 256 PCIe cryptographic adapter cards.
-These AP devices provide cryptographic functions to all CPUs assigned to a
-linux system running in an IBM Z system LPAR.
-
-On s390x, AP adapter cards are exposed via the AP bus. This document
-describes how those cards may be made available to KVM guests using the
-VFIO mediated device framework.
-
-AP Architectural Overview:
-=========================
-In order understand the terminology used in the rest of this document, let's
-start with some definitions:
-
-* AP adapter
-
- An AP adapter is an IBM Z adapter card that can perform cryptographic
- functions. There can be from 0 to 256 adapters assigned to an LPAR depending
- on the machine model. Adapters assigned to the LPAR in which a linux host is
- running will be available to the linux host. Each adapter is identified by a
- number from 0 to 255; however, the maximum adapter number allowed is
- determined by machine model. When installed, an AP adapter is accessed by
- AP instructions executed by any CPU.
-
-* AP domain
-
- An adapter is partitioned into domains. Each domain can be thought of as
- a set of hardware registers for processing AP instructions. An adapter can
- hold up to 256 domains; however, the maximum domain number allowed is
- determined by machine model. Each domain is identified by a number from 0 to
- 255. Domains can be further classified into two types:
-
- * Usage domains are domains that can be accessed directly to process AP
- commands
-
- * Control domains are domains that are accessed indirectly by AP
- commands sent to a usage domain to control or change the domain; for
- example, to set a secure private key for the domain.
-
-* AP Queue
-
- An AP queue is the means by which an AP command-request message is sent to an
- AP usage domain inside a specific AP. An AP queue is identified by a tuple
- comprised of an AP adapter ID (APID) and an AP queue index (APQI). The
- APQI corresponds to a given usage domain number within the adapter. This tuple
- forms an AP Queue Number (APQN) uniquely identifying an AP queue. AP
- instructions include a field containing the APQN to identify the AP queue to
- which the AP command-request message is to be sent for processing.
-
-* AP Instructions:
-
- There are three AP instructions:
-
- * NQAP: to enqueue an AP command-request message to a queue
- * DQAP: to dequeue an AP command-reply message from a queue
- * PQAP: to administer the queues
-
- AP instructions identify the domain that is targeted to process the AP
- command; this must be one of the usage domains. An AP command may modify a
- domain that is not one of the usage domains, but the modified domain
- must be one of the control domains.
-
-Start Interpretive Execution (SIE) Instruction
-==============================================
-A KVM guest is started by executing the Start Interpretive Execution (SIE)
-instruction. The SIE state description is a control block that contains the
-state information for a KVM guest and is supplied as input to the SIE
-instruction. The SIE state description contains a satellite control block called
-the Crypto Control Block (CRYCB). The CRYCB contains three fields to identify
-the adapters, usage domains and control domains assigned to the KVM guest:
-
-* The AP Mask (APM) field is a bit mask that identifies the AP adapters assigned
- to the KVM guest. Each bit in the mask, from left to right, corresponds to
- an APID from 0-255. If a bit is set, the corresponding adapter is valid for
- use by the KVM guest.
-
-* The AP Queue Mask (AQM) field is a bit mask identifying the AP usage domains
- assigned to the KVM guest. Each bit in the mask, from left to right,
- corresponds to an AP queue index (APQI) from 0-255. If a bit is set, the
- corresponding queue is valid for use by the KVM guest.
-
-* The AP Domain Mask field is a bit mask that identifies the AP control domains
- assigned to the KVM guest. The ADM bit mask controls which domains can be
- changed by an AP command-request message sent to a usage domain from the
- guest. Each bit in the mask, from left to right, corresponds to a domain from
- 0-255. If a bit is set, the corresponding domain can be modified by an AP
- command-request message sent to a usage domain.
-
-If you recall from the description of an AP Queue, AP instructions include
-an APQN to identify the AP adapter and AP queue to which an AP command-request
-message is to be sent (NQAP and PQAP instructions), or from which a
-command-reply message is to be received (DQAP instruction). The validity of an
-APQN is defined by the matrix calculated from the APM and AQM; it is the
-cross product of all assigned adapter numbers (APM) with all assigned queue
-indexes (AQM). For example, if adapters 1 and 2 and usage domains 5 and 6 are
-assigned to a guest, the APQNs (1,5), (1,6), (2,5) and (2,6) will be valid for
-the guest.
-
-The APQNs can provide secure key functionality - i.e., a private key is stored
-on the adapter card for each of its domains - so each APQN must be assigned to
-at most one guest or the linux host.
-
- Example 1: Valid configuration:
- ------------------------------
- Guest1: adapters 1,2 domains 5,6
- Guest2: adapter 1,2 domain 7
-
- This is valid because both guests have a unique set of APQNs: Guest1 has
- APQNs (1,5), (1,6), (2,5) and (2,6); Guest2 has APQNs (1,7) and (2,7).
-
- Example 2: Valid configuration:
- ------------------------------
- Guest1: adapters 1,2 domains 5,6
- Guest2: adapters 3,4 domains 5,6
-
- This is also valid because both guests have a unique set of APQNs:
- Guest1 has APQNs (1,5), (1,6), (2,5), (2,6);
- Guest2 has APQNs (3,5), (3,6), (4,5), (4,6)
-
- Example 3: Invalid configuration:
- --------------------------------
- Guest1: adapters 1,2 domains 5,6
- Guest2: adapter 1 domains 6,7
-
- This is an invalid configuration because both guests have access to
- APQN (1,6).
-
-AP Matrix Configuration on Linux Host:
-=====================================
-A linux system is a guest of the LPAR in which it is running and has access to
-the AP resources configured for the LPAR. The LPAR's AP matrix is
-configured via its Activation Profile which can be edited on the HMC. When the
-linux system is started, the AP bus will detect the AP devices assigned to the
-LPAR and create the following in sysfs:
-
-/sys/bus/ap
-... [devices]
-...... xx.yyyy
-...... ...
-...... cardxx
-...... ...
-
-Where:
- cardxx is AP adapter number xx (in hex)
-....xx.yyyy is an APQN with xx specifying the APID and yyyy specifying the
- APQI
-
-For example, if AP adapters 5 and 6 and domains 4, 71 (0x47), 171 (0xab) and
-255 (0xff) are configured for the LPAR, the sysfs representation on the linux
-host system would look like this:
-
-/sys/bus/ap
-... [devices]
-...... 05.0004
-...... 05.0047
-...... 05.00ab
-...... 05.00ff
-...... 06.0004
-...... 06.0047
-...... 06.00ab
-...... 06.00ff
-...... card05
-...... card06
-
-A set of default device drivers are also created to control each type of AP
-device that can be assigned to the LPAR on which a linux host is running:
-
-/sys/bus/ap
-... [drivers]
-...... [cex2acard] for Crypto Express 2/3 accelerator cards
-...... [cex2aqueue] for AP queues served by Crypto Express 2/3
- accelerator cards
-...... [cex4card] for Crypto Express 4/5/6 accelerator and coprocessor
- cards
-...... [cex4queue] for AP queues served by Crypto Express 4/5/6
- accelerator and coprocessor cards
-...... [pcixcccard] for Crypto Express 2/3 coprocessor cards
-...... [pcixccqueue] for AP queues served by Crypto Express 2/3
- coprocessor cards
-
-Binding AP devices to device drivers
-------------------------------------
-There are two sysfs files that specify bitmasks marking a subset of the APQN
-range as 'usable by the default AP queue device drivers' or 'not usable by the
-default device drivers' and thus available for use by the alternate device
-driver(s). The sysfs locations of the masks are:
-
- /sys/bus/ap/apmask
- /sys/bus/ap/aqmask
-
- The 'apmask' is a 256-bit mask that identifies a set of AP adapter IDs
- (APID). Each bit in the mask, from left to right (i.e., from most significant
- to least significant bit in big endian order), corresponds to an APID from
- 0-255. If a bit is set, the APID is marked as usable only by the default AP
- queue device drivers; otherwise, the APID is usable by the vfio_ap
- device driver.
-
- The 'aqmask' is a 256-bit mask that identifies a set of AP queue indexes
- (APQI). Each bit in the mask, from left to right (i.e., from most significant
- to least significant bit in big endian order), corresponds to an APQI from
- 0-255. If a bit is set, the APQI is marked as usable only by the default AP
- queue device drivers; otherwise, the APQI is usable by the vfio_ap device
- driver.
-
- Take, for example, the following mask:
-
- 0x7dffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
-
- It indicates:
-
- 1, 2, 3, 4, 5, and 7-255 belong to the default drivers' pool, and 0 and 6
- belong to the vfio_ap device driver's pool.
-
- The APQN of each AP queue device assigned to the linux host is checked by the
- AP bus against the set of APQNs derived from the cross product of APIDs
- and APQIs marked as usable only by the default AP queue device drivers. If a
- match is detected, only the default AP queue device drivers will be probed;
- otherwise, the vfio_ap device driver will be probed.
-
- By default, the two masks are set to reserve all APQNs for use by the default
- AP queue device drivers. There are two ways the default masks can be changed:
-
- 1. The sysfs mask files can be edited by echoing a string into the
- respective sysfs mask file in one of two formats:
-
- * An absolute hex string starting with 0x - like "0x12345678" - sets
- the mask. If the given string is shorter than the mask, it is padded
- with 0s on the right; for example, specifying a mask value of 0x41 is
- the same as specifying:
-
- 0x4100000000000000000000000000000000000000000000000000000000000000
-
- Keep in mind that the mask reads from left to right (i.e., most
- significant to least significant bit in big endian order), so the mask
- above identifies device numbers 1 and 7 (01000001).
-
- If the string is longer than the mask, the operation is terminated with
- an error (EINVAL).
-
- * Individual bits in the mask can be switched on and off by specifying
- each bit number to be switched in a comma separated list. Each bit
- number string must be prepended with a ('+') or minus ('-') to indicate
- the corresponding bit is to be switched on ('+') or off ('-'). Some
- valid values are:
-
- "+0" switches bit 0 on
- "-13" switches bit 13 off
- "+0x41" switches bit 65 on
- "-0xff" switches bit 255 off
-
- The following example:
- +0,-6,+0x47,-0xf0
-
- Switches bits 0 and 71 (0x47) on
- Switches bits 6 and 240 (0xf0) off
-
- Note that the bits not specified in the list remain as they were before
- the operation.
-
- 2. The masks can also be changed at boot time via parameters on the kernel
- command line like this:
-
- ap.apmask=0xffff ap.aqmask=0x40
-
- This would create the following masks:
-
- apmask:
- 0xffff000000000000000000000000000000000000000000000000000000000000
-
- aqmask:
- 0x4000000000000000000000000000000000000000000000000000000000000000
-
- Resulting in these two pools:
-
- default drivers pool: adapter 0-15, domain 1
- alternate drivers pool: adapter 16-255, domains 0, 2-255
-
-Configuring an AP matrix for a linux guest.
-------------------------------------------
-The sysfs interfaces for configuring an AP matrix for a guest are built on the
-VFIO mediated device framework. To configure an AP matrix for a guest, a
-mediated matrix device must first be created for the /sys/devices/vfio_ap/matrix
-device. When the vfio_ap device driver is loaded, it registers with the VFIO
-mediated device framework. When the driver registers, the sysfs interfaces for
-creating mediated matrix devices is created:
-
-/sys/devices
-... [vfio_ap]
-......[matrix]
-......... [mdev_supported_types]
-............ [vfio_ap-passthrough]
-............... create
-............... [devices]
-
-A mediated AP matrix device is created by writing a UUID to the attribute file
-named 'create', for example:
-
- uuidgen > create
-
- or
-
- echo $uuid > create
-
-When a mediated AP matrix device is created, a sysfs directory named after
-the UUID is created in the 'devices' subdirectory:
-
-/sys/devices
-... [vfio_ap]
-......[matrix]
-......... [mdev_supported_types]
-............ [vfio_ap-passthrough]
-............... create
-............... [devices]
-.................. [$uuid]
-
-There will also be three sets of attribute files created in the mediated
-matrix device's sysfs directory to configure an AP matrix for the
-KVM guest:
-
-/sys/devices
-... [vfio_ap]
-......[matrix]
-......... [mdev_supported_types]
-............ [vfio_ap-passthrough]
-............... create
-............... [devices]
-.................. [$uuid]
-..................... assign_adapter
-..................... assign_control_domain
-..................... assign_domain
-..................... matrix
-..................... unassign_adapter
-..................... unassign_control_domain
-..................... unassign_domain
-
-assign_adapter
- To assign an AP adapter to the mediated matrix device, its APID is written
- to the 'assign_adapter' file. This may be done multiple times to assign more
- than one adapter. The APID may be specified using conventional semantics
- as a decimal, hexadecimal, or octal number. For example, to assign adapters
- 4, 5 and 16 to a mediated matrix device in decimal, hexadecimal and octal
- respectively:
-
- echo 4 > assign_adapter
- echo 0x5 > assign_adapter
- echo 020 > assign_adapter
-
- In order to successfully assign an adapter:
-
- * The adapter number specified must represent a value from 0 up to the
- maximum adapter number allowed by the machine model. If an adapter number
- higher than the maximum is specified, the operation will terminate with
- an error (ENODEV).
-
- * All APQNs that can be derived from the adapter ID being assigned and the
- IDs of the previously assigned domains must be bound to the vfio_ap device
- driver. If no domains have yet been assigned, then there must be at least
- one APQN with the specified APID bound to the vfio_ap driver. If no such
- APQNs are bound to the driver, the operation will terminate with an
- error (EADDRNOTAVAIL).
-
- No APQN that can be derived from the adapter ID and the IDs of the
- previously assigned domains can be assigned to another mediated matrix
- device. If an APQN is assigned to another mediated matrix device, the
- operation will terminate with an error (EADDRINUSE).
-
-unassign_adapter
- To unassign an AP adapter, its APID is written to the 'unassign_adapter'
- file. This may also be done multiple times to unassign more than one adapter.
-
-assign_domain
- To assign a usage domain, the domain number is written into the
- 'assign_domain' file. This may be done multiple times to assign more than one
- usage domain. The domain number is specified using conventional semantics as
- a decimal, hexadecimal, or octal number. For example, to assign usage domains
- 4, 8, and 71 to a mediated matrix device in decimal, hexadecimal and octal
- respectively:
-
- echo 4 > assign_domain
- echo 0x8 > assign_domain
- echo 0107 > assign_domain
-
- In order to successfully assign a domain:
-
- * The domain number specified must represent a value from 0 up to the
- maximum domain number allowed by the machine model. If a domain number
- higher than the maximum is specified, the operation will terminate with
- an error (ENODEV).
-
- * All APQNs that can be derived from the domain ID being assigned and the IDs
- of the previously assigned adapters must be bound to the vfio_ap device
- driver. If no domains have yet been assigned, then there must be at least
- one APQN with the specified APQI bound to the vfio_ap driver. If no such
- APQNs are bound to the driver, the operation will terminate with an
- error (EADDRNOTAVAIL).
-
- No APQN that can be derived from the domain ID being assigned and the IDs
- of the previously assigned adapters can be assigned to another mediated
- matrix device. If an APQN is assigned to another mediated matrix device,
- the operation will terminate with an error (EADDRINUSE).
-
-unassign_domain
- To unassign a usage domain, the domain number is written into the
- 'unassign_domain' file. This may be done multiple times to unassign more than
- one usage domain.
-
-assign_control_domain
- To assign a control domain, the domain number is written into the
- 'assign_control_domain' file. This may be done multiple times to
- assign more than one control domain. The domain number may be specified using
- conventional semantics as a decimal, hexadecimal, or octal number. For
- example, to assign control domains 4, 8, and 71 to a mediated matrix device
- in decimal, hexadecimal and octal respectively:
-
- echo 4 > assign_domain
- echo 0x8 > assign_domain
- echo 0107 > assign_domain
-
- In order to successfully assign a control domain, the domain number
- specified must represent a value from 0 up to the maximum domain number
- allowed by the machine model. If a control domain number higher than the
- maximum is specified, the operation will terminate with an error (ENODEV).
-
-unassign_control_domain
- To unassign a control domain, the domain number is written into the
- 'unassign_domain' file. This may be done multiple times to unassign more than
- one control domain.
-
-Notes: No changes to the AP matrix will be allowed while a guest using
-the mediated matrix device is running. Attempts to assign an adapter,
-domain or control domain will be rejected and an error (EBUSY) returned.
-
-Starting a Linux Guest Configured with an AP Matrix:
-===================================================
-To provide a mediated matrix device for use by a guest, the following option
-must be specified on the QEMU command line:
-
- -device vfio_ap,sysfsdev=$path-to-mdev
-
-The sysfsdev parameter specifies the path to the mediated matrix device.
-There are a number of ways to specify this path:
-
-/sys/devices/vfio_ap/matrix/$uuid
-/sys/bus/mdev/devices/$uuid
-/sys/bus/mdev/drivers/vfio_mdev/$uuid
-/sys/devices/vfio_ap/matrix/mdev_supported_types/vfio_ap-passthrough/devices/$uuid
-
-When the linux guest is started, the guest will open the mediated
-matrix device's file descriptor to get information about the mediated matrix
-device. The vfio_ap device driver will update the APM, AQM, and ADM fields in
-the guest's CRYCB with the adapter, usage domain and control domains assigned
-via the mediated matrix device's sysfs attribute files. Programs running on the
-linux guest will then:
-
-1. Have direct access to the APQNs derived from the cross product of the AP
- adapter numbers (APID) and queue indexes (APQI) specified in the APM and AQM
- fields of the guests's CRYCB respectively. These APQNs identify the AP queues
- that are valid for use by the guest; meaning, AP commands can be sent by the
- guest to any of these queues for processing.
-
-2. Have authorization to process AP commands to change a control domain
- identified in the ADM field of the guest's CRYCB. The AP command must be sent
- to a valid APQN (see 1 above).
-
-CPU model features:
-
-Three CPU model features are available for controlling guest access to AP
-facilities:
-
-1. AP facilities feature
-
- The AP facilities feature indicates that AP facilities are installed on the
- guest. This feature will be exposed for use only if the AP facilities
- are installed on the host system. The feature is s390-specific and is
- represented as a parameter of the -cpu option on the QEMU command line:
-
- qemu-system-s390x -cpu $model,ap=on|off
-
- Where:
-
- $model is the CPU model defined for the guest (defaults to the model of
- the host system if not specified).
-
- ap=on|off indicates whether AP facilities are installed (on) or not
- (off). The default for CPU models zEC12 or newer
- is ap=on. AP facilities must be installed on the guest if a
- vfio-ap device (-device vfio-ap,sysfsdev=$path) is configured
- for the guest, or the guest will fail to start.
-
-2. Query Configuration Information (QCI) facility
-
- The QCI facility is used by the AP bus running on the guest to query the
- configuration of the AP facilities. This facility will be available
- only if the QCI facility is installed on the host system. The feature is
- s390-specific and is represented as a parameter of the -cpu option on the
- QEMU command line:
-
- qemu-system-s390x -cpu $model,apqci=on|off
-
- Where:
-
- $model is the CPU model defined for the guest
-
- apqci=on|off indicates whether the QCI facility is installed (on) or
- not (off). The default for CPU models zEC12 or newer
- is apqci=on; for older models, QCI will not be installed.
-
- If QCI is installed (apqci=on) but AP facilities are not
- (ap=off), an error message will be logged, but the guest
- will be allowed to start. It makes no sense to have QCI
- installed if the AP facilities are not; this is considered
- an invalid configuration.
-
- If the QCI facility is not installed, APQNs with an APQI
- greater than 15 will not be detected by the AP bus
- running on the guest.
-
-3. Adjunct Process Facility Test (APFT) facility
-
- The APFT facility is used by the AP bus running on the guest to test the
- AP facilities available for a given AP queue. This facility will be available
- only if the APFT facility is installed on the host system. The feature is
- s390-specific and is represented as a parameter of the -cpu option on the
- QEMU command line:
-
- qemu-system-s390x -cpu $model,apft=on|off
-
- Where:
-
- $model is the CPU model defined for the guest (defaults to the model of
- the host system if not specified).
-
- apft=on|off indicates whether the APFT facility is installed (on) or
- not (off). The default for CPU models zEC12 and
- newer is apft=on for older models, APFT will not be
- installed.
-
- If APFT is installed (apft=on) but AP facilities are not
- (ap=off), an error message will be logged, but the guest
- will be allowed to start. It makes no sense to have APFT
- installed if the AP facilities are not; this is considered
- an invalid configuration.
-
- It also makes no sense to turn APFT off because the AP bus
- running on the guest will not detect CEX4 and newer devices
- without it. Since only CEX4 and newer devices are supported
- for guest usage, no AP devices can be made accessible to a
- guest started without APFT installed.
-
-Hot plug a vfio-ap device into a running guest:
-==============================================
-Only one vfio-ap device can be attached to the virtual machine's ap-bus, so a
-vfio-ap device can be hot plugged if and only if no vfio-ap device is attached
-to the bus already, whether via the QEMU command line or a prior hot plug
-action.
-
-To hot plug a vfio-ap device, use the QEMU device_add command:
-
- (qemu) device_add vfio-ap,sysfsdev="$path-to-mdev"
-
- Where the '$path-to-mdev' value specifies the absolute path to a mediated
- device to which AP resources to be used by the guest have been assigned.
-
-Note that on Linux guests, the AP devices will be created in the
-/sys/bus/ap/devices directory when the AP bus subsequently performs its periodic
-scan, so there may be a short delay before the AP devices are accessible on the
-guest.
-
-The command will fail if:
-
-* A vfio-ap device has already been attached to the virtual machine's ap-bus.
-
-* The CPU model features for controlling guest access to AP facilities are not
- enabled (see 'CPU model features' subsection in the previous section).
-
-Hot unplug a vfio-ap device from a running guest:
-================================================
-A vfio-ap device can be unplugged from a running KVM guest if a vfio-ap device
-has been attached to the virtual machine's ap-bus via the QEMU command line
-or a prior hot plug action.
-
-To hot unplug a vfio-ap device, use the QEMU device_del command:
-
- (qemu) device_del vfio-ap,sysfsdev="$path-to-mdev"
-
- Where $path-to-mdev is the same as the path specified when the vfio-ap
- device was attached to the virtual machine's ap-bus.
-
-On a Linux guest, the AP devices will be removed from the /sys/bus/ap/devices
-directory on the guest when the AP bus subsequently performs its periodic scan,
-so there may be a short delay before the AP devices are no longer accessible by
-the guest.
-
-The command will fail if the $path-to-mdev specified on the device_del command
-does not match the value specified when the vfio-ap device was attached to
-the virtual machine's ap-bus.
-
-Example: Configure AP Matrixes for Three Linux Guests:
-=====================================================
-Let's now provide an example to illustrate how KVM guests may be given
-access to AP facilities. For this example, we will show how to configure
-three guests such that executing the lszcrypt command on the guests would
-look like this:
-
-Guest1
-------
-CARD.DOMAIN TYPE MODE
-------------------------------
-05 CEX5C CCA-Coproc
-05.0004 CEX5C CCA-Coproc
-05.00ab CEX5C CCA-Coproc
-06 CEX5A Accelerator
-06.0004 CEX5A Accelerator
-06.00ab CEX5C CCA-Coproc
-
-Guest2
-------
-CARD.DOMAIN TYPE MODE
-------------------------------
-05 CEX5A Accelerator
-05.0047 CEX5A Accelerator
-05.00ff CEX5A Accelerator
-
-Guest3
-------
-CARD.DOMAIN TYPE MODE
-------------------------------
-06 CEX5A Accelerator
-06.0047 CEX5A Accelerator
-06.00ff CEX5A Accelerator
-
-These are the steps:
-
-1. Install the vfio_ap module on the linux host. The dependency chain for the
- vfio_ap module is:
- * iommu
- * s390
- * zcrypt
- * vfio
- * vfio_mdev
- * vfio_mdev_device
- * KVM
-
- To build the vfio_ap module, the kernel build must be configured with the
- following Kconfig elements selected:
- * IOMMU_SUPPORT
- * S390
- * ZCRYPT
- * S390_AP_IOMMU
- * VFIO
- * VFIO_MDEV
- * VFIO_MDEV_DEVICE
- * KVM
-
- If using make menuconfig select the following to build the vfio_ap module:
- -> Device Drivers
- -> IOMMU Hardware Support
- select S390 AP IOMMU Support
- -> VFIO Non-Privileged userspace driver framework
- -> Mediated device driver framework
- -> VFIO driver for Mediated devices
- -> I/O subsystem
- -> VFIO support for AP devices
-
-2. Secure the AP queues to be used by the three guests so that the host can not
- access them. To secure the AP queues 05.0004, 05.0047, 05.00ab, 05.00ff,
- 06.0004, 06.0047, 06.00ab, and 06.00ff for use by the vfio_ap device driver,
- the corresponding APQNs must be removed from the default queue drivers pool
- as follows:
-
- echo -5,-6 > /sys/bus/ap/apmask
-
- echo -4,-0x47,-0xab,-0xff > /sys/bus/ap/aqmask
-
- This will result in AP queues 05.0004, 05.0047, 05.00ab, 05.00ff, 06.0004,
- 06.0047, 06.00ab, and 06.00ff getting bound to the vfio_ap device driver. The
- sysfs directory for the vfio_ap device driver will now contain symbolic links
- to the AP queue devices bound to it:
-
- /sys/bus/ap
- ... [drivers]
- ...... [vfio_ap]
- ......... [05.0004]
- ......... [05.0047]
- ......... [05.00ab]
- ......... [05.00ff]
- ......... [06.0004]
- ......... [06.0047]
- ......... [06.00ab]
- ......... [06.00ff]
-
- Keep in mind that only type 10 and newer adapters (i.e., CEX4 and later)
- can be bound to the vfio_ap device driver. The reason for this is to
- simplify the implementation by not needlessly complicating the design by
- supporting older devices that will go out of service in the relatively near
- future, and for which there are few older systems on which to test.
-
- The administrator, therefore, must take care to secure only AP queues that
- can be bound to the vfio_ap device driver. The device type for a given AP
- queue device can be read from the parent card's sysfs directory. For example,
- to see the hardware type of the queue 05.0004:
-
- cat /sys/bus/ap/devices/card05/hwtype
-
- The hwtype must be 10 or higher (CEX4 or newer) in order to be bound to the
- vfio_ap device driver.
-
-3. Create the mediated devices needed to configure the AP matrixes for the
- three guests and to provide an interface to the vfio_ap driver for
- use by the guests:
-
- /sys/devices/vfio_ap/matrix/
- --- [mdev_supported_types]
- ------ [vfio_ap-passthrough] (passthrough mediated matrix device type)
- --------- create
- --------- [devices]
-
- To create the mediated devices for the three guests:
-
- uuidgen > create
- uuidgen > create
- uuidgen > create
-
- or
-
- echo $uuid1 > create
- echo $uuid2 > create
- echo $uuid3 > create
-
- This will create three mediated devices in the [devices] subdirectory named
- after the UUID used to create the mediated device. We'll call them $uuid1,
- $uuid2 and $uuid3 and this is the sysfs directory structure after creation:
-
- /sys/devices/vfio_ap/matrix/
- --- [mdev_supported_types]
- ------ [vfio_ap-passthrough]
- --------- [devices]
- ------------ [$uuid1]
- --------------- assign_adapter
- --------------- assign_control_domain
- --------------- assign_domain
- --------------- matrix
- --------------- unassign_adapter
- --------------- unassign_control_domain
- --------------- unassign_domain
-
- ------------ [$uuid2]
- --------------- assign_adapter
- --------------- assign_control_domain
- --------------- assign_domain
- --------------- matrix
- --------------- unassign_adapter
- --------------- unassign_control_domain
- --------------- unassign_domain
-
- ------------ [$uuid3]
- --------------- assign_adapter
- --------------- assign_control_domain
- --------------- assign_domain
- --------------- matrix
- --------------- unassign_adapter
- --------------- unassign_control_domain
- --------------- unassign_domain
-
-4. The administrator now needs to configure the matrixes for the mediated
- devices $uuid1 (for Guest1), $uuid2 (for Guest2) and $uuid3 (for Guest3).
-
- This is how the matrix is configured for Guest1:
-
- echo 5 > assign_adapter
- echo 6 > assign_adapter
- echo 4 > assign_domain
- echo 0xab > assign_domain
-
- Control domains can similarly be assigned using the assign_control_domain
- sysfs file.
-
- If a mistake is made configuring an adapter, domain or control domain,
- you can use the unassign_xxx interfaces to unassign the adapter, domain or
- control domain.
-
- To display the matrix configuration for Guest1:
-
- cat matrix
-
- The output will display the APQNs in the format xx.yyyy, where xx is
- the adapter number and yyyy is the domain number. The output for Guest1
- will look like this:
-
- 05.0004
- 05.00ab
- 06.0004
- 06.00ab
-
- This is how the matrix is configured for Guest2:
-
- echo 5 > assign_adapter
- echo 0x47 > assign_domain
- echo 0xff > assign_domain
-
- This is how the matrix is configured for Guest3:
-
- echo 6 > assign_adapter
- echo 0x47 > assign_domain
- echo 0xff > assign_domain
-
-5. Start Guest1:
-
- /usr/bin/qemu-system-s390x ... -cpu host,ap=on,apqci=on,apft=on \
- -device vfio-ap,sysfsdev=/sys/devices/vfio_ap/matrix/$uuid1 ...
-
-6. Start Guest2:
-
- /usr/bin/qemu-system-s390x ... -cpu host,ap=on,apqci=on,apft=on \
- -device vfio-ap,sysfsdev=/sys/devices/vfio_ap/matrix/$uuid2 ...
-
-7. Start Guest3:
-
- /usr/bin/qemu-system-s390x ... -cpu host,ap=on,apqci=on,apft=on \
- -device vfio-ap,sysfsdev=/sys/devices/vfio_ap/matrix/$uuid3 ...
-
-When the guest is shut down, the mediated matrix devices may be removed.
-
-Using our example again, to remove the mediated matrix device $uuid1:
-
- /sys/devices/vfio_ap/matrix/
- --- [mdev_supported_types]
- ------ [vfio_ap-passthrough]
- --------- [devices]
- ------------ [$uuid1]
- --------------- remove
-
-
- echo 1 > remove
-
- This will remove all of the mdev matrix device's sysfs structures including
- the mdev device itself. To recreate and reconfigure the mdev matrix device,
- all of the steps starting with step 3 will have to be performed again. Note
- that the remove will fail if a guest using the mdev is still running.
-
- It is not necessary to remove an mdev matrix device, but one may want to
- remove it if no guest will use it during the remaining lifetime of the linux
- host. If the mdev matrix device is removed, one may want to also reconfigure
- the pool of adapters and queues reserved for use by the default drivers.
-
-Limitations
-===========
-* The KVM/kernel interfaces do not provide a way to prevent restoring an APQN
- to the default drivers pool of a queue that is still assigned to a mediated
- device in use by a guest. It is incumbent upon the administrator to
- ensure there is no mediated device in use by a guest to which the APQN is
- assigned lest the host be given access to the private data of the AP queue
- device, such as a private key configured specifically for the guest.
-
-* Dynamically assigning AP resources to or unassigning AP resources from a
- mediated matrix device - see 'Configuring an AP matrix for a linux guest'
- section above - while a running guest is using it is currently not supported.
-
-* Live guest migration is not supported for guests using AP devices. If a guest
- is using AP devices, the vfio-ap device configured for the guest must be
- unplugged before migrating the guest (see 'Hot unplug a vfio-ap device from a
- running guest' section above).
*/
void qemu_ram_writeback(RAMBlock *block, ram_addr_t start, ram_addr_t length)
{
- void *addr = ramblock_ptr(block, start);
-
/* The requested range should fit in within the block range */
g_assert((start + length) <= block->used_length);
#ifdef CONFIG_LIBPMEM
/* The lack of support for pmem should not block the sync */
if (ramblock_is_pmem(block)) {
+ void *addr = ramblock_ptr(block, start);
pmem_persist(addr, length);
return;
}
* specified as persistent (or is not one) - use the msync.
* Less optimal but still achieves the same goal
*/
+ void *addr = ramblock_ptr(block, start);
if (qemu_msync(addr, length, block->fd)) {
warn_report("%s: failed to sync memory range: start: "
RAM_ADDR_FMT " length: " RAM_ADDR_FMT,
#include "qemu-common.h"
#include "qemu/units.h"
#include "qemu/option.h"
+#include "monitor/qdev.h"
#include "qapi/error.h"
#include "hw/sysbus.h"
#include "hw/boards.h"
#include "qemu/error-report.h"
#include "qemu/module.h"
#include "hw/pci-host/gpex.h"
+#include "hw/virtio/virtio-pci.h"
#include "hw/arm/sysbus-fdt.h"
#include "hw/platform-bus.h"
#include "hw/qdev-properties.h"
#include "hw/mem/pc-dimm.h"
#include "hw/mem/nvdimm.h"
#include "hw/acpi/generic_event_device.h"
+#include "hw/virtio/virtio-iommu.h"
#define DEFINE_VIRT_MACHINE_LATEST(major, minor, latest) \
static void virt_##major##_##minor##_class_init(ObjectClass *oc, \
g_free(node);
}
+/*
+ * Describe the virtio-iommu PCI device in the device tree.
+ *
+ * A "virtio_iommu@<bdf>" node is created below the PCIe host bridge node and
+ * the host bridge "iommu-map" is written as two ranges, [0, bdf) and
+ * [bdf + 1, 0xffff], so every requester ID except the IOMMU's own one is
+ * mapped to the new phandle.
+ *
+ * @errp is part of the signature but nothing in this function sets it.
+ */
+static void create_virtio_iommu_dt_bindings(VirtMachineState *vms, Error **errp)
+{
+    const char compat[] = "virtio,pci-iommu";
+    uint16_t bdf = vms->virtio_iommu_bdf;
+    uint32_t phandle = qemu_fdt_alloc_phandle(vms->fdt);
+    char *iommu_node = g_strdup_printf("%s/virtio_iommu@%d",
+                                       vms->pciehb_nodename, bdf);
+
+    vms->iommu_phandle = phandle;
+
+    /* The node itself: compatible string, PCI "reg" and phandle. */
+    qemu_fdt_add_subnode(vms->fdt, iommu_node);
+    qemu_fdt_setprop(vms->fdt, iommu_node, "compatible",
+                     compat, sizeof(compat));
+    qemu_fdt_setprop_sized_cells(vms->fdt, iommu_node, "reg",
+                                 1, bdf << 8, 1, 0, 1, 0,
+                                 1, 0, 1, 0);
+    qemu_fdt_setprop_cell(vms->fdt, iommu_node, "#iommu-cells", 1);
+    qemu_fdt_setprop_cell(vms->fdt, iommu_node, "phandle", phandle);
+    g_free(iommu_node);
+
+    /* Route all requester IDs but @bdf through the IOMMU. */
+    qemu_fdt_setprop_cells(vms->fdt, vms->pciehb_nodename, "iommu-map",
+                           0x0, phandle, 0x0, bdf,
+                           bdf + 1, phandle, bdf + 1, 0xffff - bdf);
+}
+
static void create_pcie(VirtMachineState *vms)
{
hwaddr base_mmio = vms->memmap[VIRT_PCIE_MMIO].base;
}
}
- nodename = g_strdup_printf("/pcie@%" PRIx64, base);
+ nodename = vms->pciehb_nodename = g_strdup_printf("/pcie@%" PRIx64, base);
qemu_fdt_add_subnode(vms->fdt, nodename);
qemu_fdt_setprop_string(vms->fdt, nodename,
"compatible", "pci-host-ecam-generic");
if (vms->iommu) {
vms->iommu_phandle = qemu_fdt_alloc_phandle(vms->fdt);
- create_smmu(vms, pci->bus);
-
- qemu_fdt_setprop_cells(vms->fdt, nodename, "iommu-map",
- 0x0, vms->iommu_phandle, 0x0, 0x10000);
+ switch (vms->iommu) {
+ case VIRT_IOMMU_SMMUV3:
+ create_smmu(vms, pci->bus);
+ qemu_fdt_setprop_cells(vms->fdt, nodename, "iommu-map",
+ 0x0, vms->iommu_phandle, 0x0, 0x10000);
+ break;
+ default:
+ g_assert_not_reached();
+ }
}
-
- g_free(nodename);
}
static void create_platform_bus(VirtMachineState *vms)
if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) {
virt_memory_plug(hotplug_dev, dev, errp);
}
+ if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_IOMMU_PCI)) {
+ PCIDevice *pdev = PCI_DEVICE(dev);
+
+ vms->iommu = VIRT_IOMMU_VIRTIO;
+ vms->virtio_iommu_bdf = pci_get_bdf(pdev);
+ create_virtio_iommu_dt_bindings(vms, errp);
+ }
}
static void virt_machine_device_unplug_request_cb(HotplugHandler *hotplug_dev,
(object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM))) {
return HOTPLUG_HANDLER(machine);
}
+ if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_IOMMU_PCI)) {
+ VirtMachineState *vms = VIRT_MACHINE(machine);
+ if (!vms->bootinfo.firmware_loaded || !acpi_enabled) {
+ return HOTPLUG_HANDLER(machine);
+ }
+ }
return NULL;
}
return;
}
+ xendev = dataplane->xendev;
+
aio_context_acquire(dataplane->ctx);
+ if (dataplane->event_channel) {
+ /* Only reason for failure is a NULL channel */
+ xen_device_set_event_channel_context(xendev, dataplane->event_channel,
+ qemu_get_aio_context(),
+ &error_abort);
+ }
/* Xen doesn't have multiple users for nodes, so this can't fail */
blk_set_aio_context(dataplane->blk, qemu_get_aio_context(), &error_abort);
aio_context_release(dataplane->ctx);
- xendev = dataplane->xendev;
+ /*
+ * Now that the context has been moved onto the main thread, cancel
+ * further processing.
+ */
+ qemu_bh_cancel(dataplane->bh);
if (dataplane->event_channel) {
Error *local_err = NULL;
}
dataplane->event_channel =
- xen_device_bind_event_channel(xendev, dataplane->ctx, event_channel,
+ xen_device_bind_event_channel(xendev, event_channel,
xen_block_dataplane_event, dataplane,
&local_err);
if (local_err) {
aio_context_acquire(dataplane->ctx);
/* If other users keep the BlockBackend in the iothread, that's ok */
blk_set_aio_context(dataplane->blk, dataplane->ctx, NULL);
+ /* Only reason for failure is a NULL channel */
+ xen_device_set_event_channel_context(xendev, dataplane->event_channel,
+ dataplane->ctx, &error_abort);
aio_context_release(dataplane->ctx);
+
return;
stop:
s->connected = true;
s->dev.nvqs = s->num_queues;
- s->dev.vqs = s->vqs;
+ s->dev.vqs = s->vhost_vqs;
s->dev.vq_index = 0;
s->dev.backend_features = 0;
virtio_init(vdev, "virtio-blk", VIRTIO_ID_BLOCK,
sizeof(struct virtio_blk_config));
+ s->virtqs = g_new(VirtQueue *, s->num_queues);
for (i = 0; i < s->num_queues; i++) {
- virtio_add_queue(vdev, s->queue_size,
- vhost_user_blk_handle_output);
+ s->virtqs[i] = virtio_add_queue(vdev, s->queue_size,
+ vhost_user_blk_handle_output);
}
s->inflight = g_new0(struct vhost_inflight, 1);
- s->vqs = g_new0(struct vhost_virtqueue, s->num_queues);
+ s->vhost_vqs = g_new0(struct vhost_virtqueue, s->num_queues);
s->watch = 0;
s->connected = false;
return;
virtio_err:
- g_free(s->vqs);
+ g_free(s->vhost_vqs);
g_free(s->inflight);
+ for (i = 0; i < s->num_queues; i++) {
+ virtio_delete_queue(s->virtqs[i]);
+ }
+ g_free(s->virtqs);
virtio_cleanup(vdev);
vhost_user_cleanup(&s->vhost_user);
}
{
VirtIODevice *vdev = VIRTIO_DEVICE(dev);
VHostUserBlk *s = VHOST_USER_BLK(dev);
+ int i;
virtio_set_status(vdev, 0);
qemu_chr_fe_set_handlers(&s->chardev, NULL, NULL, NULL,
NULL, NULL, NULL, false);
vhost_dev_cleanup(&s->dev);
vhost_dev_free_inflight(s->inflight);
- g_free(s->vqs);
+ g_free(s->vhost_vqs);
g_free(s->inflight);
+
+ for (i = 0; i < s->num_queues; i++) {
+ virtio_delete_queue(s->virtqs[i]);
+ }
+ g_free(s->virtqs);
virtio_cleanup(vdev);
vhost_user_cleanup(&s->vhost_user);
}
/* if not Linux load the address of the (short) IPL PSW */
ipl_psw = rom_ptr(4, 4);
if (ipl_psw) {
- pentry = be32_to_cpu(*ipl_psw) & 0x7fffffffUL;
+ pentry = be32_to_cpu(*ipl_psw) & PSW_MASK_SHORT_ADDR;
} else {
error_setg(&err, "Could not get IPL PSW");
goto error;
default y
depends on VIRTIO
+config VIRTIO_IOMMU
+ bool
+ default y
+ depends on VIRTIO
+
config VIRTIO_PCI
bool
default y if PCI_DEVICES
obj-$(CONFIG_VIRTIO_PMEM) += virtio-pmem.o
common-obj-$(call land,$(CONFIG_VIRTIO_PMEM),$(CONFIG_VIRTIO_PCI)) += virtio-pmem-pci.o
obj-$(call land,$(CONFIG_VHOST_USER_FS),$(CONFIG_VIRTIO_PCI)) += vhost-user-fs-pci.o
+obj-$(CONFIG_VIRTIO_IOMMU) += virtio-iommu.o
obj-$(CONFIG_VHOST_VSOCK) += vhost-vsock.o
ifeq ($(CONFIG_VIRTIO_PCI),y)
obj-$(CONFIG_VIRTIO_INPUT) += virtio-input-pci.o
obj-$(CONFIG_VIRTIO_RNG) += virtio-rng-pci.o
obj-$(CONFIG_VIRTIO_BALLOON) += virtio-balloon-pci.o
+obj-$(CONFIG_VIRTIO_IOMMU) += virtio-iommu-pci.o
obj-$(CONFIG_VIRTIO_9P) += virtio-9p-pci.o
obj-$(CONFIG_VIRTIO_SCSI) += virtio-scsi-pci.o
obj-$(CONFIG_VIRTIO_BLK) += virtio-blk-pci.o
virtio_mmio_guest_page(uint64_t size, int shift) "guest page size 0x%" PRIx64 " shift %d"
virtio_mmio_queue_write(uint64_t value, int max_size) "mmio_queue write 0x%" PRIx64 " max %d"
virtio_mmio_setting_irq(int level) "virtio_mmio setting IRQ %d"
+
+# hw/virtio/virtio-iommu.c
+virtio_iommu_device_reset(void) "reset!"
+virtio_iommu_get_features(uint64_t features) "device supports features=0x%"PRIx64
+virtio_iommu_device_status(uint8_t status) "driver status = %d"
+virtio_iommu_get_config(uint64_t page_size_mask, uint64_t start, uint64_t end, uint32_t domain_range, uint32_t probe_size) "page_size_mask=0x%"PRIx64" start=0x%"PRIx64" end=0x%"PRIx64" domain_range=%d probe_size=0x%x"
+virtio_iommu_set_config(uint64_t page_size_mask, uint64_t start, uint64_t end, uint32_t domain_range, uint32_t probe_size) "page_size_mask=0x%"PRIx64" start=0x%"PRIx64" end=0x%"PRIx64" domain_bits=%d probe_size=0x%x"
+virtio_iommu_attach(uint32_t domain_id, uint32_t ep_id) "domain=%d endpoint=%d"
+virtio_iommu_detach(uint32_t domain_id, uint32_t ep_id) "domain=%d endpoint=%d"
+virtio_iommu_map(uint32_t domain_id, uint64_t virt_start, uint64_t virt_end, uint64_t phys_start, uint32_t flags) "domain=%d virt_start=0x%"PRIx64" virt_end=0x%"PRIx64 " phys_start=0x%"PRIx64" flags=%d"
+virtio_iommu_unmap(uint32_t domain_id, uint64_t virt_start, uint64_t virt_end) "domain=%d virt_start=0x%"PRIx64" virt_end=0x%"PRIx64
+virtio_iommu_unmap_done(uint32_t domain_id, uint64_t virt_start, uint64_t virt_end) "domain=%d virt_start=0x%"PRIx64" virt_end=0x%"PRIx64
+virtio_iommu_translate(const char *name, uint32_t rid, uint64_t iova, int flag) "mr=%s rid=%d addr=0x%"PRIx64" flag=%d"
+virtio_iommu_init_iommu_mr(char *iommu_mr) "init %s"
+virtio_iommu_get_endpoint(uint32_t ep_id) "Alloc endpoint=%d"
+virtio_iommu_put_endpoint(uint32_t ep_id) "Free endpoint=%d"
+virtio_iommu_get_domain(uint32_t domain_id) "Alloc domain=%d"
+virtio_iommu_put_domain(uint32_t domain_id) "Free domain=%d"
+virtio_iommu_translate_out(uint64_t virt_addr, uint64_t phys_addr, uint32_t sid) "0x%"PRIx64" -> 0x%"PRIx64 " for sid=%d"
+virtio_iommu_report_fault(uint8_t reason, uint32_t flags, uint32_t endpoint, uint64_t addr) "FAULT reason=%d flags=%d endpoint=%d address =0x%"PRIx64
sizeof(struct virtio_fs_config));
/* Hiprio queue */
- virtio_add_queue(vdev, fs->conf.queue_size, vuf_handle_output);
+ fs->hiprio_vq = virtio_add_queue(vdev, fs->conf.queue_size, vuf_handle_output);
/* Request queues */
+ fs->req_vqs = g_new(VirtQueue *, fs->conf.num_request_queues);
for (i = 0; i < fs->conf.num_request_queues; i++) {
- virtio_add_queue(vdev, fs->conf.queue_size, vuf_handle_output);
+ fs->req_vqs[i] = virtio_add_queue(vdev, fs->conf.queue_size, vuf_handle_output);
}
/* 1 high prio queue, plus the number configured */
err_virtio:
vhost_user_cleanup(&fs->vhost_user);
+ virtio_delete_queue(fs->hiprio_vq);
+ for (i = 0; i < fs->conf.num_request_queues; i++) {
+ virtio_delete_queue(fs->req_vqs[i]);
+ }
+ g_free(fs->req_vqs);
virtio_cleanup(vdev);
g_free(fs->vhost_dev.vqs);
return;
{
VirtIODevice *vdev = VIRTIO_DEVICE(dev);
VHostUserFS *fs = VHOST_USER_FS(dev);
+ int i;
/* This will stop vhost backend if appropriate. */
vuf_set_status(vdev, 0);
vhost_user_cleanup(&fs->vhost_user);
+ virtio_delete_queue(fs->hiprio_vq);
+ for (i = 0; i < fs->conf.num_request_queues; i++) {
+ virtio_delete_queue(fs->req_vqs[i]);
+ }
+ g_free(fs->req_vqs);
virtio_cleanup(vdev);
g_free(fs->vhost_dev.vqs);
fs->vhost_dev.vqs = NULL;
&offset);
fd = memory_region_get_fd(mr);
if (fd > 0) {
+ assert(fd_num < VHOST_MEMORY_MAX_NREGIONS);
trace_vhost_user_set_mem_table_withfd(fd_num, mr->name,
reg->memory_size,
reg->guest_phys_addr,
msg.payload.memory.regions[fd_num].guest_phys_addr =
reg->guest_phys_addr;
msg.payload.memory.regions[fd_num].mmap_offset = offset;
- assert(fd_num < VHOST_MEMORY_MAX_NREGIONS);
fds[fd_num++] = fd;
} else {
u->region_rb_offset[i] = 0;
"VHOST_USER_PROTOCOL_F_LOG_SHMFD feature.");
}
- err = vhost_setup_slave_channel(dev);
- if (err < 0) {
- return err;
+ if (dev->vq_index == 0) {
+ err = vhost_setup_slave_channel(dev);
+ if (err < 0) {
+ return err;
+ }
}
u->postcopy_notifier.notify = vhost_user_postcopy_notifier;
max_queues = vcrypto->multiqueue ? vcrypto->max_queues : 1;
for (i = 0; i < max_queues; i++) {
- virtio_del_queue(vdev, i);
+ virtio_delete_queue(vcrypto->vqs[i].dataq);
q = &vcrypto->vqs[i];
qemu_bh_delete(q->dataq_bh);
}
g_free(vcrypto->vqs);
+ virtio_delete_queue(vcrypto->ctrl_vq);
virtio_cleanup(vdev);
cryptodev_backend_set_used(vcrypto->cryptodev, false);
--- /dev/null
+/*
+ * Virtio IOMMU PCI Bindings
+ *
+ * Copyright (c) 2019 Red Hat, Inc.
+ * Written by Eric Auger
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 or
+ * (at your option) any later version.
+ */
+
+#include "qemu/osdep.h"
+
+#include "virtio-pci.h"
+#include "hw/virtio/virtio-iommu.h"
+#include "hw/qdev-properties.h"
+#include "qapi/error.h"
+#include "hw/boards.h"
+
+typedef struct VirtIOIOMMUPCI VirtIOIOMMUPCI;
+
+/*
+ * virtio-iommu-pci: the PCI flavour of the virtio-iommu device.
+ * Its state extends VirtIOPCIProxy and embeds the VirtIOIOMMU device.
+ * VIRTIO_IOMMU_PCI() is the checked cast from a generic object pointer.
+ */
+#define VIRTIO_IOMMU_PCI(obj) \
+ OBJECT_CHECK(VirtIOIOMMUPCI, (obj), TYPE_VIRTIO_IOMMU_PCI)
+
+struct VirtIOIOMMUPCI {
+ VirtIOPCIProxy parent_obj; /* parent object: the virtio PCI proxy */
+ VirtIOIOMMU vdev; /* embedded device, realized by virtio_iommu_pci_realize() */
+};
+
+static Property virtio_iommu_pci_properties[] = { /* installed by class_init */
+ DEFINE_PROP_UINT32("class", VirtIOPCIProxy, class_code, 0), /* "class" is backed by the proxy's class_code field */
+ DEFINE_PROP_END_OF_LIST(),
+};
+
+/*
+ * Realize the virtio-iommu PCI proxy.
+ *
+ * The device is refused when the machine offers no hotplug handler for it,
+ * because that handler is what the error text below expects to create the
+ * iommu-map device tree bindings. Otherwise the embedded virtio-iommu device
+ * is plugged on the proxy's virtio bus, given the PCI bus of the proxy as
+ * its "primary-bus", and realized.
+ *
+ * @vpci_dev: the proxy being realized
+ * @errp: set on failure; at most one error is ever stored in it
+ */
+static void virtio_iommu_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp)
+{
+    VirtIOIOMMUPCI *dev = VIRTIO_IOMMU_PCI(vpci_dev);
+    DeviceState *vdev = DEVICE(&dev->vdev);
+    Error *local_err = NULL;
+
+    if (!qdev_get_machine_hotplug_handler(DEVICE(vpci_dev))) {
+        MachineClass *mc = MACHINE_GET_CLASS(qdev_get_machine());
+
+        error_setg(errp,
+                   "%s machine fails to create iommu-map device tree bindings",
+                   mc->name);
+        error_append_hint(errp,
+                          "Check your machine implements a hotplug handler "
+                          "for the virtio-iommu-pci device\n");
+        error_append_hint(errp, "Check the guest is booted without FW or with "
+                          "-no-acpi\n");
+        return;
+    }
+    qdev_set_parent_bus(vdev, BUS(&vpci_dev->bus));
+
+    /*
+     * Check the link before realizing: handing the same errp to two setters
+     * in a row would trip the Error API assertion if both failed, and would
+     * realize a device whose primary bus was never set.
+     */
+    object_property_set_link(OBJECT(dev),
+                             OBJECT(pci_get_bus(&vpci_dev->pci_dev)),
+                             "primary-bus", &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        return;
+    }
+    object_property_set_bool(OBJECT(vdev), true, "realized", errp);
+}
+
+/*
+ * Class initializer: hook up the realize callback, the qdev properties and
+ * the PCI identification of the device, and mark it as not hotpluggable.
+ */
+static void virtio_iommu_pci_class_init(ObjectClass *klass, void *data)
+{
+    PCIDeviceClass *pci_class = PCI_DEVICE_CLASS(klass);
+    VirtioPCIClass *vpci_class = VIRTIO_PCI_CLASS(klass);
+    DeviceClass *dev_class = DEVICE_CLASS(klass);
+
+    /* virtio-pci level */
+    vpci_class->realize = virtio_iommu_pci_realize;
+
+    /* qdev level */
+    set_bit(DEVICE_CATEGORY_MISC, dev_class->categories);
+    device_class_set_props(dev_class, virtio_iommu_pci_properties);
+    dev_class->hotpluggable = false;
+
+    /* PCI identification */
+    pci_class->vendor_id = PCI_VENDOR_ID_REDHAT_QUMRANET;
+    pci_class->device_id = PCI_DEVICE_ID_VIRTIO_IOMMU;
+    pci_class->revision = VIRTIO_PCI_ABI_VERSION;
+    pci_class->class_id = PCI_CLASS_OTHERS;
+}
+
+/* Instance initializer: set up the VirtIOIOMMU embedded in the proxy. */
+static void virtio_iommu_pci_instance_init(Object *obj)
+{
+    VirtIOIOMMUPCI *proxy = VIRTIO_IOMMU_PCI(obj);
+    VirtIOIOMMU *embedded = &proxy->vdev;
+
+    virtio_instance_init_common(obj, embedded, sizeof(proxy->vdev),
+                                TYPE_VIRTIO_IOMMU);
+}
+
+static const VirtioPCIDeviceTypeInfo virtio_iommu_pci_info = { /* passed to virtio_pci_types_register() */
+ .base_name = TYPE_VIRTIO_IOMMU_PCI,
+ .generic_name = "virtio-iommu-pci",
+ .transitional_name = "virtio-iommu-pci-transitional",
+ .non_transitional_name = "virtio-iommu-pci-non-transitional",
+ .instance_size = sizeof(VirtIOIOMMUPCI),
+ .instance_init = virtio_iommu_pci_instance_init,
+ .class_init = virtio_iommu_pci_class_init,
+};
+
+static void virtio_iommu_pci_register(void)
+{ /* type_init() hook: registers the types described by virtio_iommu_pci_info */
+ virtio_pci_types_register(&virtio_iommu_pci_info);
+}
+
+type_init(virtio_iommu_pci_register)
+
+
--- /dev/null
+/*
+ * virtio-iommu device
+ *
+ * Copyright (c) 2020 Red Hat, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2 or later, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/log.h"
+#include "qemu/iov.h"
+#include "qemu-common.h"
+#include "hw/qdev-properties.h"
+#include "hw/virtio/virtio.h"
+#include "sysemu/kvm.h"
+#include "qapi/error.h"
+#include "qemu/error-report.h"
+#include "trace.h"
+
+#include "standard-headers/linux/virtio_ids.h"
+
+#include "hw/virtio/virtio-bus.h"
+#include "hw/virtio/virtio-access.h"
+#include "hw/virtio/virtio-iommu.h"
+#include "hw/pci/pci_bus.h"
+#include "hw/pci/pci.h"
+
+/* Max size */
+#define VIOMMU_DEFAULT_QUEUE_SIZE 256
+
+typedef struct VirtIOIOMMUDomain { /* one domain, created on first attach */
+ uint32_t id; /* domain ID taken from the guest request */
+ GTree *mappings; /* VirtIOIOMMUInterval key -> VirtIOIOMMUMapping value */
+ QLIST_HEAD(, VirtIOIOMMUEndpoint) endpoint_list; /* endpoints attached to this domain */
+} VirtIOIOMMUDomain;
+
+typedef struct VirtIOIOMMUEndpoint { /* one endpoint known to the IOMMU */
+ uint32_t id; /* endpoint ID; also used as the sid for the memory region lookup */
+ VirtIOIOMMUDomain *domain; /* domain it is attached to, NULL when detached */
+ QLIST_ENTRY(VirtIOIOMMUEndpoint) next; /* linkage in domain->endpoint_list */
+} VirtIOIOMMUEndpoint;
+
+typedef struct VirtIOIOMMUInterval { /* key of the per-domain mappings tree */
+ uint64_t low; /* first virtual address (virt_start of the request) */
+ uint64_t high; /* last virtual address, inclusive (virt_end of the request) */
+} VirtIOIOMMUInterval;
+
+typedef struct VirtIOIOMMUMapping { /* value of the per-domain mappings tree */
+ uint64_t phys_addr; /* phys_start of the MAP request */
+ uint32_t flags; /* flags of the MAP request, within VIRTIO_IOMMU_MAP_F_MASK */
+} VirtIOIOMMUMapping;
+
+/* Build the 16-bit bus/devfn identifier of @dev from its current bus number. */
+static inline uint16_t virtio_iommu_get_bdf(IOMMUDevice *dev)
+{
+    int bus_num = pci_bus_num(dev->bus);
+
+    return PCI_BUILD_BDF(bus_num, dev->devfn);
+}
+
+/**
+ * Find the IOMMUPciBus whose PCI bus currently carries number @bus_num.
+ *
+ * SID based operations only know the bus number, which may not have been
+ * assigned yet when the IOMMUPciBus was created (virtio_iommu_find_add_as).
+ * The by-number array is therefore filled lazily from the by-pointer hash
+ * table the first time a given bus number is looked up.
+ *
+ * Returns NULL when no known bus has that number.
+ */
+static IOMMUPciBus *iommu_find_iommu_pcibus(VirtIOIOMMU *s, uint8_t bus_num)
+{
+    IOMMUPciBus *candidate = s->iommu_pcibus_by_bus_num[bus_num];
+    GHashTableIter it;
+
+    if (candidate) {
+        /* already cached */
+        return candidate;
+    }
+
+    g_hash_table_iter_init(&it, s->as_by_busptr);
+    while (g_hash_table_iter_next(&it, NULL, (void **)&candidate)) {
+        if (pci_bus_num(candidate->bus) != bus_num) {
+            continue;
+        }
+        s->iommu_pcibus_by_bus_num[bus_num] = candidate;
+        return candidate;
+    }
+    return NULL;
+}
+
+/*
+ * Return the IOMMU memory region of the PCI function identified by @sid, or
+ * NULL when no IOMMUDevice has been created for that bus/devfn yet.
+ *
+ * @sid carries the bus number (extracted with PCI_BUS_NUM) and the devfn in
+ * its low bits.
+ */
+static IOMMUMemoryRegion *virtio_iommu_mr(VirtIOIOMMU *s, uint32_t sid)
+{
+    uint8_t bus_n, devfn;
+    IOMMUPciBus *iommu_pci_bus;
+    IOMMUDevice *dev;
+
+    bus_n = PCI_BUS_NUM(sid);
+    iommu_pci_bus = iommu_find_iommu_pcibus(s, bus_n);
+    if (iommu_pci_bus) {
+        /*
+         * PCI_DEVFN_MAX is the number of devfn values (it sizes pbdev[]),
+         * not a bit mask: masking with it directly always yields 0 once
+         * stored in a uint8_t. The devfn mask is PCI_DEVFN_MAX - 1.
+         */
+        devfn = sid & (PCI_DEVFN_MAX - 1);
+        dev = iommu_pci_bus->pbdev[devfn];
+        if (dev) {
+            return &dev->iommu_mr;
+        }
+    }
+    return NULL;
+}
+
+/*
+ * GTree comparator for VirtIOIOMMUInterval keys.
+ *
+ * Returns -1 when @a lies entirely below @b, 1 when it lies entirely above,
+ * and 0 as soon as the two intervals overlap (bounds included), so a lookup
+ * finds any stored interval that intersects the key.
+ */
+static gint interval_cmp(gconstpointer a, gconstpointer b, gpointer user_data)
+{
+    VirtIOIOMMUInterval *lhs = (VirtIOIOMMUInterval *)a;
+    VirtIOIOMMUInterval *rhs = (VirtIOIOMMUInterval *)b;
+
+    if (lhs->high < rhs->low) {
+        return -1;
+    }
+    if (rhs->high < lhs->low) {
+        return 1;
+    }
+    return 0;
+}
+
+/* Unlink @ep from its domain's endpoint list; no-op when it has no domain. */
+static void virtio_iommu_detach_endpoint_from_domain(VirtIOIOMMUEndpoint *ep)
+{
+    if (ep->domain) {
+        QLIST_REMOVE(ep, next);
+        ep->domain = NULL;
+    }
+}
+
+/*
+ * Return the endpoint registered under @ep_id, creating it on first use.
+ *
+ * A new endpoint is only created when an IOMMU memory region exists for
+ * @ep_id; otherwise NULL is returned.
+ */
+static VirtIOIOMMUEndpoint *virtio_iommu_get_endpoint(VirtIOIOMMU *s,
+                                                      uint32_t ep_id)
+{
+    gpointer key = GUINT_TO_POINTER(ep_id);
+    VirtIOIOMMUEndpoint *found = g_tree_lookup(s->endpoints, key);
+
+    if (!found && virtio_iommu_mr(s, ep_id)) {
+        found = g_new0(VirtIOIOMMUEndpoint, 1);
+        found->id = ep_id;
+        trace_virtio_iommu_get_endpoint(ep_id);
+        g_tree_insert(s->endpoints, key, found);
+    }
+    return found;
+}
+
+/*
+ * Release an endpoint: detach it from its domain, if any, and free it.
+ * The gpointer signature suits use as a destroy callback.
+ */
+static void virtio_iommu_put_endpoint(gpointer data)
+{
+    VirtIOIOMMUEndpoint *endpoint = data;
+
+    /* the helper already does nothing for an endpoint without a domain */
+    virtio_iommu_detach_endpoint_from_domain(endpoint);
+
+    trace_virtio_iommu_put_endpoint(endpoint->id);
+    g_free(endpoint);
+}
+
+/*
+ * Return the domain registered under @domain_id, creating an empty one (no
+ * mappings, no endpoints) when it does not exist yet.
+ *
+ * The mappings tree owns both its interval keys and its mapping values and
+ * releases them with g_free().
+ */
+static VirtIOIOMMUDomain *virtio_iommu_get_domain(VirtIOIOMMU *s,
+                                                  uint32_t domain_id)
+{
+    gpointer key = GUINT_TO_POINTER(domain_id);
+    VirtIOIOMMUDomain *dom = g_tree_lookup(s->domains, key);
+
+    if (!dom) {
+        dom = g_new0(VirtIOIOMMUDomain, 1);
+        dom->id = domain_id;
+        QLIST_INIT(&dom->endpoint_list);
+        dom->mappings = g_tree_new_full((GCompareDataFunc)interval_cmp,
+                                        NULL, (GDestroyNotify)g_free,
+                                        (GDestroyNotify)g_free);
+        g_tree_insert(s->domains, key, dom);
+        trace_virtio_iommu_get_domain(domain_id);
+    }
+    return dom;
+}
+
+/*
+ * Release a domain: detach every endpoint still attached to it, destroy its
+ * mappings tree and free it. The endpoints themselves are not freed.
+ * The gpointer signature suits use as a destroy callback.
+ */
+static void virtio_iommu_put_domain(gpointer data)
+{
+    VirtIOIOMMUDomain *dom = data;
+    VirtIOIOMMUEndpoint *ep, *next_ep;
+
+    /* _SAFE variant: detaching unlinks the element being visited */
+    QLIST_FOREACH_SAFE(ep, &dom->endpoint_list, next, next_ep) {
+        virtio_iommu_detach_endpoint_from_domain(ep);
+    }
+
+    g_tree_destroy(dom->mappings);
+    trace_virtio_iommu_put_domain(dom->id);
+    g_free(dom);
+}
+
+/*
+ * Return the address space of the function @devfn on @bus, creating the
+ * per-bus and per-device bookkeeping on first use.
+ *
+ * @opaque is the VirtIOIOMMU. Each new device gets its own IOMMU memory
+ * region, named after a file-wide running index and the devfn, and an
+ * address space rooted at that region.
+ */
+static AddressSpace *virtio_iommu_find_add_as(PCIBus *bus, void *opaque,
+                                              int devfn)
+{
+    static uint32_t mr_index;
+    VirtIOIOMMU *s = opaque;
+    IOMMUPciBus *sbus = g_hash_table_lookup(s->as_by_busptr, bus);
+    IOMMUDevice *sdev;
+    char *name;
+
+    if (!sbus) {
+        /* header followed by one IOMMUDevice pointer slot per devfn */
+        sbus = g_malloc0(sizeof(IOMMUPciBus) +
+                         sizeof(IOMMUDevice *) * PCI_DEVFN_MAX);
+        sbus->bus = bus;
+        g_hash_table_insert(s->as_by_busptr, bus, sbus);
+    }
+
+    sdev = sbus->pbdev[devfn];
+    if (sdev) {
+        return &sdev->as;
+    }
+
+    name = g_strdup_printf("%s-%d-%d",
+                           TYPE_VIRTIO_IOMMU_MEMORY_REGION,
+                           mr_index++, devfn);
+
+    sdev = g_malloc0(sizeof(IOMMUDevice));
+    sbus->pbdev[devfn] = sdev;
+    sdev->viommu = s;
+    sdev->bus = bus;
+    sdev->devfn = devfn;
+
+    trace_virtio_iommu_init_iommu_mr(name);
+
+    memory_region_init_iommu(&sdev->iommu_mr, sizeof(sdev->iommu_mr),
+                             TYPE_VIRTIO_IOMMU_MEMORY_REGION,
+                             OBJECT(s), name,
+                             UINT64_MAX);
+    address_space_init(&sdev->as,
+                       MEMORY_REGION(&sdev->iommu_mr), TYPE_VIRTIO_IOMMU);
+    g_free(name);
+
+    return &sdev->as;
+}
+
+/*
+ * Handle an ATTACH request: bind the endpoint to the domain, creating the
+ * domain when needed.
+ *
+ * An endpoint that is already attached is first detached from its previous
+ * domain; that domain is removed from s->domains when this leaves it without
+ * endpoints.
+ *
+ * Returns VIRTIO_IOMMU_S_NOENT when the endpoint is unknown and cannot be
+ * created, VIRTIO_IOMMU_S_OK otherwise.
+ */
+static int virtio_iommu_attach(VirtIOIOMMU *s,
+                               struct virtio_iommu_req_attach *req)
+{
+    uint32_t domain_id = le32_to_cpu(req->domain);
+    uint32_t ep_id = le32_to_cpu(req->endpoint);
+    VirtIOIOMMUDomain *old_domain, *new_domain;
+    VirtIOIOMMUEndpoint *ep;
+
+    trace_virtio_iommu_attach(domain_id, ep_id);
+
+    ep = virtio_iommu_get_endpoint(s, ep_id);
+    if (!ep) {
+        return VIRTIO_IOMMU_S_NOENT;
+    }
+
+    old_domain = ep->domain;
+    if (old_domain) {
+        /* already attached somewhere: leave that domain first */
+        virtio_iommu_detach_endpoint_from_domain(ep);
+        if (QLIST_EMPTY(&old_domain->endpoint_list)) {
+            g_tree_remove(s->domains, GUINT_TO_POINTER(old_domain->id));
+        }
+    }
+
+    new_domain = virtio_iommu_get_domain(s, domain_id);
+    QLIST_INSERT_HEAD(&new_domain->endpoint_list, ep, next);
+    ep->domain = new_domain;
+
+    return VIRTIO_IOMMU_S_OK;
+}
+
+/*
+ * Handle a DETACH request: unbind the endpoint from the domain and remove
+ * the domain from s->domains when it is left without endpoints.
+ *
+ * Returns VIRTIO_IOMMU_S_NOENT for an unknown endpoint, VIRTIO_IOMMU_S_INVAL
+ * when the endpoint is not attached to the requested domain, and
+ * VIRTIO_IOMMU_S_OK otherwise.
+ */
+static int virtio_iommu_detach(VirtIOIOMMU *s,
+                               struct virtio_iommu_req_detach *req)
+{
+    uint32_t domain_id = le32_to_cpu(req->domain);
+    uint32_t ep_id = le32_to_cpu(req->endpoint);
+    VirtIOIOMMUEndpoint *ep;
+    VirtIOIOMMUDomain *dom;
+
+    trace_virtio_iommu_detach(domain_id, ep_id);
+
+    ep = g_tree_lookup(s->endpoints, GUINT_TO_POINTER(ep_id));
+    if (!ep) {
+        return VIRTIO_IOMMU_S_NOENT;
+    }
+
+    dom = ep->domain;
+    if (!dom) {
+        return VIRTIO_IOMMU_S_INVAL;
+    }
+    if (dom->id != domain_id) {
+        return VIRTIO_IOMMU_S_INVAL;
+    }
+
+    virtio_iommu_detach_endpoint_from_domain(ep);
+    if (QLIST_EMPTY(&dom->endpoint_list)) {
+        g_tree_remove(s->domains, GUINT_TO_POINTER(dom->id));
+    }
+
+    return VIRTIO_IOMMU_S_OK;
+}
+
+/*
+ * Handle a MAP request: record [virt_start, virt_end] -> phys_start with the
+ * requested flags in the domain's mappings tree.
+ *
+ * Returns VIRTIO_IOMMU_S_INVAL for flags outside VIRTIO_IOMMU_MAP_F_MASK or
+ * when the range overlaps an existing mapping, VIRTIO_IOMMU_S_NOENT for an
+ * unknown domain, and VIRTIO_IOMMU_S_OK otherwise.
+ */
+static int virtio_iommu_map(VirtIOIOMMU *s,
+                            struct virtio_iommu_req_map *req)
+{
+    uint32_t domain_id = le32_to_cpu(req->domain);
+    uint64_t phys_start = le64_to_cpu(req->phys_start);
+    uint64_t virt_start = le64_to_cpu(req->virt_start);
+    uint64_t virt_end = le64_to_cpu(req->virt_end);
+    uint32_t flags = le32_to_cpu(req->flags);
+    VirtIOIOMMUDomain *dom;
+    VirtIOIOMMUInterval *key;
+    VirtIOIOMMUMapping *val;
+
+    if (flags & ~VIRTIO_IOMMU_MAP_F_MASK) {
+        return VIRTIO_IOMMU_S_INVAL;
+    }
+
+    dom = g_tree_lookup(s->domains, GUINT_TO_POINTER(domain_id));
+    if (!dom) {
+        return VIRTIO_IOMMU_S_NOENT;
+    }
+
+    key = g_new0(VirtIOIOMMUInterval, 1);
+    key->low = virt_start;
+    key->high = virt_end;
+
+    /* the comparator reports any overlap as a match */
+    if (g_tree_lookup(dom->mappings, (gpointer)key)) {
+        g_free(key);
+        return VIRTIO_IOMMU_S_INVAL;
+    }
+
+    trace_virtio_iommu_map(domain_id, virt_start, virt_end, phys_start, flags);
+
+    val = g_new0(VirtIOIOMMUMapping, 1);
+    val->phys_addr = phys_start;
+    val->flags = flags;
+
+    /* the tree takes ownership of both key and value */
+    g_tree_insert(dom->mappings, key, val);
+
+    return VIRTIO_IOMMU_S_OK;
+}
+
+/*
+ * Handle an UNMAP request for the range [virt_start, virt_end] of a domain.
+ *
+ * Each mapping the tree lookup returns for the requested range is removed
+ * as long as it lies entirely inside that range. The loop stops with
+ * VIRTIO_IOMMU_S_RANGE at the first mapping that is only partially covered;
+ * mappings removed before that point stay removed.
+ *
+ * Returns VIRTIO_IOMMU_S_NOENT if the domain is unknown, otherwise
+ * VIRTIO_IOMMU_S_OK or VIRTIO_IOMMU_S_RANGE as described above.
+ */
+static int virtio_iommu_unmap(VirtIOIOMMU *s,
+ struct virtio_iommu_req_unmap *req)
+{
+ uint32_t domain_id = le32_to_cpu(req->domain);
+ uint64_t virt_start = le64_to_cpu(req->virt_start);
+ uint64_t virt_end = le64_to_cpu(req->virt_end);
+ VirtIOIOMMUMapping *iter_val;
+ VirtIOIOMMUInterval interval, *iter_key;
+ VirtIOIOMMUDomain *domain;
+ int ret = VIRTIO_IOMMU_S_OK;
+
+ trace_virtio_iommu_unmap(domain_id, virt_start, virt_end);
+
+ domain = g_tree_lookup(s->domains, GUINT_TO_POINTER(domain_id));
+ if (!domain) {
+ return VIRTIO_IOMMU_S_NOENT;
+ }
+ interval.low = virt_start;
+ interval.high = virt_end;
+
+ /* Keep looking up the same range until the tree has no match left. */
+ while (g_tree_lookup_extended(domain->mappings, &interval,
+ (void **)&iter_key, (void**)&iter_val)) {
+ /*
+ * Copy the bounds before removal: they are traced afterwards, and the
+ * tree may release iter_key when the entry is removed (domain_preload
+ * installs g_free as key destructor).
+ */
+ uint64_t current_low = iter_key->low;
+ uint64_t current_high = iter_key->high;
+
+ if (interval.low <= current_low && interval.high >= current_high) {
+ g_tree_remove(domain->mappings, iter_key);
+ trace_virtio_iommu_unmap_done(domain_id, current_low, current_high);
+ } else {
+ /* The request would split an existing mapping. */
+ ret = VIRTIO_IOMMU_S_RANGE;
+ break;
+ }
+ }
+ return ret;
+}
+
+/*
+ * Copy the payload of a request, i.e. @req_sz bytes minus the trailing
+ * struct virtio_iommu_req_tail, from the iovec into @req.
+ *
+ * Returns 0 when the whole payload was copied and VIRTIO_IOMMU_S_INVAL
+ * when iov_to_buf() delivered a different number of bytes.
+ */
+static int virtio_iommu_iov_to_req(struct iovec *iov,
+                                   unsigned int iov_cnt,
+                                   void *req, size_t req_sz)
+{
+    const size_t wanted = req_sz - sizeof(struct virtio_iommu_req_tail);
+    size_t copied = iov_to_buf(iov, iov_cnt, 0, req, wanted);
+
+    return unlikely(copied != wanted) ? VIRTIO_IOMMU_S_INVAL : 0;
+}
+
+/*
+ * Generate virtio_iommu_handle_<req>(): copy the request payload out of the
+ * iovec into a local struct virtio_iommu_req_<req> with
+ * virtio_iommu_iov_to_req() and, if the copy succeeded, pass it to
+ * virtio_iommu_<req>(). The generated function returns the copy error or
+ * the status returned by the handler.
+ */
+#define virtio_iommu_handle_req(__req) \
+static int virtio_iommu_handle_ ## __req(VirtIOIOMMU *s, \
+ struct iovec *iov, \
+ unsigned int iov_cnt) \
+{ \
+ struct virtio_iommu_req_ ## __req req; \
+ int ret = virtio_iommu_iov_to_req(iov, iov_cnt, &req, sizeof(req)); \
+ \
+ return ret ? ret : virtio_iommu_ ## __req(s, &req); \
+}
+
+/* One wrapper per request type dispatched by virtio_iommu_handle_command(). */
+virtio_iommu_handle_req(attach)
+virtio_iommu_handle_req(detach)
+virtio_iommu_handle_req(map)
+virtio_iommu_handle_req(unmap)
+
+/*
+ * Request virtqueue handler: pop every available element, decode the
+ * request head, run the matching handler with s->mutex held and write the
+ * resulting status back to the guest in a struct virtio_iommu_req_tail.
+ *
+ * An element whose buffers are too small for a head or a tail puts the
+ * device into the error state via virtio_error() and stops processing.
+ */
+static void virtio_iommu_handle_command(VirtIODevice *vdev, VirtQueue *vq)
+{
+ VirtIOIOMMU *s = VIRTIO_IOMMU(vdev);
+ struct virtio_iommu_req_head head;
+ struct virtio_iommu_req_tail tail = {};
+ VirtQueueElement *elem;
+ unsigned int iov_cnt;
+ struct iovec *iov;
+ size_t sz;
+
+ for (;;) {
+ elem = virtqueue_pop(vq, sizeof(VirtQueueElement));
+ if (!elem) {
+ /* Queue drained. */
+ return;
+ }
+
+ /* in_sg must hold the tail, out_sg the head. */
+ if (iov_size(elem->in_sg, elem->in_num) < sizeof(tail) ||
+ iov_size(elem->out_sg, elem->out_num) < sizeof(head)) {
+ virtio_error(vdev, "virtio-iommu bad head/tail size");
+ virtqueue_detach_element(vq, elem, 0);
+ g_free(elem);
+ break;
+ }
+
+ iov_cnt = elem->out_num;
+ iov = elem->out_sg;
+ sz = iov_to_buf(iov, iov_cnt, 0, &head, sizeof(head));
+ if (unlikely(sz != sizeof(head))) {
+ tail.status = VIRTIO_IOMMU_S_DEVERR;
+ goto out;
+ }
+ /* The handlers run under s->mutex, which translate() also takes. */
+ qemu_mutex_lock(&s->mutex);
+ switch (head.type) {
+ case VIRTIO_IOMMU_T_ATTACH:
+ tail.status = virtio_iommu_handle_attach(s, iov, iov_cnt);
+ break;
+ case VIRTIO_IOMMU_T_DETACH:
+ tail.status = virtio_iommu_handle_detach(s, iov, iov_cnt);
+ break;
+ case VIRTIO_IOMMU_T_MAP:
+ tail.status = virtio_iommu_handle_map(s, iov, iov_cnt);
+ break;
+ case VIRTIO_IOMMU_T_UNMAP:
+ tail.status = virtio_iommu_handle_unmap(s, iov, iov_cnt);
+ break;
+ default:
+ tail.status = VIRTIO_IOMMU_S_UNSUPP;
+ }
+ qemu_mutex_unlock(&s->mutex);
+
+out:
+ /* Every path reaching this label has set tail.status. */
+ sz = iov_from_buf(elem->in_sg, elem->in_num, 0,
+ &tail, sizeof(tail));
+ assert(sz == sizeof(tail));
+
+ virtqueue_push(vq, elem, sizeof(tail));
+ virtio_notify(vdev, vq);
+ g_free(elem);
+ }
+}
+
+/*
+ * Post a struct virtio_iommu_fault describing a translation fault on the
+ * event virtqueue.
+ *
+ * @reason is stored as is; @flags, @endpoint and @address are converted to
+ * little endian. The fault is dropped with a one-time error message when
+ * the event queue has no buffer, and the device is put into the error state
+ * when the buffer popped is too small for the record.
+ */
+static void virtio_iommu_report_fault(VirtIOIOMMU *viommu, uint8_t reason,
+                                      int flags, uint32_t endpoint,
+                                      uint64_t address)
+{
+    VirtIODevice *vdev = &viommu->parent_obj;
+    VirtQueue *event_vq = viommu->event_vq;
+    struct virtio_iommu_fault record;
+    VirtQueueElement *buf;
+    size_t written;
+
+    buf = virtqueue_pop(event_vq, sizeof(VirtQueueElement));
+    if (buf == NULL) {
+        error_report_once(
+            "no buffer available in event queue to report event");
+        return;
+    }
+
+    if (iov_size(buf->in_sg, buf->in_num) < sizeof(record)) {
+        virtio_error(vdev, "error buffer of wrong size");
+        virtqueue_detach_element(event_vq, buf, 0);
+        g_free(buf);
+        return;
+    }
+
+    /* Zero the whole record so no stale stack bytes are copied out. */
+    memset(&record, 0, sizeof(record));
+    record.reason = reason;
+    record.flags = cpu_to_le32(flags);
+    record.endpoint = cpu_to_le32(endpoint);
+    record.address = cpu_to_le64(address);
+
+    written = iov_from_buf(buf->in_sg, buf->in_num, 0,
+                           &record, sizeof(record));
+    assert(written == sizeof(record));
+
+    trace_virtio_iommu_report_fault(reason, flags, endpoint, address);
+    virtqueue_push(event_vq, buf, written);
+    virtio_notify(vdev, event_vq);
+    g_free(buf);
+}
+
+/*
+ * IOMMUMemoryRegionClass::translate callback.
+ *
+ * Looks up the endpoint identified by the BDF of the device behind @mr and
+ * translates @addr through the mappings of the domain that endpoint is
+ * attached to. The returned entry starts out as an identity mapping with
+ * IOMMU_NONE permission; perm is only set to @flag when
+ *  - the endpoint is unknown or not attached and the BYPASS feature is set, or
+ *  - a mapping covering @addr exists and grants the requested access.
+ * All other cases report a fault on the event queue and return the entry
+ * with IOMMU_NONE.
+ *
+ * s->mutex is held for the duration of the lookup.
+ */
+static IOMMUTLBEntry virtio_iommu_translate(IOMMUMemoryRegion *mr, hwaddr addr,
+ IOMMUAccessFlags flag,
+ int iommu_idx)
+{
+ IOMMUDevice *sdev = container_of(mr, IOMMUDevice, iommu_mr);
+ VirtIOIOMMUInterval interval, *mapping_key;
+ VirtIOIOMMUMapping *mapping_value;
+ VirtIOIOMMU *s = sdev->viommu;
+ bool read_fault, write_fault;
+ VirtIOIOMMUEndpoint *ep;
+ uint32_t sid, flags;
+ bool bypass_allowed;
+ bool found;
+
+ /*
+ * NOTE(review): the lookup key spans [addr, addr + 1]; depending on how
+ * interval_cmp treats the bounds this may also match a mapping that
+ * starts at addr + 1 - confirm against interval_cmp.
+ */
+ interval.low = addr;
+ interval.high = addr + 1;
+
+ /* Default result: identity address, no permission. */
+ IOMMUTLBEntry entry = {
+ .target_as = &address_space_memory,
+ .iova = addr,
+ .translated_addr = addr,
+ .addr_mask = (1 << ctz32(s->config.page_size_mask)) - 1,
+ .perm = IOMMU_NONE,
+ };
+
+ bypass_allowed = virtio_vdev_has_feature(&s->parent_obj,
+ VIRTIO_IOMMU_F_BYPASS);
+
+ sid = virtio_iommu_get_bdf(sdev);
+
+ trace_virtio_iommu_translate(mr->parent_obj.name, sid, addr, flag);
+ qemu_mutex_lock(&s->mutex);
+
+ ep = g_tree_lookup(s->endpoints, GUINT_TO_POINTER(sid));
+ if (!ep) {
+ /* Unknown endpoint: fault unless bypass is permitted. */
+ if (!bypass_allowed) {
+ error_report_once("%s sid=%d is not known!!", __func__, sid);
+ virtio_iommu_report_fault(s, VIRTIO_IOMMU_FAULT_R_UNKNOWN,
+ VIRTIO_IOMMU_FAULT_F_ADDRESS,
+ sid, addr);
+ } else {
+ entry.perm = flag;
+ }
+ goto unlock;
+ }
+
+ if (!ep->domain) {
+ /* Known endpoint without a domain: same bypass rule. */
+ if (!bypass_allowed) {
+ error_report_once("%s %02x:%02x.%01x not attached to any domain",
+ __func__, PCI_BUS_NUM(sid),
+ PCI_SLOT(sid), PCI_FUNC(sid));
+ virtio_iommu_report_fault(s, VIRTIO_IOMMU_FAULT_R_DOMAIN,
+ VIRTIO_IOMMU_FAULT_F_ADDRESS,
+ sid, addr);
+ } else {
+ entry.perm = flag;
+ }
+ goto unlock;
+ }
+
+ found = g_tree_lookup_extended(ep->domain->mappings, (gpointer)(&interval),
+ (void **)&mapping_key,
+ (void **)&mapping_value);
+ if (!found) {
+ error_report_once("%s no mapping for 0x%"PRIx64" for sid=%d",
+ __func__, addr, sid);
+ virtio_iommu_report_fault(s, VIRTIO_IOMMU_FAULT_R_MAPPING,
+ VIRTIO_IOMMU_FAULT_F_ADDRESS,
+ sid, addr);
+ goto unlock;
+ }
+
+ /* A fault is raised for each requested access the mapping lacks. */
+ read_fault = (flag & IOMMU_RO) &&
+ !(mapping_value->flags & VIRTIO_IOMMU_MAP_F_READ);
+ write_fault = (flag & IOMMU_WO) &&
+ !(mapping_value->flags & VIRTIO_IOMMU_MAP_F_WRITE);
+
+ flags = read_fault ? VIRTIO_IOMMU_FAULT_F_READ : 0;
+ flags |= write_fault ? VIRTIO_IOMMU_FAULT_F_WRITE : 0;
+ if (flags) {
+ error_report_once("%s permission error on 0x%"PRIx64"(%d): allowed=%d",
+ __func__, addr, flag, mapping_value->flags);
+ /* F_ADDRESS is OR-ed in here and again in the call below (redundant). */
+ flags |= VIRTIO_IOMMU_FAULT_F_ADDRESS;
+ virtio_iommu_report_fault(s, VIRTIO_IOMMU_FAULT_R_MAPPING,
+ flags | VIRTIO_IOMMU_FAULT_F_ADDRESS,
+ sid, addr);
+ goto unlock;
+ }
+ /* Offset of addr inside the mapping, rebased onto its physical start. */
+ entry.translated_addr = addr - mapping_key->low + mapping_value->phys_addr;
+ entry.perm = flag;
+ trace_virtio_iommu_translate_out(addr, entry.translated_addr, sid);
+
+unlock:
+ qemu_mutex_unlock(&s->mutex);
+ return entry;
+}
+
+/*
+ * VirtioDeviceClass::get_config callback: trace the current device
+ * configuration and copy it into @config_data.
+ */
+static void virtio_iommu_get_config(VirtIODevice *vdev, uint8_t *config_data)
+{
+    VirtIOIOMMU *dev = VIRTIO_IOMMU(vdev);
+    struct virtio_iommu_config *cfg = &dev->config;
+
+    trace_virtio_iommu_get_config(cfg->page_size_mask,
+                                  cfg->input_range.start,
+                                  cfg->input_range.end,
+                                  cfg->domain_range.end,
+                                  cfg->probe_size);
+    memcpy(config_data, cfg, sizeof(*cfg));
+}
+
+/*
+ * VirtioDeviceClass::set_config callback: the configuration written by the
+ * guest is only decoded into a local copy and traced; the device state is
+ * left untouched.
+ */
+static void virtio_iommu_set_config(VirtIODevice *vdev,
+                                    const uint8_t *config_data)
+{
+    struct virtio_iommu_config incoming;
+
+    memcpy(&incoming, config_data, sizeof(incoming));
+    trace_virtio_iommu_set_config(incoming.page_size_mask,
+                                  incoming.input_range.start,
+                                  incoming.input_range.end,
+                                  incoming.domain_range.end,
+                                  incoming.probe_size);
+}
+
+/*
+ * VirtioDeviceClass::get_features callback: return the incoming feature
+ * bits @f extended with the feature bits recorded in the device.
+ */
+static uint64_t virtio_iommu_get_features(VirtIODevice *vdev, uint64_t f,
+                                          Error **errp)
+{
+    VirtIOIOMMU *dev = VIRTIO_IOMMU(vdev);
+    uint64_t offered = f | dev->features;
+
+    trace_virtio_iommu_get_features(offered);
+    return offered;
+}
+
+/*
+ * GCompareDataFunc for trees keyed by unsigned integers stored directly in
+ * the key pointer. Returns -1, 0 or 1; @user_data is unused.
+ */
+static gint int_cmp(gconstpointer a, gconstpointer b, gpointer user_data)
+{
+    guint lhs = GPOINTER_TO_UINT(a);
+    guint rhs = GPOINTER_TO_UINT(b);
+
+    if (lhs < rhs) {
+        return -1;
+    }
+    return lhs > rhs ? 1 : 0;
+}
+
+/*
+ * VirtioDeviceClass::realize callback.
+ *
+ * Sets up the virtio device with its request and event queues, fills in the
+ * default configuration and feature bits, and installs
+ * virtio_iommu_find_add_as as IOMMU hook on the primary PCI bus.
+ *
+ * Fails through @errp when no primary bus has been linked. That check is
+ * done before anything is allocated so the error path has nothing to undo.
+ */
+static void virtio_iommu_device_realize(DeviceState *dev, Error **errp)
+{
+    VirtIODevice *vdev = VIRTIO_DEVICE(dev);
+    VirtIOIOMMU *s = VIRTIO_IOMMU(dev);
+
+    if (!s->primary_bus) {
+        error_setg(errp, "VIRTIO-IOMMU is not attached to any PCI bus!");
+        return;
+    }
+
+    virtio_init(vdev, "virtio-iommu", VIRTIO_ID_IOMMU,
+                sizeof(struct virtio_iommu_config));
+
+    memset(s->iommu_pcibus_by_bus_num, 0, sizeof(s->iommu_pcibus_by_bus_num));
+
+    s->req_vq = virtio_add_queue(vdev, VIOMMU_DEFAULT_QUEUE_SIZE,
+                                 virtio_iommu_handle_command);
+    s->event_vq = virtio_add_queue(vdev, VIOMMU_DEFAULT_QUEUE_SIZE, NULL);
+
+    s->config.page_size_mask = TARGET_PAGE_MASK;
+    /*
+     * input_range.end is a 64-bit field: -1UL would only set the low 32 bits
+     * on hosts where unsigned long is 32 bits wide.
+     */
+    s->config.input_range.end = UINT64_MAX;
+    s->config.domain_range.end = 32;
+
+    virtio_add_feature(&s->features, VIRTIO_RING_F_EVENT_IDX);
+    virtio_add_feature(&s->features, VIRTIO_RING_F_INDIRECT_DESC);
+    virtio_add_feature(&s->features, VIRTIO_F_VERSION_1);
+    virtio_add_feature(&s->features, VIRTIO_IOMMU_F_INPUT_RANGE);
+    virtio_add_feature(&s->features, VIRTIO_IOMMU_F_DOMAIN_RANGE);
+    virtio_add_feature(&s->features, VIRTIO_IOMMU_F_MAP_UNMAP);
+    virtio_add_feature(&s->features, VIRTIO_IOMMU_F_BYPASS);
+    virtio_add_feature(&s->features, VIRTIO_IOMMU_F_MMIO);
+
+    qemu_mutex_init(&s->mutex);
+
+    /* Values are released with g_free when the table is destroyed. */
+    s->as_by_busptr = g_hash_table_new_full(NULL, NULL, NULL, g_free);
+
+    pci_setup_iommu(s->primary_bus, virtio_iommu_find_add_as, s);
+}
+
+/*
+ * VirtioDeviceClass::unrealize callback: release everything realize and
+ * reset have set up.
+ *
+ * The domain and endpoint trees are created by the reset handler, so they
+ * are destroyed only if present (same guard as in the reset handler). The
+ * per-bus hash table, both virtqueues and the mutex created in realize are
+ * released as well.
+ */
+static void virtio_iommu_device_unrealize(DeviceState *dev, Error **errp)
+{
+    VirtIODevice *vdev = VIRTIO_DEVICE(dev);
+    VirtIOIOMMU *s = VIRTIO_IOMMU(dev);
+
+    if (s->domains) {
+        g_tree_destroy(s->domains);
+        s->domains = NULL;
+    }
+    if (s->endpoints) {
+        g_tree_destroy(s->endpoints);
+        s->endpoints = NULL;
+    }
+
+    /* Clear the by-bus-number cache as well so no stale pointers remain. */
+    g_hash_table_destroy(s->as_by_busptr);
+    s->as_by_busptr = NULL;
+    memset(s->iommu_pcibus_by_bus_num, 0, sizeof(s->iommu_pcibus_by_bus_num));
+
+    virtio_delete_queue(s->req_vq);
+    virtio_delete_queue(s->event_vq);
+    virtio_cleanup(vdev);
+
+    qemu_mutex_destroy(&s->mutex);
+}
+
+/*
+ * VirtioDeviceClass::reset callback: throw away all domains and endpoints
+ * and start over with two empty trees keyed by integer id.
+ *
+ * The trees do not exist before the first reset, hence the NULL checks.
+ * Values are released through virtio_iommu_put_domain and
+ * virtio_iommu_put_endpoint; keys are plain integers and need no destructor.
+ */
+static void virtio_iommu_device_reset(VirtIODevice *vdev)
+{
+ VirtIOIOMMU *s = VIRTIO_IOMMU(vdev);
+
+ trace_virtio_iommu_device_reset();
+
+ if (s->domains) {
+ g_tree_destroy(s->domains);
+ }
+ if (s->endpoints) {
+ g_tree_destroy(s->endpoints);
+ }
+ s->domains = g_tree_new_full((GCompareDataFunc)int_cmp,
+ NULL, NULL, virtio_iommu_put_domain);
+ s->endpoints = g_tree_new_full((GCompareDataFunc)int_cmp,
+ NULL, NULL, virtio_iommu_put_endpoint);
+}
+
+/* VirtioDeviceClass::set_status callback: only traces the new status. */
+static void virtio_iommu_set_status(VirtIODevice *vdev, uint8_t status)
+{
+ trace_virtio_iommu_device_status(status);
+}
+
+/* TypeInfo::instance_init hook; intentionally empty. */
+static void virtio_iommu_instance_init(Object *obj)
+{
+}
+
+/* VMStateDescription initializer for a VirtIOIOMMUInterval (low, high). */
+#define VMSTATE_INTERVAL \
+{ \
+ .name = "interval", \
+ .version_id = 1, \
+ .minimum_version_id = 1, \
+ .fields = (VMStateField[]) { \
+ VMSTATE_UINT64(low, VirtIOIOMMUInterval), \
+ VMSTATE_UINT64(high, VirtIOIOMMUInterval), \
+ VMSTATE_END_OF_LIST() \
+ } \
+}
+
+/* VMStateDescription initializer for a VirtIOIOMMUMapping (phys_addr, flags). */
+#define VMSTATE_MAPPING \
+{ \
+ .name = "mapping", \
+ .version_id = 1, \
+ .minimum_version_id = 1, \
+ .fields = (VMStateField[]) { \
+ VMSTATE_UINT64(phys_addr, VirtIOIOMMUMapping),\
+ VMSTATE_UINT32(flags, VirtIOIOMMUMapping), \
+ VMSTATE_END_OF_LIST() \
+ }, \
+}
+
+/*
+ * Value/key description pair handed to VMSTATE_GTREE_V for the mappings
+ * tree of a domain: element 0 describes the value, element 1 the key.
+ */
+static const VMStateDescription vmstate_interval_mapping[2] = {
+ VMSTATE_MAPPING, /* value */
+ VMSTATE_INTERVAL /* key */
+};
+
+/*
+ * pre_load hook of vmstate_domain: give the domain an empty mappings tree
+ * ordered by interval_cmp whose keys and values are both released with
+ * g_free. Always returns 0.
+ */
+static int domain_preload(void *opaque)
+{
+    VirtIOIOMMUDomain *d = opaque;
+    GTree *mappings = g_tree_new_full((GCompareDataFunc)interval_cmp,
+                                      NULL, g_free, g_free);
+
+    d->mappings = mappings;
+    return 0;
+}
+
+/*
+ * Migration description of a VirtIOIOMMUEndpoint: only the id is listed;
+ * the domain back-pointer is restored by reconstruct_endpoints().
+ */
+static const VMStateDescription vmstate_endpoint = {
+ .name = "endpoint",
+ .version_id = 1,
+ .minimum_version_id = 1,
+ .fields = (VMStateField[]) {
+ VMSTATE_UINT32(id, VirtIOIOMMUEndpoint),
+ VMSTATE_END_OF_LIST()
+ }
+};
+
+/*
+ * Migration description of a VirtIOIOMMUDomain: its id, the tree of
+ * interval -> mapping entries and the list of attached endpoints.
+ * domain_preload is installed as pre_load hook and allocates the mappings
+ * tree.
+ */
+static const VMStateDescription vmstate_domain = {
+ .name = "domain",
+ .version_id = 1,
+ .minimum_version_id = 1,
+ .pre_load = domain_preload,
+ .fields = (VMStateField[]) {
+ VMSTATE_UINT32(id, VirtIOIOMMUDomain),
+ VMSTATE_GTREE_V(mappings, VirtIOIOMMUDomain, 1,
+ vmstate_interval_mapping,
+ VirtIOIOMMUInterval, VirtIOIOMMUMapping),
+ VMSTATE_QLIST_V(endpoint_list, VirtIOIOMMUDomain, 1,
+ vmstate_endpoint, VirtIOIOMMUEndpoint, next),
+ VMSTATE_END_OF_LIST()
+ }
+};
+
+/*
+ * g_tree_foreach() callback run on every domain: point each endpoint of the
+ * domain's endpoint list back at the domain and register it in the
+ * endpoints tree of the VirtIOIOMMU passed as @data. @key is unused.
+ */
+static gboolean reconstruct_endpoints(gpointer key, gpointer value,
+                                      gpointer data)
+{
+    VirtIOIOMMU *viommu = data;
+    VirtIOIOMMUDomain *domain = value;
+    VirtIOIOMMUEndpoint *ep;
+
+    QLIST_FOREACH(ep, &domain->endpoint_list, next) {
+        ep->domain = domain;
+        g_tree_insert(viommu->endpoints, GUINT_TO_POINTER(ep->id), ep);
+    }
+
+    return false; /* continue the domain traversal */
+}
+
+/*
+ * post_load hook of vmstate_virtio_iommu_device: walk all domains and
+ * rebuild the endpoints tree from their endpoint lists. Always returns 0.
+ */
+static int iommu_post_load(void *opaque, int version_id)
+{
+ VirtIOIOMMU *s = opaque;
+
+ g_tree_foreach(s->domains, reconstruct_endpoints, s);
+ return 0;
+}
+
+/*
+ * Device-specific migration state (installed as vdc->vmsd): the domains
+ * tree, keyed directly by domain id. The endpoints tree is not listed; it
+ * is rebuilt by iommu_post_load.
+ */
+static const VMStateDescription vmstate_virtio_iommu_device = {
+ .name = "virtio-iommu-device",
+ .minimum_version_id = 1,
+ .version_id = 1,
+ .post_load = iommu_post_load,
+ .fields = (VMStateField[]) {
+ VMSTATE_GTREE_DIRECT_KEY_V(domains, VirtIOIOMMU, 1,
+ &vmstate_domain, VirtIOIOMMUDomain),
+ VMSTATE_END_OF_LIST()
+ },
+};
+
+/*
+ * Top-level migration description (installed as dc->vmsd): the generic
+ * virtio device state, tagged with the MIG_PRI_IOMMU priority.
+ */
+static const VMStateDescription vmstate_virtio_iommu = {
+ .name = "virtio-iommu",
+ .minimum_version_id = 1,
+ .priority = MIG_PRI_IOMMU,
+ .version_id = 1,
+ .fields = (VMStateField[]) {
+ VMSTATE_VIRTIO_DEVICE,
+ VMSTATE_END_OF_LIST()
+ },
+};
+
+/*
+ * Device properties: "primary-bus" links the PCI bus stored in
+ * VirtIOIOMMU.primary_bus, which realize requires to be set.
+ */
+static Property virtio_iommu_properties[] = {
+ DEFINE_PROP_LINK("primary-bus", VirtIOIOMMU, primary_bus, "PCI", PCIBus *),
+ DEFINE_PROP_END_OF_LIST(),
+};
+
+/*
+ * Class initializer of the virtio-iommu device: registers the properties
+ * and migration descriptions and wires up the VirtioDeviceClass callbacks.
+ */
+static void virtio_iommu_class_init(ObjectClass *klass, void *data)
+{
+    VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass);
+    DeviceClass *dc = DEVICE_CLASS(klass);
+
+    /* Generic device class setup. */
+    device_class_set_props(dc, virtio_iommu_properties);
+    dc->vmsd = &vmstate_virtio_iommu;
+    set_bit(DEVICE_CATEGORY_MISC, dc->categories);
+
+    /* Life cycle. */
+    vdc->realize = virtio_iommu_device_realize;
+    vdc->unrealize = virtio_iommu_device_unrealize;
+    vdc->reset = virtio_iommu_device_reset;
+
+    /* Configuration space, features and status. */
+    vdc->get_config = virtio_iommu_get_config;
+    vdc->set_config = virtio_iommu_set_config;
+    vdc->get_features = virtio_iommu_get_features;
+    vdc->set_status = virtio_iommu_set_status;
+
+    /* Device-specific migration state. */
+    vdc->vmsd = &vmstate_virtio_iommu_device;
+}
+
+/*
+ * Class initializer of the virtio-iommu memory region type: installs
+ * virtio_iommu_translate as the translate callback.
+ */
+static void virtio_iommu_memory_region_class_init(ObjectClass *klass,
+ void *data)
+{
+ IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_CLASS(klass);
+
+ imrc->translate = virtio_iommu_translate;
+}
+
+/* QOM type description of the virtio-iommu device (child of TYPE_VIRTIO_DEVICE). */
+static const TypeInfo virtio_iommu_info = {
+ .name = TYPE_VIRTIO_IOMMU,
+ .parent = TYPE_VIRTIO_DEVICE,
+ .instance_size = sizeof(VirtIOIOMMU),
+ .instance_init = virtio_iommu_instance_init,
+ .class_init = virtio_iommu_class_init,
+};
+
+/* QOM type description of the IOMMU memory region used by the device. */
+static const TypeInfo virtio_iommu_memory_region_info = {
+ .parent = TYPE_IOMMU_MEMORY_REGION,
+ .name = TYPE_VIRTIO_IOMMU_MEMORY_REGION,
+ .class_init = virtio_iommu_memory_region_class_init,
+};
+
+/* Register both QOM types defined in this file. */
+static void virtio_register_types(void)
+{
+ type_register_static(&virtio_iommu_info);
+ type_register_static(&virtio_iommu_memory_region_info);
+}
+
+type_init(virtio_register_types)
VirtIOPMEM *pmem = VIRTIO_PMEM(dev);
host_memory_backend_set_mapped(pmem->memdev, false);
+ virtio_delete_queue(pmem->rq_vq);
virtio_cleanup(vdev);
}
/* Called within rcu_read_lock(). */
static VRingMemoryRegionCaches *vring_get_region_caches(struct VirtQueue *vq)
{
- VRingMemoryRegionCaches *caches = atomic_rcu_read(&vq->vring.caches);
- assert(caches != NULL);
- return caches;
+ return atomic_rcu_read(&vq->vring.caches);
}
+
/* Called within rcu_read_lock(). */
static inline uint16_t vring_avail_flags(VirtQueue *vq)
{
VRingMemoryRegionCaches *caches = vring_get_region_caches(vq);
hwaddr pa = offsetof(VRingAvail, flags);
+
+ if (!caches) {
+ return 0;
+ }
+
return virtio_lduw_phys_cached(vq->vdev, &caches->avail, pa);
}
{
VRingMemoryRegionCaches *caches = vring_get_region_caches(vq);
hwaddr pa = offsetof(VRingAvail, idx);
+
+ if (!caches) {
+ return 0;
+ }
+
vq->shadow_avail_idx = virtio_lduw_phys_cached(vq->vdev, &caches->avail, pa);
return vq->shadow_avail_idx;
}
{
VRingMemoryRegionCaches *caches = vring_get_region_caches(vq);
hwaddr pa = offsetof(VRingAvail, ring[i]);
+
+ if (!caches) {
+ return 0;
+ }
+
return virtio_lduw_phys_cached(vq->vdev, &caches->avail, pa);
}
{
VRingMemoryRegionCaches *caches = vring_get_region_caches(vq);
hwaddr pa = offsetof(VRingUsed, ring[i]);
+
+ if (!caches) {
+ return;
+ }
+
virtio_tswap32s(vq->vdev, &uelem->id);
virtio_tswap32s(vq->vdev, &uelem->len);
address_space_write_cached(&caches->used, pa, uelem, sizeof(VRingUsedElem));
{
VRingMemoryRegionCaches *caches = vring_get_region_caches(vq);
hwaddr pa = offsetof(VRingUsed, idx);
+
+ if (!caches) {
+ return 0;
+ }
+
return virtio_lduw_phys_cached(vq->vdev, &caches->used, pa);
}
{
VRingMemoryRegionCaches *caches = vring_get_region_caches(vq);
hwaddr pa = offsetof(VRingUsed, idx);
- virtio_stw_phys_cached(vq->vdev, &caches->used, pa, val);
- address_space_cache_invalidate(&caches->used, pa, sizeof(val));
+
+ if (caches) {
+ virtio_stw_phys_cached(vq->vdev, &caches->used, pa, val);
+ address_space_cache_invalidate(&caches->used, pa, sizeof(val));
+ }
+
vq->used_idx = val;
}
VRingMemoryRegionCaches *caches = vring_get_region_caches(vq);
VirtIODevice *vdev = vq->vdev;
hwaddr pa = offsetof(VRingUsed, flags);
- uint16_t flags = virtio_lduw_phys_cached(vq->vdev, &caches->used, pa);
+ uint16_t flags;
+ if (!caches) {
+ return;
+ }
+
+ flags = virtio_lduw_phys_cached(vq->vdev, &caches->used, pa);
virtio_stw_phys_cached(vdev, &caches->used, pa, flags | mask);
address_space_cache_invalidate(&caches->used, pa, sizeof(flags));
}
VRingMemoryRegionCaches *caches = vring_get_region_caches(vq);
VirtIODevice *vdev = vq->vdev;
hwaddr pa = offsetof(VRingUsed, flags);
- uint16_t flags = virtio_lduw_phys_cached(vq->vdev, &caches->used, pa);
+ uint16_t flags;
+ if (!caches) {
+ return;
+ }
+
+ flags = virtio_lduw_phys_cached(vq->vdev, &caches->used, pa);
virtio_stw_phys_cached(vdev, &caches->used, pa, flags & ~mask);
address_space_cache_invalidate(&caches->used, pa, sizeof(flags));
}
}
caches = vring_get_region_caches(vq);
+ if (!caches) {
+ return;
+ }
+
pa = offsetof(VRingUsed, ring[vq->vring.num]);
virtio_stw_phys_cached(vq->vdev, &caches->used, pa, val);
address_space_cache_invalidate(&caches->used, pa, sizeof(val));
VRingMemoryRegionCaches *caches;
RCU_READ_LOCK_GUARD();
- caches = vring_get_region_caches(vq);
+ caches = vring_get_region_caches(vq);
+ if (!caches) {
+ return;
+ }
+
vring_packed_event_read(vq->vdev, &caches->used, &e);
if (!enable) {
}
cache = vring_get_region_caches(vq);
+ if (!cache) {
+ return 1;
+ }
+
vring_packed_desc_read_flags(vq->vdev, &desc.flags, &cache->desc,
vq->last_avail_idx);
}
caches = vring_get_region_caches(vq);
+ if (!caches) {
+ return;
+ }
+
vring_packed_desc_write(vq->vdev, &desc, &caches->desc, head, strict_order);
}
max = vq->vring.num;
caches = vring_get_region_caches(vq);
+ if (!caches) {
+ goto err;
+ }
+
while ((rc = virtqueue_num_heads(vq, idx)) > 0) {
MemoryRegionCache *desc_cache = &caches->desc;
unsigned int num_bufs;
max = vq->vring.num;
caches = vring_get_region_caches(vq);
+ if (!caches) {
+ goto err;
+ }
for (;;) {
unsigned int num_bufs = total_bufs;
}
caches = vring_get_region_caches(vq);
+ if (!caches) {
+ goto err;
+ }
+
desc_size = virtio_vdev_has_feature(vq->vdev, VIRTIO_F_RING_PACKED) ?
sizeof(VRingPackedDesc) : sizeof(VRingDesc);
if (caches->desc.len < vq->vring.num * desc_size) {
i = head;
caches = vring_get_region_caches(vq);
+ if (!caches) {
+ virtio_error(vdev, "Region caches not initialized");
+ goto done;
+ }
+
if (caches->desc.len < max * sizeof(VRingDesc)) {
virtio_error(vdev, "Cannot map descriptor ring");
goto done;
i = vq->last_avail_idx;
caches = vring_get_region_caches(vq);
+ if (!caches) {
+ virtio_error(vdev, "Region caches not initialized");
+ goto done;
+ }
+
if (caches->desc.len < max * sizeof(VRingDesc)) {
virtio_error(vdev, "Cannot map descriptor ring");
goto done;
VRingPackedDesc desc;
caches = vring_get_region_caches(vq);
+ if (!caches) {
+ return 0;
+ }
+
desc_cache = &caches->desc;
virtio_queue_set_notification(vq, 0);
VRingMemoryRegionCaches *caches;
caches = vring_get_region_caches(vq);
+ if (!caches) {
+ return false;
+ }
+
vring_packed_event_read(vdev, &caches->avail, &e);
old = vq->signalled_used;
}
}
+/*
+ * Move the fd handlers of an event channel to the AioContext @ctx.
+ *
+ * If the channel already has a context, its handlers are cleared there
+ * first; then @ctx is recorded in the channel and xen_device_event /
+ * xen_device_poll are installed on it. Sets @errp and returns without
+ * doing anything when @channel is NULL. @xendev is not used.
+ */
+void xen_device_set_event_channel_context(XenDevice *xendev,
+                                          XenEventChannel *channel,
+                                          AioContext *ctx,
+                                          Error **errp)
+{
+    if (channel == NULL) {
+        error_setg(errp, "bad channel");
+        return;
+    }
+
+    /* Unhook from the context that served the channel so far, if any. */
+    if (channel->ctx != NULL) {
+        aio_set_fd_handler(channel->ctx, xenevtchn_fd(channel->xeh), true,
+                           NULL, NULL, NULL, NULL);
+    }
+
+    channel->ctx = ctx;
+    aio_set_fd_handler(channel->ctx, xenevtchn_fd(channel->xeh), true,
+                       xen_device_event, NULL, xen_device_poll, channel);
+}
+
XenEventChannel *xen_device_bind_event_channel(XenDevice *xendev,
- AioContext *ctx,
unsigned int port,
XenEventHandler handler,
void *opaque, Error **errp)
channel->handler = handler;
channel->opaque = opaque;
- channel->ctx = ctx;
- aio_set_fd_handler(channel->ctx, xenevtchn_fd(channel->xeh), true,
- xen_device_event, NULL, xen_device_poll, channel);
+ /* Only reason for failure is a NULL channel */
+ xen_device_set_event_channel_context(xendev, channel,
+ qemu_get_aio_context(),
+ &error_abort);
QLIST_INSERT_HEAD(&xendev->event_channels, channel, list);
*/
#include "qemu/osdep.h"
#include "qapi/error.h"
-#include "hw/i386/pc.h"
#include "qemu/error-report.h"
-#include "ui/console.h"
#include "hw/loader.h"
-#include "monitor/monitor.h"
-#include "qemu/range.h"
#include "hw/pci/pci.h"
#include "xen_pt.h"
bool virt;
int32_t gic_version;
VirtIOMMUType iommu;
+ uint16_t virtio_iommu_bdf;
struct arm_boot_info bootinfo;
MemMapEntry *memmap;
+ char *pciehb_nodename;
const int *irqmap;
int smp_cpus;
void *fdt;
#define PCI_DEVICE_ID_VIRTIO_9P 0x1009
#define PCI_DEVICE_ID_VIRTIO_VSOCK 0x1012
#define PCI_DEVICE_ID_VIRTIO_PMEM 0x1013
+#define PCI_DEVICE_ID_VIRTIO_IOMMU 0x1014
#define PCI_VENDOR_ID_REDHAT 0x1b36
#define PCI_DEVICE_ID_REDHAT_BRIDGE 0x0001
struct vhost_dev dev;
struct vhost_inflight *inflight;
VhostUserState vhost_user;
- struct vhost_virtqueue *vqs;
+ struct vhost_virtqueue *vhost_vqs;
+ VirtQueue **virtqs;
guint watch;
bool connected;
} VHostUserBlk;
struct vhost_virtqueue *vhost_vqs;
struct vhost_dev vhost_dev;
VhostUserState vhost_user;
+ VirtQueue **req_vqs;
+ VirtQueue *hiprio_vq;
/*< public >*/
} VHostUserFS;
--- /dev/null
+/*
+ * virtio-iommu device
+ *
+ * Copyright (c) 2020 Red Hat, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2 or later, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#ifndef QEMU_VIRTIO_IOMMU_H
+#define QEMU_VIRTIO_IOMMU_H
+
+#include "standard-headers/linux/virtio_iommu.h"
+#include "hw/virtio/virtio.h"
+#include "hw/pci/pci.h"
+
+#define TYPE_VIRTIO_IOMMU "virtio-iommu-device"
+#define TYPE_VIRTIO_IOMMU_PCI "virtio-iommu-device-base"
+#define VIRTIO_IOMMU(obj) \
+ OBJECT_CHECK(VirtIOIOMMU, (obj), TYPE_VIRTIO_IOMMU)
+
+#define TYPE_VIRTIO_IOMMU_MEMORY_REGION "virtio-iommu-memory-region"
+
+/*
+ * State kept for one device function (bus + devfn) behind the IOMMU: the
+ * IOMMU memory region and the address space associated with it.
+ */
+typedef struct IOMMUDevice {
+ void *viommu; /* read back as a VirtIOIOMMU * by the translate callback */
+ PCIBus *bus;
+ int devfn;
+ IOMMUMemoryRegion iommu_mr;
+ AddressSpace as;
+} IOMMUDevice;
+
+/*
+ * Per-PCI-bus container of IOMMUDevice pointers.
+ *
+ * pbdev is a C99 flexible array member (rather than the GNU zero-length
+ * array extension), so the struct has to be allocated with room for the
+ * wanted number of entries behind it.
+ */
+typedef struct IOMMUPciBus {
+    PCIBus *bus;
+    IOMMUDevice *pbdev[]; /* Parent array is sparse, so dynamically alloc */
+} IOMMUPciBus;
+
+/* Instance state of the virtio-iommu device. */
+typedef struct VirtIOIOMMU {
+ VirtIODevice parent_obj;
+ VirtQueue *req_vq; /* request queue */
+ VirtQueue *event_vq; /* event queue */
+ struct virtio_iommu_config config; /* device configuration space */
+ uint64_t features; /* device feature bits */
+ GHashTable *as_by_busptr;
+ IOMMUPciBus *iommu_pcibus_by_bus_num[PCI_BUS_MAX];
+ PCIBus *primary_bus; /* bus the IOMMU is attached to */
+ GTree *domains;
+ QemuMutex mutex;
+ GTree *endpoints;
+} VirtIOIOMMU;
+
+#endif
typedef bool (*XenEventHandler)(void *opaque);
XenEventChannel *xen_device_bind_event_channel(XenDevice *xendev,
- AioContext *ctx,
unsigned int port,
XenEventHandler handler,
void *opaque, Error **errp);
+void xen_device_set_event_channel_context(XenDevice *xendev,
+ XenEventChannel *channel,
+ AioContext *ctx,
+ Error **errp);
void xen_device_notify_event_channel(XenDevice *xendev,
XenEventChannel *channel,
Error **errp);
return (value >> start) & (~0U >> (32 - length));
}
+/**
+ * extract8:
+ * @value: the value to extract the bit field from
+ * @start: the lowest bit in the bit field (numbered from 0)
+ * @length: the length of the bit field
+ *
+ * Extract from the 8 bit input @value the bit field specified by the
+ * @start and @length parameters, and return it. The bit field must
+ * lie entirely within the 8 bit word. It is valid to request that
+ * all 8 bits are returned (ie @length 8 and @start 0).
+ *
+ * Returns: the value of the bit field extracted from the input value.
+ */
+static inline uint8_t extract8(uint8_t value, int start, int length)
+{
+ /* Reject fields reaching beyond bit 7 before delegating to extract32(). */
+ assert(start >= 0 && length > 0 && length <= 8 - start);
+ return extract32(value, start, length);
+}
+
+/**
+ * extract16:
+ * @value: the value to extract the bit field from
+ * @start: the lowest bit in the bit field (numbered from 0)
+ * @length: the length of the bit field
+ *
+ * Extract from the 16 bit input @value the bit field specified by the
+ * @start and @length parameters, and return it. The bit field must
+ * lie entirely within the 16 bit word. It is valid to request that
+ * all 16 bits are returned (ie @length 16 and @start 0).
+ *
+ * Returns: the value of the bit field extracted from the input value.
+ */
+static inline uint16_t extract16(uint16_t value, int start, int length)
+{
+ /* Reject fields reaching beyond bit 15 before delegating to extract32(). */
+ assert(start >= 0 && length > 0 && length <= 16 - start);
+ return extract32(value, start, length);
+}
+
/**
* extract64:
* @value: the value to extract the bit field from
#define I915_FORMAT_MOD_Y_TILED_CCS fourcc_mod_code(INTEL, 4)
#define I915_FORMAT_MOD_Yf_TILED_CCS fourcc_mod_code(INTEL, 5)
+/*
+ * Intel color control surfaces (CCS) for Gen-12 render compression.
+ *
+ * The main surface is Y-tiled and at plane index 0, the CCS is linear and
+ * at index 1. A 64B CCS cache line corresponds to an area of 4x1 tiles in
+ * main surface. In other words, 4 bits in CCS map to a main surface cache
+ * line pair. The main surface pitch is required to be a multiple of four
+ * Y-tile widths.
+ */
+#define I915_FORMAT_MOD_Y_TILED_GEN12_RC_CCS fourcc_mod_code(INTEL, 6)
+
+/*
+ * Intel color control surfaces (CCS) for Gen-12 media compression
+ *
+ * The main surface is Y-tiled and at plane index 0, the CCS is linear and
+ * at index 1. A 64B CCS cache line corresponds to an area of 4x1 tiles in
+ * main surface. In other words, 4 bits in CCS map to a main surface cache
+ * line pair. The main surface pitch is required to be a multiple of four
+ * Y-tile widths. For semi-planar formats like NV12, CCS planes follow the
+ * Y and UV planes i.e., planes 0 and 1 are used for Y and UV surfaces,
+ * planes 2 and 3 for the respective CCS.
+ */
+#define I915_FORMAT_MOD_Y_TILED_GEN12_MC_CCS fourcc_mod_code(INTEL, 7)
+
/*
* Tiled, NV12MT, grouped in 64 (pixels) x 32 (lines) -sized macroblocks
*
* @ETH_SS_RSS_HASH_FUNCS: RSS hush function names
* @ETH_SS_PHY_STATS: Statistic names, for use with %ETHTOOL_GPHYSTATS
* @ETH_SS_PHY_TUNABLES: PHY tunable names
+ * @ETH_SS_LINK_MODES: link mode names
+ * @ETH_SS_MSG_CLASSES: debug message class names
+ * @ETH_SS_WOL_MODES: wake-on-lan modes
*/
enum ethtool_stringset {
ETH_SS_TEST = 0,
ETH_SS_TUNABLES,
ETH_SS_PHY_STATS,
ETH_SS_PHY_TUNABLES,
+ ETH_SS_LINK_MODES,
+ ETH_SS_MSG_CLASSES,
+ ETH_SS_WOL_MODES,
+
+ /* add new constants above here */
+ ETH_SS_COUNT
};
/**
#define WAKE_MAGICSECURE (1 << 6) /* only meaningful if WAKE_MAGIC */
#define WAKE_FILTER (1 << 7)
+#define WOL_MODE_COUNT 8
+
/* L2-L4 network traffic flow types */
#define TCP_V4_FLOW 0x01 /* hash or spec (tcp_ip4_spec) */
#define UDP_V4_FLOW 0x02 /* hash or spec (udp_ip4_spec) */
unsigned long __sec;
#if defined(__sparc__) && defined(__arch64__)
unsigned int __usec;
+ unsigned int __pad;
#else
unsigned long __usec;
#endif
#define PCI_EXP_LNKCTL2_TLS_32_0GT 0x0005 /* Supported Speed 32GT/s */
#define PCI_EXP_LNKCTL2_ENTER_COMP 0x0010 /* Enter Compliance */
#define PCI_EXP_LNKCTL2_TX_MARGIN 0x0380 /* Transmit Margin */
+#define PCI_EXP_LNKCTL2_HASD 0x0020 /* HW Autonomous Speed Disable */
#define PCI_EXP_LNKSTA2 50 /* Link Status 2 */
#define PCI_CAP_EXP_ENDPOINT_SIZEOF_V2 52 /* v2 endpoints with link end here */
#define PCI_EXP_SLTCAP2 52 /* Slot Capabilities 2 */
#define __NR_fspick (__NR_SYSCALL_BASE + 433)
#define __NR_pidfd_open (__NR_SYSCALL_BASE + 434)
#define __NR_clone3 (__NR_SYSCALL_BASE + 435)
+#define __NR_openat2 (__NR_SYSCALL_BASE + 437)
+#define __NR_pidfd_getfd (__NR_SYSCALL_BASE + 438)
#endif /* _ASM_ARM_UNISTD_COMMON_H */
#define KVM_REG_ARM_PTIMER_CVAL ARM64_SYS_REG(3, 3, 14, 2, 2)
#define KVM_REG_ARM_PTIMER_CNT ARM64_SYS_REG(3, 3, 14, 0, 1)
-/* EL0 Virtual Timer Registers */
+/*
+ * EL0 Virtual Timer Registers
+ *
+ * WARNING:
+ * KVM_REG_ARM_TIMER_CVAL and KVM_REG_ARM_TIMER_CNT are not defined
+ * with the appropriate register encodings. Their values have been
+ * accidentally swapped. As this is set API, the definitions here
+ * must be used, rather than ones derived from the encodings.
+ */
#define KVM_REG_ARM_TIMER_CTL ARM64_SYS_REG(3, 3, 14, 3, 1)
-#define KVM_REG_ARM_TIMER_CNT ARM64_SYS_REG(3, 3, 14, 3, 2)
#define KVM_REG_ARM_TIMER_CVAL ARM64_SYS_REG(3, 3, 14, 0, 2)
+#define KVM_REG_ARM_TIMER_CNT ARM64_SYS_REG(3, 3, 14, 3, 2)
/* KVM-as-firmware specific pseudo-registers */
#define KVM_REG_ARM_FW (0x0014 << KVM_REG_ARM_COPROC_SHIFT)
#define __ARCH_WANT_NEW_STAT
#define __ARCH_WANT_SET_GET_RLIMIT
#define __ARCH_WANT_TIME32_SYSCALLS
+#define __ARCH_WANT_SYS_CLONE3
#include <asm-generic/unistd.h>
#define PROT_WRITE 0x2 /* page can be written */
#define PROT_EXEC 0x4 /* page can be executed */
#define PROT_SEM 0x8 /* page may be used for atomic ops */
+/* 0x10 reserved for arch-specific use */
+/* 0x20 reserved for arch-specific use */
#define PROT_NONE 0x0 /* page can not be accessed */
#define PROT_GROWSDOWN 0x01000000 /* mprotect flag: extend change to start of growsdown vma */
#define PROT_GROWSUP 0x02000000 /* mprotect flag: extend change to end of growsup vma */
__SYSCALL(__NR_clone3, sys_clone3)
#endif
+#define __NR_openat2 437
+__SYSCALL(__NR_openat2, sys_openat2)
+#define __NR_pidfd_getfd 438
+__SYSCALL(__NR_pidfd_getfd, sys_pidfd_getfd)
+
#undef __NR_syscalls
-#define __NR_syscalls 436
+#define __NR_syscalls 439
/*
* 32 bit systems traditionally used different
#define __NR_fspick (__NR_Linux + 433)
#define __NR_pidfd_open (__NR_Linux + 434)
#define __NR_clone3 (__NR_Linux + 435)
+#define __NR_openat2 (__NR_Linux + 437)
+#define __NR_pidfd_getfd (__NR_Linux + 438)
#endif /* _ASM_MIPS_UNISTD_N32_H */
#define __NR_fspick (__NR_Linux + 433)
#define __NR_pidfd_open (__NR_Linux + 434)
#define __NR_clone3 (__NR_Linux + 435)
+#define __NR_openat2 (__NR_Linux + 437)
+#define __NR_pidfd_getfd (__NR_Linux + 438)
#endif /* _ASM_MIPS_UNISTD_N64_H */
#define __NR_fspick (__NR_Linux + 433)
#define __NR_pidfd_open (__NR_Linux + 434)
#define __NR_clone3 (__NR_Linux + 435)
+#define __NR_openat2 (__NR_Linux + 437)
+#define __NR_pidfd_getfd (__NR_Linux + 438)
#endif /* _ASM_MIPS_UNISTD_O32_H */
#define __NR_fspick 433
#define __NR_pidfd_open 434
#define __NR_clone3 435
+#define __NR_openat2 437
+#define __NR_pidfd_getfd 438
#endif /* _ASM_POWERPC_UNISTD_32_H */
#define __NR_fspick 433
#define __NR_pidfd_open 434
#define __NR_clone3 435
+#define __NR_openat2 437
+#define __NR_pidfd_getfd 438
#endif /* _ASM_POWERPC_UNISTD_64_H */
#define __NR_fspick 433
#define __NR_pidfd_open 434
#define __NR_clone3 435
+#define __NR_openat2 437
+#define __NR_pidfd_getfd 438
#endif /* _ASM_S390_UNISTD_32_H */
#define __NR_fspick 433
#define __NR_pidfd_open 434
#define __NR_clone3 435
+#define __NR_openat2 437
+#define __NR_pidfd_getfd 438
#endif /* _ASM_S390_UNISTD_64_H */
#define __NR_fspick 433
#define __NR_pidfd_open 434
#define __NR_clone3 435
+#define __NR_openat2 437
+#define __NR_pidfd_getfd 438
#endif /* _ASM_X86_UNISTD_32_H */
#define __NR_fspick 433
#define __NR_pidfd_open 434
#define __NR_clone3 435
+#define __NR_openat2 437
+#define __NR_pidfd_getfd 438
#endif /* _ASM_X86_UNISTD_64_H */
#define __NR_fspick (__X32_SYSCALL_BIT + 433)
#define __NR_pidfd_open (__X32_SYSCALL_BIT + 434)
#define __NR_clone3 (__X32_SYSCALL_BIT + 435)
+#define __NR_openat2 (__X32_SYSCALL_BIT + 437)
+#define __NR_pidfd_getfd (__X32_SYSCALL_BIT + 438)
#define __NR_rt_sigaction (__X32_SYSCALL_BIT + 512)
#define __NR_rt_sigreturn (__X32_SYSCALL_BIT + 513)
#define __NR_ioctl (__X32_SYSCALL_BIT + 514)
#define KVM_CAP_PPC_GUEST_DEBUG_SSTEP 176
#define KVM_CAP_ARM_NISV_TO_USER 177
#define KVM_CAP_ARM_INJECT_EXT_DABT 178
+#define KVM_CAP_S390_VCPU_RESETS 179
#ifdef KVM_CAP_IRQ_ROUTING
/* Available with KVM_CAP_ARM_SVE */
#define KVM_ARM_VCPU_FINALIZE _IOW(KVMIO, 0xc2, int)
+/* Available with KVM_CAP_S390_VCPU_RESETS */
+#define KVM_S390_NORMAL_RESET _IO(KVMIO, 0xc3)
+#define KVM_S390_CLEAR_RESET _IO(KVMIO, 0xc4)
+
/* Secure Encrypted Virtualization command */
enum sev_cmd_id {
/* Guest initialization commands */
!client->export_meta.bitmap,
NBD_META_ID_BASE_ALLOCATION,
errp);
- } else { /* client->export_meta.bitmap */
+ if (ret < 0) {
+ return ret;
+ }
+ }
+
+ if (client->export_meta.bitmap) {
ret = nbd_co_send_bitmap(client, request->handle,
client->exp->export_bitmap,
request->from, request->len,
dont_fragment,
true, NBD_META_ID_DIRTY_BITMAP, errp);
+ if (ret < 0) {
+ return ret;
+ }
}
- return ret;
+ return 0;
} else {
return nbd_send_generic_reply(client, request->handle, -EINVAL,
"CMD_BLOCK_STATUS not negotiated",
switch (flags) {
case QEMU_PLUGIN_CB_RW_REGS:
ret = 0;
+ break;
case QEMU_PLUGIN_CB_R_REGS:
ret = TCG_CALL_NO_WG;
break;
{ "virtio-input-host-ccw", "virtio-input-host", QEMU_ARCH_S390X },
{ "virtio-input-host-pci", "virtio-input-host",
QEMU_ARCH_ALL & ~QEMU_ARCH_S390X },
+ { "virtio-iommu-pci", "virtio-iommu", QEMU_ARCH_ALL & ~QEMU_ARCH_S390X },
{ "virtio-keyboard-ccw", "virtio-keyboard", QEMU_ARCH_S390X },
{ "virtio-keyboard-pci", "virtio-keyboard",
QEMU_ARCH_ALL & ~QEMU_ARCH_S390X },
}
-static void set_memory_options(uint64_t *ram_slots, ram_addr_t *maxram_size,
+static bool set_memory_options(uint64_t *ram_slots, ram_addr_t *maxram_size,
MachineClass *mc)
{
uint64_t sz;
exit(EXIT_FAILURE);
}
- if (current_machine->ram_memdev_id) {
- Object *backend;
- ram_addr_t backend_size;
-
- backend = object_resolve_path_type(current_machine->ram_memdev_id,
- TYPE_MEMORY_BACKEND, NULL);
- backend_size = object_property_get_uint(backend, "size", &error_abort);
- if (mem_str && backend_size != ram_size) {
- error_report("Size specified by -m option must match size of "
- "explicitly specified 'memory-backend' property");
- exit(EXIT_FAILURE);
- }
- ram_size = backend_size;
- }
-
- if (!xen_enabled()) {
- /* On 32-bit hosts, QEMU is limited by virtual address space */
- if (ram_size > (2047 << 20) && HOST_LONG_BITS == 32) {
- error_report("at most 2047 MB RAM can be simulated");
- exit(1);
- }
- }
-
loc_pop(&loc);
+ return !!mem_str;
}
static int global_init_func(void *opaque, QemuOpts *opts, Error **errp)
bool list_data_dirs = false;
char *dir, **dirs;
const char *mem_path = NULL;
+ bool have_custom_ram_size;
BlockdevOptionsQueue bdo_queue = QSIMPLEQ_HEAD_INITIALIZER(bdo_queue);
QemuPluginList plugin_list = QTAILQ_HEAD_INITIALIZER(plugin_list);
int mem_prealloc = 0; /* force preallocation of physical target memory */
machine_class = select_machine();
object_set_machine_compat_props(machine_class->compat_props);
+ have_custom_ram_size = set_memory_options(&ram_slots, &maxram_size,
+ machine_class);
+
os_daemonize();
/*
current_machine->cpu_type = parse_cpu_option(cpu_option);
}
- set_memory_options(&ram_slots, &maxram_size, machine_class);
+ if (current_machine->ram_memdev_id) {
+ Object *backend;
+ ram_addr_t backend_size;
+
+ backend = object_resolve_path_type(current_machine->ram_memdev_id,
+ TYPE_MEMORY_BACKEND, NULL);
+ backend_size = object_property_get_uint(backend, "size", &error_abort);
+ if (have_custom_ram_size && backend_size != ram_size) {
+ error_report("Size specified by -m option must match size of "
+ "explicitly specified 'memory-backend' property");
+ exit(EXIT_FAILURE);
+ }
+ ram_size = backend_size;
+ }
+
+ if (!xen_enabled()) {
+ /* On 32-bit hosts, QEMU is limited by virtual address space */
+ if (ram_size > (2047 << 20) && HOST_LONG_BITS == 32) {
+ error_report("at most 2047 MB RAM can be simulated");
+ exit(1);
+ }
+ }
+
current_machine->ram_size = ram_size;
current_machine->maxram_size = maxram_size;
current_machine->ram_slots = ram_slots;
#define GET_C_LW_IMM(inst) ((extract32(inst, 6, 1) << 2) \
| (extract32(inst, 10, 3) << 3) \
| (extract32(inst, 5, 1) << 6))
-#define GET_C_LD_IMM(inst) ((extract32(inst, 10, 3) << 3) \
- | (extract32(inst, 5, 2) << 6))
+#define GET_C_LD_IMM(inst) ((extract16(inst, 10, 3) << 3) \
+ | (extract16(inst, 5, 2) << 6))
#define GET_C_J_IMM(inst) ((extract32(inst, 3, 3) << 1) \
| (extract32(inst, 11, 1) << 4) \
| (extract32(inst, 2, 1) << 5) \
#define GET_C_RD(inst) GET_RD(inst)
#define GET_C_RS1(inst) GET_RD(inst)
#define GET_C_RS2(inst) extract32(inst, 2, 5)
-#define GET_C_RS1S(inst) (8 + extract32(inst, 7, 3))
-#define GET_C_RS2S(inst) (8 + extract32(inst, 2, 3))
+#define GET_C_RS1S(inst) (8 + extract16(inst, 7, 3))
+#define GET_C_RS2S(inst) (8 + extract16(inst, 2, 3))
#endif
/* pc_succ_insn points to the instruction following base.pc_next */
target_ulong pc_succ_insn;
target_ulong priv_ver;
- uint32_t opcode;
uint32_t mstatus_fs;
uint32_t misa;
uint32_t mem_idx;
tcg_temp_free_i32(t0);
}
-static void decode_RV32_64C0(DisasContext *ctx)
+static void decode_RV32_64C0(DisasContext *ctx, uint16_t opcode)
{
- uint8_t funct3 = extract32(ctx->opcode, 13, 3);
- uint8_t rd_rs2 = GET_C_RS2S(ctx->opcode);
- uint8_t rs1s = GET_C_RS1S(ctx->opcode);
+ uint8_t funct3 = extract16(opcode, 13, 3);
+ uint8_t rd_rs2 = GET_C_RS2S(opcode);
+ uint8_t rs1s = GET_C_RS1S(opcode);
switch (funct3) {
case 3:
#if defined(TARGET_RISCV64)
/* C.LD(RV64/128) -> ld rd', offset[7:3](rs1')*/
gen_load_c(ctx, OPC_RISC_LD, rd_rs2, rs1s,
- GET_C_LD_IMM(ctx->opcode));
+ GET_C_LD_IMM(opcode));
#else
/* C.FLW (RV32) -> flw rd', offset[6:2](rs1')*/
gen_fp_load(ctx, OPC_RISC_FLW, rd_rs2, rs1s,
- GET_C_LW_IMM(ctx->opcode));
+ GET_C_LW_IMM(opcode));
#endif
break;
case 7:
#if defined(TARGET_RISCV64)
/* C.SD (RV64/128) -> sd rs2', offset[7:3](rs1')*/
gen_store_c(ctx, OPC_RISC_SD, rs1s, rd_rs2,
- GET_C_LD_IMM(ctx->opcode));
+ GET_C_LD_IMM(opcode));
#else
/* C.FSW (RV32) -> fsw rs2', offset[6:2](rs1')*/
gen_fp_store(ctx, OPC_RISC_FSW, rs1s, rd_rs2,
- GET_C_LW_IMM(ctx->opcode));
+ GET_C_LW_IMM(opcode));
#endif
break;
}
}
-static void decode_RV32_64C(DisasContext *ctx)
+static void decode_RV32_64C(DisasContext *ctx, uint16_t opcode)
{
- uint8_t op = extract32(ctx->opcode, 0, 2);
+ uint8_t op = extract16(opcode, 0, 2);
switch (op) {
case 0:
- decode_RV32_64C0(ctx);
+ decode_RV32_64C0(ctx, opcode);
break;
}
}
/* Include the auto-generated decoder for 16 bit insn */
#include "decode_insn16.inc.c"
-static void decode_opc(DisasContext *ctx)
+static void decode_opc(CPURISCVState *env, DisasContext *ctx, uint16_t opcode)
{
/* check for compressed insn */
- if (extract32(ctx->opcode, 0, 2) != 3) {
+ if (extract16(opcode, 0, 2) != 3) {
if (!has_ext(ctx, RVC)) {
gen_exception_illegal(ctx);
} else {
ctx->pc_succ_insn = ctx->base.pc_next + 2;
- if (!decode_insn16(ctx, ctx->opcode)) {
+ if (!decode_insn16(ctx, opcode)) {
/* fall back to old decoder */
- decode_RV32_64C(ctx);
+ decode_RV32_64C(ctx, opcode);
}
}
} else {
+ uint32_t opcode32 = opcode;
+ opcode32 = deposit32(opcode32, 16, 16,
+ translator_lduw(env, ctx->base.pc_next + 2));
ctx->pc_succ_insn = ctx->base.pc_next + 4;
- if (!decode_insn32(ctx, ctx->opcode)) {
+ if (!decode_insn32(ctx, opcode32)) {
gen_exception_illegal(ctx);
}
}
{
DisasContext *ctx = container_of(dcbase, DisasContext, base);
CPURISCVState *env = cpu->env_ptr;
+ uint16_t opcode16 = translator_lduw(env, ctx->base.pc_next);
- ctx->opcode = translator_ldl(env, ctx->base.pc_next);
- decode_opc(ctx);
+ decode_opc(env, ctx, opcode16);
ctx->base.pc_next = ctx->pc_succ_insn;
if (ctx->base.is_jmp == DISAS_NEXT) {
S390CPU *cpu = S390_CPU(s);
uint64_t spsw = ldq_phys(s->as, 0);
- cpu->env.psw.mask = spsw & 0xffffffff80000000ULL;
+ cpu->env.psw.mask = spsw & PSW_MASK_SHORT_CTRL;
/*
* Invert short psw indication, so SIE will report a specification
* exception if it was not set.
*/
cpu->env.psw.mask ^= PSW_MASK_SHORTPSW;
- cpu->env.psw.addr = spsw & 0x7fffffffULL;
+ cpu->env.psw.addr = spsw & PSW_MASK_SHORT_ADDR;
s390_cpu_set_state(S390_CPU_STATE_OPERATING, cpu);
}
}
/* Reset state inside the kernel that we cannot access yet from QEMU. */
- if (kvm_enabled() && type != S390_CPU_RESET_NORMAL) {
- kvm_s390_reset_vcpu(cpu);
+ if (kvm_enabled()) {
+ switch (type) {
+ case S390_CPU_RESET_CLEAR:
+ kvm_s390_reset_vcpu_clear(cpu);
+ break;
+ case S390_CPU_RESET_INITIAL:
+ kvm_s390_reset_vcpu_initial(cpu);
+ break;
+ case S390_CPU_RESET_NORMAL:
+ kvm_s390_reset_vcpu_normal(cpu);
+ break;
+ }
}
}
#define PSW_MASK_RI 0x0000008000000000ULL
#define PSW_MASK_64 0x0000000100000000ULL
#define PSW_MASK_32 0x0000000080000000ULL
-#define PSW_MASK_ESA_ADDR 0x000000007fffffffULL
+#define PSW_MASK_SHORT_ADDR 0x000000007fffffffULL
+#define PSW_MASK_SHORT_CTRL 0xffffffff80000000ULL
#undef PSW_ASC_PRIMARY
#undef PSW_ASC_ACCREG
static inline bool is_special_wait_psw(uint64_t psw_addr)
{
/* signal quiesce */
- return psw_addr == 0xfffUL;
+ return (psw_addr & 0xfffUL) == 0xfffUL;
}
void s390_handle_wait(S390CPU *cpu)
{
}
-void kvm_s390_reset_vcpu(S390CPU *cpu)
+void kvm_s390_reset_vcpu_initial(S390CPU *cpu)
+{
+}
+
+void kvm_s390_reset_vcpu_clear(S390CPU *cpu)
+{
+}
+
+void kvm_s390_reset_vcpu_normal(S390CPU *cpu)
{
}
static int cap_ri;
static int cap_gs;
static int cap_hpage_1m;
+static int cap_vcpu_resets;
static int active_cmma;
cap_async_pf = kvm_check_extension(s, KVM_CAP_ASYNC_PF);
cap_mem_op = kvm_check_extension(s, KVM_CAP_S390_MEM_OP);
cap_s390_irq = kvm_check_extension(s, KVM_CAP_S390_INJECT_IRQ);
+ cap_vcpu_resets = kvm_check_extension(s, KVM_CAP_S390_VCPU_RESETS);
if (!kvm_check_extension(s, KVM_CAP_S390_GMAP)
|| !kvm_check_extension(s, KVM_CAP_S390_COW)) {
return 0;
}
-void kvm_s390_reset_vcpu(S390CPU *cpu)
+static void kvm_s390_reset_vcpu(S390CPU *cpu, unsigned long type)
{
CPUState *cs = CPU(cpu);
- /* The initial reset call is needed here to reset in-kernel
- * vcpu data that we can't access directly from QEMU
- * (i.e. with older kernels which don't support sync_regs/ONE_REG).
- * Before this ioctl cpu_synchronize_state() is called in common kvm
- * code (kvm-all) */
- if (kvm_vcpu_ioctl(cs, KVM_S390_INITIAL_RESET, NULL)) {
- error_report("Initial CPU reset failed on CPU %i", cs->cpu_index);
+ /*
+ * The reset call is needed here to reset in-kernel vcpu data that
+ * we can't access directly from QEMU (i.e. with older kernels
+ * which don't support sync_regs/ONE_REG). Before this ioctl
+ * cpu_synchronize_state() is called in common kvm code
+ * (kvm-all).
+ */
+ if (kvm_vcpu_ioctl(cs, type)) {
+ error_report("CPU reset failed on CPU %i type %lx",
+ cs->cpu_index, type);
+ }
+}
+
+/* Issue the KVM_S390_INITIAL_RESET vcpu ioctl for @cpu. */
+void kvm_s390_reset_vcpu_initial(S390CPU *cpu)
+{
+ kvm_s390_reset_vcpu(cpu, KVM_S390_INITIAL_RESET);
+}
+
+/*
+ * Issue KVM_S390_CLEAR_RESET for @cpu when KVM_CAP_S390_VCPU_RESETS was
+ * reported by the kernel; otherwise fall back to KVM_S390_INITIAL_RESET.
+ */
+void kvm_s390_reset_vcpu_clear(S390CPU *cpu)
+{
+ if (cap_vcpu_resets) {
+ kvm_s390_reset_vcpu(cpu, KVM_S390_CLEAR_RESET);
+ } else {
+ kvm_s390_reset_vcpu(cpu, KVM_S390_INITIAL_RESET);
+ }
+}
+
+/*
+ * Issue KVM_S390_NORMAL_RESET for @cpu when KVM_CAP_S390_VCPU_RESETS was
+ * reported by the kernel.  Without that capability no ioctl is issued.
+ */
+void kvm_s390_reset_vcpu_normal(S390CPU *cpu)
+{
+ if (cap_vcpu_resets) {
+ kvm_s390_reset_vcpu(cpu, KVM_S390_NORMAL_RESET);
}
}
int vq, bool assign);
int kvm_s390_cmma_active(void);
void kvm_s390_cmma_reset(void);
-void kvm_s390_reset_vcpu(S390CPU *cpu);
+void kvm_s390_reset_vcpu_clear(S390CPU *cpu);
+void kvm_s390_reset_vcpu_normal(S390CPU *cpu);
+void kvm_s390_reset_vcpu_initial(S390CPU *cpu);
int kvm_s390_set_mem_limit(uint64_t new_limit, uint64_t *hw_limit);
void kvm_s390_set_max_pagesize(uint64_t pagesize, Error **errp);
void kvm_s390_crypto_reset(void);
/* Operate. */
switch (s->fields.op2) {
- case 0x55: /* AND */
+ case 0x54: /* AND */
tcg_gen_ori_i64(o->in2, o->in2, ~mask);
tcg_gen_and_i64(o->out, o->out, o->in2);
break;
}
}
+/*
+ * Prepare the address operand of a guest memory access for plugin
+ * instrumentation.
+ *
+ * With CONFIG_PLUGIN and tcg_ctx->plugin_insn non-NULL, @vaddr is copied
+ * into a newly allocated temporary and that copy is returned; in every
+ * other case @vaddr itself is returned.  The returned value is what the
+ * callers pass on to plugin_gen_mem_callbacks(), which frees it under the
+ * same plugin_insn condition.
+ */
+static inline TCGv plugin_prep_mem_callbacks(TCGv vaddr)
+{
+#ifdef CONFIG_PLUGIN
+ if (tcg_ctx->plugin_insn != NULL) {
+ /* Save a copy of the vaddr for use after a load. */
+ TCGv temp = tcg_temp_new();
+ tcg_gen_mov_tl(temp, vaddr);
+ return temp;
+ }
+#endif
+ return vaddr;
+}
+
+/*
+ * Emit the empty plugin memory callback for @vaddr/@info when
+ * tcg_ctx->plugin_insn is non-NULL, then free @vaddr.  Because @vaddr is
+ * freed here, it has to be a temporary owned by the caller, such as the
+ * copy handed back by plugin_prep_mem_callbacks().  Without CONFIG_PLUGIN
+ * this function does nothing.
+ */
static inline void plugin_gen_mem_callbacks(TCGv vaddr, uint16_t info)
{
#ifdef CONFIG_PLUGIN
- if (tcg_ctx->plugin_insn == NULL) {
- return;
+ if (tcg_ctx->plugin_insn != NULL) {
+ plugin_gen_empty_mem_callback(vaddr, info);
+ tcg_temp_free(vaddr);
}
- plugin_gen_empty_mem_callback(vaddr, info);
#endif
}
}
}
+ addr = plugin_prep_mem_callbacks(addr);
gen_ldst_i32(INDEX_op_qemu_ld_i32, val, addr, memop, idx);
plugin_gen_mem_callbacks(addr, info);
memop &= ~MO_BSWAP;
}
+ addr = plugin_prep_mem_callbacks(addr);
gen_ldst_i32(INDEX_op_qemu_st_i32, val, addr, memop, idx);
plugin_gen_mem_callbacks(addr, info);
}
}
+ addr = plugin_prep_mem_callbacks(addr);
gen_ldst_i64(INDEX_op_qemu_ld_i64, val, addr, memop, idx);
plugin_gen_mem_callbacks(addr, info);
memop &= ~MO_BSWAP;
}
+ addr = plugin_prep_mem_callbacks(addr);
gen_ldst_i64(INDEX_op_qemu_st_i64, val, addr, memop, idx);
plugin_gen_mem_callbacks(addr, info);
eval `grep SRC_PATH= config-host.mak`
+old_allowed_dif=`grep -v -e 'List of comma-separated changed AML files to ignore' ${SRC_PATH}/tests/qtest/bios-tables-test-allowed-diff.h`
+
echo '/* List of comma-separated changed AML files to ignore */' > ${SRC_PATH}/tests/qtest/bios-tables-test-allowed-diff.h
echo "The files were rebuilt and can be added to git."
+
+if [ -z "$old_allowed_dif" ]; then
+ echo "Note! Please do not commit expected files with source changes"
+ echo "Note! Please follow the process documented in ${SRC_PATH}/tests/qtest/bios-tables-test.c"
+fi
static void plugin_exit(qemu_plugin_id_t id, void *p)
{
- g_autofree gchar *out;
- out = g_strdup_printf("bb's: %" PRIu64", insns: %" PRIu64 "\n",
- bb_count, insn_count);
+ g_autofree gchar *out = g_strdup_printf(
+ "bb's: %" PRIu64", insns: %" PRIu64 "\n",
+ bb_count, insn_count);
qemu_plugin_outs(out);
}
return ea->count > eb->count ? -1 : 1;
}
+/*
+ * Value destructor for the insns hash table (installed by plugin_init()
+ * through g_hash_table_new_full()): releases the record's insn string and
+ * then the InsnExecCount record itself.
+ */
+static void free_record(gpointer data)
+{
+ InsnExecCount *rec = (InsnExecCount *) data;
+ g_free(rec->insn);
+ g_free(rec);
+}
+
static void plugin_exit(qemu_plugin_id_t id, void *p)
{
g_autoptr(GString) report = g_string_new("Instruction Classes:\n");
counts = g_hash_table_get_values(insns);
if (counts && g_list_next(counts)) {
- GList *it;
-
g_string_append_printf(report,"Individual Instructions:\n");
+ counts = g_list_sort(counts, cmp_exec_count);
- it = g_list_sort(counts, cmp_exec_count);
-
- for (i = 0; i < limit && it->next; i++, it = it->next) {
- InsnExecCount *rec = (InsnExecCount *) it->data;
- g_string_append_printf(report, "Instr: %-24s\t(%ld hits)\t(op=%#08x/%s)\n",
+ for (i = 0; i < limit && g_list_next(counts);
+ i++, counts = g_list_next(counts)) {
+ InsnExecCount *rec = (InsnExecCount *) counts->data;
+ g_string_append_printf(report,
+ "Instr: %-24s\t(%ld hits)\t(op=%#08x/%s)\n",
rec->insn,
rec->count,
rec->opcode,
rec->class ?
rec->class->class : "un-categorised");
}
- g_list_free(it);
+ g_list_free(counts);
}
+ g_hash_table_destroy(insns);
+
qemu_plugin_outs(report->str);
}
static void plugin_init(void)
{
- insns = g_hash_table_new(NULL, g_direct_equal);
+ insns = g_hash_table_new_full(NULL, g_direct_equal, NULL, &free_record);
}
static void vcpu_insn_exec_before(unsigned int cpu_index, void *udata)
static void plugin_exit(qemu_plugin_id_t id, void *p)
{
- g_autofree gchar *out;
- out = g_strdup_printf("insns: %" PRIu64 "\n", insn_count);
+ g_autofree gchar *out = g_strdup_printf("insns: %" PRIu64 "\n", insn_count);
qemu_plugin_outs(out);
}
sizeB=$($QEMU_IMG info --output=json "$TEST_IMG" |
sed -n '/"actual-size":/ s/[^0-9]//gp')
-if [ $sizeA -le $sizeB ]
+if [ $sizeA -lt $sizeB ]
then
- echo "Compression ERROR"
+ echo "Compression ERROR ($sizeA < $sizeB)"
fi
$QEMU_IMG check --output=json "$TEST_IMG" |
fprintf(stderr,
"acpi-test: Warning! %.4s binary file mismatch. "
- "Actual [aml:%s], Expected [aml:%s].\n",
+ "Actual [aml:%s], Expected [aml:%s].\n"
+ "See source file tests/qtest/bios-tables-test.c "
+ "for instructions on how to update expected files.\n",
exp_sdt->aml, sdt->aml_file, exp_sdt->aml_file);
all_tables_match = all_tables_match &&
"Actual [asl:%s, aml:%s], Expected [asl:%s, aml:%s].\n",
exp_sdt->aml, sdt->asl_file, sdt->aml_file,
exp_sdt->asl_file, exp_sdt->aml_file);
+ fflush(stderr);
if (getenv("V")) {
- const char *diff_cmd = getenv("DIFF");
- if (diff_cmd) {
- int ret G_GNUC_UNUSED;
- char *diff = g_strdup_printf("%s %s %s", diff_cmd,
- exp_sdt->asl_file, sdt->asl_file);
- ret = system(diff) ;
- g_free(diff);
- } else {
- fprintf(stderr, "acpi-test: Warning. not showing "
- "difference since no diff utility is specified. "
- "Set 'DIFF' environment variable to a preferred "
- "diff utility and run 'make V=1 check' again to "
- "see ASL difference.");
- }
+ const char *diff_env = getenv("DIFF");
+ const char *diff_cmd = diff_env ? diff_env : "diff -u";
+ char *diff = g_strdup_printf("%s %s %s", diff_cmd,
+ exp_sdt->asl_file, sdt->asl_file);
+ int out = dup(STDOUT_FILENO);
+ int ret G_GNUC_UNUSED;
+
+ dup2(STDERR_FILENO, STDOUT_FILENO);
+ ret = system(diff) ;
+ dup2(out, STDOUT_FILENO);
+ close(out);
+ g_free(diff);
}
}
}
#include "qemu/rcu.h"
#include "qemu/thread.h"
-long long n_reads = 0LL;
-long n_updates = 0L;
int nthreadsrunning;
#define GOFLAG_INIT 0
#define RCU_READ_RUN 1000
#define NR_THREADS 100
-static QemuMutex counts_mutex;
static QemuThread threads[NR_THREADS];
static struct rcu_reader_data *data[NR_THREADS];
static int n_threads;
+/*
+ * Statistical counts
+ *
+ * These are the sum of local counters at the end of a run.
+ * Updates are protected by a mutex.
+ */
+static QemuMutex counts_mutex;
+long long n_reads = 0LL;
+long n_updates = 0L;
+
static void create_thread(void *(*func)(void *))
{
if (n_threads >= NR_THREADS) {
#define RCU_STRESS_PIPE_LEN 10
struct rcu_stress {
- int pipe_count;
+ int age; /* how many update cycles while not rcu_stress_current */
int mbtest;
};
struct rcu_stress rcu_stress_array[RCU_STRESS_PIPE_LEN] = { { 0 } };
struct rcu_stress *rcu_stress_current;
-int rcu_stress_idx;
-
int n_mberror;
+
+/* Updates protected by counts_mutex */
long long rcu_stress_count[RCU_STRESS_PIPE_LEN + 1];
while (goflag == GOFLAG_RUN) {
rcu_read_lock();
p = atomic_rcu_read(&rcu_stress_current);
- if (p->mbtest == 0) {
+ if (atomic_read(&p->mbtest) == 0) {
n_mberror++;
}
rcu_read_lock();
garbage++;
}
rcu_read_unlock();
- pc = p->pipe_count;
+ pc = atomic_read(&p->age);
rcu_read_unlock();
if ((pc > RCU_STRESS_PIPE_LEN) || (pc < 0)) {
pc = RCU_STRESS_PIPE_LEN;
return NULL;
}
+/*
+ * Stress Test Updater
+ *
+ * The updater cycles around updating rcu_stress_current to point at
+ * one of the rcu_stress_array entries and resets its age. It
+ * then increments the age of all the other entries. The age
+ * will be read under an rcu_read_lock() and distribution of values
+ * calculated. The final result gives an indication of how many
+ * previously current rcu_stress entries are in flight until the RCU
+ * cycle completes.
+ */
static void *rcu_update_stress_test(void *arg)
{
- int i;
- struct rcu_stress *p;
+ int i, rcu_stress_idx = 0;
+ struct rcu_stress *cp = atomic_read(&rcu_stress_current);
rcu_register_thread();
-
*(struct rcu_reader_data **)arg = &rcu_reader;
+
while (goflag == GOFLAG_INIT) {
g_usleep(1000);
}
+
while (goflag == GOFLAG_RUN) {
- i = rcu_stress_idx + 1;
- if (i >= RCU_STRESS_PIPE_LEN) {
- i = 0;
+ struct rcu_stress *p;
+ rcu_stress_idx++;
+ if (rcu_stress_idx >= RCU_STRESS_PIPE_LEN) {
+ rcu_stress_idx = 0;
}
- p = &rcu_stress_array[i];
- p->mbtest = 0;
+ p = &rcu_stress_array[rcu_stress_idx];
+ /* catching up with ourselves would be a bug */
+ assert(p != cp);
+ atomic_set(&p->mbtest, 0);
smp_mb();
- p->pipe_count = 0;
- p->mbtest = 1;
+ atomic_set(&p->age, 0);
+ atomic_set(&p->mbtest, 1);
atomic_rcu_set(&rcu_stress_current, p);
- rcu_stress_idx = i;
+ cp = p;
+ /*
+ * New RCU structure is now live, update the age of the old
+ * ones.
+ */
for (i = 0; i < RCU_STRESS_PIPE_LEN; i++) {
if (i != rcu_stress_idx) {
- rcu_stress_array[i].pipe_count++;
+ atomic_set(&rcu_stress_array[i].age,
+ rcu_stress_array[i].age + 1);
}
}
synchronize_rcu();
int i;
rcu_stress_current = &rcu_stress_array[0];
- rcu_stress_current->pipe_count = 0;
+ rcu_stress_current->age = 0;
rcu_stress_current->mbtest = 1;
for (i = 0; i < nreaders; i++) {
create_thread(rcu_read_stress_test);
int i;
rcu_stress_current = &rcu_stress_array[0];
- rcu_stress_current->pipe_count = 0;
+ rcu_stress_current->age = 0;
rcu_stress_current->mbtest = 1;
for (i = 0; i < nreaders; i++) {
create_thread(rcu_read_stress_test);
+/*
+ * Print the command-line synopsis to stderr and terminate the process
+ * with exit status -1.  Never returns.
+ */
static void usage(int argc, char *argv[])
{
- fprintf(stderr, "Usage: %s [nreaders [ perf | stress ] ]\n", argv[0]);
+ fprintf(stderr, "Usage: %s [nreaders [ [r|u]perf | stress [duration] ] ]\n",
+ argv[0]);
exit(-1);
}
# If TCG debugging is enabled things are a lot slower
ifeq ($(CONFIG_DEBUG_TCG),y)
-TIMEOUT=45
+TIMEOUT=60
else
TIMEOUT=15
endif
$(foreach p,$(PLUGINS), \
$(foreach t,$(TESTS),\
$(eval run-plugin-$(t)-with-$(p): $t $p) \
- $(eval run-plugin-$(t)-with-$(p): TIMEOUT=30) \
+ $(eval run-plugin-$(t)-with-$(p): TIMEOUT=60) \
$(eval RUN_TESTS+=run-plugin-$(t)-with-$(p))))
endif
$(call skip-test, "BUILD of $@", "missing compiler support")
run-pauth-3:
$(call skip-test, "RUN of pauth-3", "not built")
+run-plugin-pauth-3-with-%:
+ $(call skip-test, "RUN of pauth-3 ($*)", "not built")
endif
#include <stdint.h>
#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#define TESTS 1000
int main()
{
- uintptr_t x, y;
+ int i, count = 0;
+ float perc;
+ void *base = malloc(TESTS);
+
+ for (i = 0; i < TESTS; i++) {
+ uintptr_t in, x, y;
+
+ in = i + (uintptr_t) base;
+
+ asm("mov %0, %[in]\n\t"
+ "pacia %0, sp\n\t" /* sigill if pauth not supported */
+ "eor %0, %0, #4\n\t" /* corrupt single bit */
+ "mov %1, %0\n\t"
+ "autia %1, sp\n\t" /* validate corrupted pointer */
+ "xpaci %0\n\t" /* strip pac from corrupted pointer */
+ : /* out */ "=r"(x), "=r"(y)
+ : /* in */ [in] "r" (in)
+ : /* clobbers */);
- asm("mov %0, lr\n\t"
- "pacia %0, sp\n\t" /* sigill if pauth not supported */
- "eor %0, %0, #4\n\t" /* corrupt single bit */
- "mov %1, %0\n\t"
- "autia %1, sp\n\t" /* validate corrupted pointer */
- "xpaci %0\n\t" /* strip pac from corrupted pointer */
- : "=r"(x), "=r"(y));
+ /*
+ * Once stripped, the corrupted pointer is of the form 0x0000...wxyz.
+ * We expect the autia to indicate failure, producing a pointer of the
+ * form 0x000e....wxyz. Use xpaci and != for the test, rather than
+ * extracting explicit bits from the top, because the location of the
+ * error code "e" depends on the configuration of virtual memory.
+ */
+ if (x != y) {
+ count++;
+ }
- /*
- * Once stripped, the corrupted pointer is of the form 0x0000...wxyz.
- * We expect the autia to indicate failure, producing a pointer of the
- * form 0x000e....wxyz. Use xpaci and != for the test, rather than
- * extracting explicit bits from the top, because the location of the
- * error code "e" depends on the configuration of virtual memory.
- */
- assert(x != y);
- return 0;
+ }
+ perc = (float) count / (float) TESTS;
+ printf("Checks Passed: %0.2f%%\n", perc * 100.0);
+ /*
+ * Flush before asserting: a failed assert() aborts, and abort() is not
+ * required to flush stdio, so the pass rate would otherwise be lost
+ * exactly when it is needed.
+ */
+ fflush(stdout);
+ assert(perc > 0.95);
+ return 0;
}
echo "CROSS_CC_HAS_SVE=y" >> $config_target_mak
fi
if do_compiler "$target_compiler" $target_compiler_cflags \
- -march=-march=armv8.3-a -o $TMPE $TMPC; then
+ -march=armv8.3-a -o $TMPE $TMPC; then
echo "CROSS_CC_HAS_ARMV8_3=y" >> $config_target_mak
fi
;;
#ifdef CONFIG_TRACE_SIMPLE
st_set_trace_file(file);
#elif defined CONFIG_TRACE_LOG
- /* If both the simple and the log backends are enabled, "--trace file"
- * only applies to the simple backend; use "-D" for the log backend.
+ /*
+ * If both the simple and the log backends are enabled, "--trace file"
+ * only applies to the simple backend; use "-D" for the log
+ * backend. However we should only override -D if we actually have
+ * something to override it with.
*/
- qemu_set_log_filename(file, &error_fatal);
+ if (file) {
+ qemu_set_log_filename(file, &error_fatal);
+ }
#else
if (file) {
fprintf(stderr, "error: --trace file=...: "