| File: | src/usr.sbin/vmd/vioblk.c |
| Warning: | line 231, column 12 Array access (via field 'disk_fd') results in an undefined pointer dereference |
Press '?' to see keyboard shortcuts
Keyboard shortcuts:
| 1 | /* $OpenBSD: vioblk.c,v 1.9 2023/09/26 01:53:54 dv Exp $ */ | |||
| 2 | ||||
| 3 | /* | |||
| 4 | * Copyright (c) 2023 Dave Voutila <dv@openbsd.org> | |||
| 5 | * Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org> | |||
| 6 | * | |||
| 7 | * Permission to use, copy, modify, and distribute this software for any | |||
| 8 | * purpose with or without fee is hereby granted, provided that the above | |||
| 9 | * copyright notice and this permission notice appear in all copies. | |||
| 10 | * | |||
| 11 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES | |||
| 12 | * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF | |||
| 13 | * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR | |||
| 14 | * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES | |||
| 15 | * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN | |||
| 16 | * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF | |||
| 17 | * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. | |||
| 18 | */ | |||
| 19 | #include <sys/mman.h> | |||
| 20 | #include <sys/param.h> /* PAGE_SIZE */ | |||
| 21 | ||||
| 22 | #include <dev/pci/virtio_pcireg.h> | |||
| 23 | #include <dev/pv/vioblkreg.h> | |||
| 24 | #include <dev/pv/virtioreg.h> | |||
| 25 | ||||
| 26 | #include <errno(*__errno()).h> | |||
| 27 | #include <event.h> | |||
| 28 | #include <fcntl.h> | |||
| 29 | #include <stdlib.h> | |||
| 30 | #include <string.h> | |||
| 31 | #include <unistd.h> | |||
| 32 | ||||
| 33 | #include "atomicio.h" | |||
| 34 | #include "pci.h" | |||
| 35 | #include "virtio.h" | |||
| 36 | #include "vmd.h" | |||
| 37 | ||||
| 38 | extern char *__progname; | |||
| 39 | extern struct vmd_vm *current_vm; | |||
| 40 | struct iovec io_v[VIOBLK_QUEUE_SIZE128]; | |||
| 41 | ||||
| 42 | static const char *disk_type(int); | |||
| 43 | static uint32_t handle_io_read(struct viodev_msg *, struct virtio_dev *, | |||
| 44 | int8_t *); | |||
| 45 | static int handle_io_write(struct viodev_msg *, struct virtio_dev *); | |||
| 46 | ||||
| 47 | static void vioblk_update_qs(struct vioblk_dev *); | |||
| 48 | static void vioblk_update_qa(struct vioblk_dev *); | |||
| 49 | static int vioblk_notifyq(struct vioblk_dev *); | |||
| 50 | static ssize_t vioblk_rw(struct vioblk_dev *, int, off_t, | |||
| 51 | struct vring_desc *, struct vring_desc **); | |||
| 52 | ||||
| 53 | static void dev_dispatch_vm(int, short, void *); | |||
| 54 | static void handle_sync_io(int, short, void *); | |||
| 55 | ||||
| 56 | static const char * | |||
| 57 | disk_type(int type) | |||
| 58 | { | |||
| 59 | switch (type) { | |||
| 60 | case VMDF_RAW0x01: return "raw"; | |||
| 61 | case VMDF_QCOW20x02: return "qcow2"; | |||
| 62 | } | |||
| 63 | return "unknown"; | |||
| 64 | } | |||
| 65 | ||||
| 66 | __dead__attribute__((__noreturn__)) void | |||
| 67 | vioblk_main(int fd, int fd_vmm) | |||
| 68 | { | |||
| 69 | struct virtio_dev dev; | |||
| 70 | struct vioblk_dev *vioblk; | |||
| ||||
| 71 | struct viodev_msg msg; | |||
| 72 | struct vmd_vm vm; | |||
| 73 | struct vm_create_params *vcp; | |||
| 74 | ssize_t sz; | |||
| 75 | off_t szp = 0; | |||
| 76 | int i, ret, type; | |||
| 77 | ||||
| 78 | /* | |||
| 79 | * stdio - needed for read/write to disk fds and channels to the vm. | |||
| 80 | * vmm + proc - needed to create shared vm mappings. | |||
| 81 | */ | |||
| 82 | if (pledge("stdio vmm proc", NULL((void *)0)) == -1) | |||
| 83 | fatal("pledge"); | |||
| 84 | ||||
| 85 | /* Zero and initialize io work queue. */ | |||
| 86 | memset(io_v, 0, nitems(io_v)(sizeof((io_v)) / sizeof((io_v)[0]))*sizeof(io_v[0])); | |||
| 87 | ||||
| 88 | /* Receive our virtio_dev, mostly preconfigured. */ | |||
| 89 | memset(&dev, 0, sizeof(dev)); | |||
| 90 | sz = atomicio(read, fd, &dev, sizeof(dev)); | |||
| 91 | if (sz != sizeof(dev)) { | |||
| 92 | ret = errno(*__errno()); | |||
| 93 | log_warn("failed to receive vioblk"); | |||
| 94 | goto fail; | |||
| 95 | } | |||
| 96 | if (dev.dev_type != VMD_DEVTYPE_DISK'd') { | |||
| 97 | ret = EINVAL22; | |||
| 98 | log_warn("received invalid device type"); | |||
| 99 | goto fail; | |||
| 100 | } | |||
| 101 | dev.sync_fd = fd; | |||
| 102 | vioblk = &dev.vioblk; | |||
| 103 | ||||
| 104 | log_debug("%s: got viblk dev. num disk fds = %d, sync fd = %d, " | |||
| 105 | "async fd = %d, capacity = %lld seg_max = %u, vmm fd = %d", | |||
| 106 | __func__, vioblk->ndisk_fd, dev.sync_fd, dev.async_fd, | |||
| 107 | vioblk->capacity, vioblk->seg_max, fd_vmm); | |||
| 108 | ||||
| 109 | /* Receive our vm information from the vm process. */ | |||
| 110 | memset(&vm, 0, sizeof(vm)); | |||
| 111 | sz = atomicio(read, dev.sync_fd, &vm, sizeof(vm)); | |||
| 112 | if (sz != sizeof(vm)) { | |||
| 113 | ret = EIO5; | |||
| 114 | log_warnx("failed to receive vm details"); | |||
| 115 | goto fail; | |||
| 116 | } | |||
| 117 | vcp = &vm.vm_params.vmc_params; | |||
| 118 | current_vm = &vm; | |||
| 119 | ||||
| 120 | setproctitle("%s/vioblk%d", vcp->vcp_name, vioblk->idx); | |||
| 121 | log_procinit("vm/%s/vioblk%d", vcp->vcp_name, vioblk->idx); | |||
| 122 | ||||
| 123 | /* Now that we have our vm information, we can remap memory. */ | |||
| 124 | ret = remap_guest_mem(&vm, fd_vmm); | |||
| 125 | if (ret) { | |||
| 126 | log_warnx("failed to remap guest memory"); | |||
| 127 | goto fail; | |||
| 128 | } | |||
| 129 | ||||
| 130 | /* | |||
| 131 | * We no longer need /dev/vmm access. | |||
| 132 | */ | |||
| 133 | close_fd(fd_vmm); | |||
| 134 | if (pledge("stdio", NULL((void *)0)) == -1) | |||
| 135 | fatal("pledge2"); | |||
| 136 | ||||
| 137 | /* Initialize the virtio block abstractions. */ | |||
| 138 | type = vm.vm_params.vmc_disktypes[vioblk->idx]; | |||
| 139 | switch (type) { | |||
| 140 | case VMDF_RAW0x01: | |||
| 141 | ret = virtio_raw_init(&vioblk->file, &szp, vioblk->disk_fd, | |||
| 142 | vioblk->ndisk_fd); | |||
| 143 | break; | |||
| 144 | case VMDF_QCOW20x02: | |||
| 145 | ret = virtio_qcow2_init(&vioblk->file, &szp, vioblk->disk_fd, | |||
| 146 | vioblk->ndisk_fd); | |||
| 147 | break; | |||
| 148 | default: | |||
| 149 | log_warnx("invalid disk image type"); | |||
| 150 | goto fail; | |||
| 151 | } | |||
| 152 | if (ret || szp < 0) { | |||
| 153 | log_warnx("failed to init disk %s image", disk_type(type)); | |||
| 154 | goto fail; | |||
| 155 | } | |||
| 156 | vioblk->capacity = szp / 512; | |||
| 157 | log_debug("%s: initialized vioblk%d with %s image (capacity=%lld)", | |||
| 158 | __func__, vioblk->idx, disk_type(type), vioblk->capacity); | |||
| 159 | ||||
| 160 | /* If we're restoring hardware, reinitialize the virtqueue hva. */ | |||
| 161 | if (vm.vm_state & VM_STATE_RECEIVED0x08) | |||
| 162 | vioblk_update_qa(vioblk); | |||
| 163 | ||||
| 164 | /* Initialize libevent so we can start wiring event handlers. */ | |||
| 165 | event_init(); | |||
| 166 | ||||
| 167 | /* Wire up an async imsg channel. */ | |||
| 168 | log_debug("%s: wiring in async vm event handler (fd=%d)", __func__, | |||
| 169 | dev.async_fd); | |||
| 170 | if (vm_device_pipe(&dev, dev_dispatch_vm)) { | |||
| 171 | ret = EIO5; | |||
| 172 | log_warnx("vm_device_pipe"); | |||
| 173 | goto fail; | |||
| 174 | } | |||
| 175 | ||||
| 176 | /* Configure our sync channel event handler. */ | |||
| 177 | log_debug("%s: wiring in sync channel handler (fd=%d)", __func__, | |||
| 178 | dev.sync_fd); | |||
| 179 | if (fcntl(dev.sync_fd, F_SETFL4, O_NONBLOCK0x0004) == -1) { | |||
| 180 | ret = errno(*__errno()); | |||
| 181 | log_warn("%s: fcntl", __func__); | |||
| 182 | goto fail; | |||
| 183 | } | |||
| 184 | imsg_init(&dev.sync_iev.ibuf, dev.sync_fd); | |||
| 185 | dev.sync_iev.handler = handle_sync_io; | |||
| 186 | dev.sync_iev.data = &dev; | |||
| 187 | dev.sync_iev.events = EV_READ0x02; | |||
| 188 | imsg_event_add(&dev.sync_iev); | |||
| 189 | ||||
| 190 | /* Send a ready message over the sync channel. */ | |||
| 191 | log_debug("%s: telling vm %s device is ready", __func__, vcp->vcp_name); | |||
| 192 | memset(&msg, 0, sizeof(msg)); | |||
| 193 | msg.type = VIODEV_MSG_READY1; | |||
| 194 | imsg_compose_event(&dev.sync_iev, IMSG_DEVOP_MSG, 0, 0, -1, &msg, | |||
| 195 | sizeof(msg)); | |||
| 196 | ||||
| 197 | /* Send a ready message over the async channel. */ | |||
| 198 | log_debug("%s: sending heartbeat", __func__); | |||
| 199 | ret = imsg_compose_event(&dev.async_iev, IMSG_DEVOP_MSG, 0, 0, -1, | |||
| 200 | &msg, sizeof(msg)); | |||
| 201 | if (ret == -1) { | |||
| 202 | log_warnx("%s: failed to send async ready message!", __func__); | |||
| 203 | goto fail; | |||
| 204 | } | |||
| 205 | ||||
| 206 | /* Engage the event loop! */ | |||
| 207 | ret = event_dispatch(); | |||
| 208 | ||||
| 209 | if (ret == 0) { | |||
| 210 | /* Clean shutdown. */ | |||
| 211 | close_fd(dev.sync_fd); | |||
| 212 | close_fd(dev.async_fd); | |||
| 213 | for (i = 0; i < (int)sizeof(vioblk->disk_fd); i++) | |||
| 214 | close_fd(vioblk->disk_fd[i]); | |||
| 215 | _exit(0); | |||
| 216 | /* NOTREACHED */ | |||
| 217 | } | |||
| 218 | ||||
| 219 | fail: | |||
| 220 | /* Try letting the vm know we've failed something. */ | |||
| 221 | memset(&msg, 0, sizeof(msg)); | |||
| 222 | msg.type = VIODEV_MSG_ERROR2; | |||
| 223 | msg.data = ret; | |||
| 224 | imsg_compose(&dev.sync_iev.ibuf, IMSG_DEVOP_MSG, 0, 0, -1, &msg, | |||
| 225 | sizeof(msg)); | |||
| 226 | imsg_flush(&dev.sync_iev.ibuf); | |||
| 227 | ||||
| 228 | close_fd(dev.sync_fd); | |||
| 229 | close_fd(dev.async_fd); | |||
| 230 | for (i = 0; i < (int)sizeof(vioblk->disk_fd); i++) | |||
| 231 | close_fd(vioblk->disk_fd[i]); | |||
| ||||
| 232 | _exit(ret); | |||
| 233 | /* NOTREACHED */ | |||
| 234 | } | |||
| 235 | ||||
| 236 | const char * | |||
| 237 | vioblk_cmd_name(uint32_t type) | |||
| 238 | { | |||
| 239 | switch (type) { | |||
| 240 | case VIRTIO_BLK_T_IN0: return "read"; | |||
| 241 | case VIRTIO_BLK_T_OUT1: return "write"; | |||
| 242 | case VIRTIO_BLK_T_SCSI_CMD2: return "scsi read"; | |||
| 243 | case VIRTIO_BLK_T_SCSI_CMD_OUT3: return "scsi write"; | |||
| 244 | case VIRTIO_BLK_T_FLUSH4: return "flush"; | |||
| 245 | case VIRTIO_BLK_T_FLUSH_OUT5: return "flush out"; | |||
| 246 | case VIRTIO_BLK_T_GET_ID8: return "get id"; | |||
| 247 | default: return "unknown"; | |||
| 248 | } | |||
| 249 | } | |||
| 250 | ||||
| 251 | static void | |||
| 252 | vioblk_update_qa(struct vioblk_dev *dev) | |||
| 253 | { | |||
| 254 | struct virtio_vq_info *vq_info; | |||
| 255 | void *hva = NULL((void *)0); | |||
| 256 | ||||
| 257 | /* Invalid queue? */ | |||
| 258 | if (dev->cfg.queue_select > 0) | |||
| 259 | return; | |||
| 260 | ||||
| 261 | vq_info = &dev->vq[dev->cfg.queue_select]; | |||
| 262 | vq_info->q_gpa = (uint64_t)dev->cfg.queue_pfn * VIRTIO_PAGE_SIZE(4096); | |||
| 263 | ||||
| 264 | hva = hvaddr_mem(vq_info->q_gpa, vring_size(VIOBLK_QUEUE_SIZE128)); | |||
| 265 | if (hva == NULL((void *)0)) | |||
| 266 | fatal("vioblk_update_qa"); | |||
| 267 | vq_info->q_hva = hva; | |||
| 268 | } | |||
| 269 | ||||
| 270 | static void | |||
| 271 | vioblk_update_qs(struct vioblk_dev *dev) | |||
| 272 | { | |||
| 273 | struct virtio_vq_info *vq_info; | |||
| 274 | ||||
| 275 | /* Invalid queue? */ | |||
| 276 | if (dev->cfg.queue_select > 0) { | |||
| 277 | dev->cfg.queue_size = 0; | |||
| 278 | return; | |||
| 279 | } | |||
| 280 | ||||
| 281 | vq_info = &dev->vq[dev->cfg.queue_select]; | |||
| 282 | ||||
| 283 | /* Update queue pfn/size based on queue select */ | |||
| 284 | dev->cfg.queue_pfn = vq_info->q_gpa >> 12; | |||
| 285 | dev->cfg.queue_size = vq_info->qs; | |||
| 286 | } | |||
| 287 | ||||
| 288 | /* | |||
| 289 | * Process virtqueue notifications. If an unrecoverable error occurs, puts | |||
| 290 | * device into a "needs reset" state. | |||
| 291 | * | |||
| 292 | * Returns 1 if an we need to assert an IRQ. | |||
| 293 | */ | |||
| 294 | static int | |||
| 295 | vioblk_notifyq(struct vioblk_dev *dev) | |||
| 296 | { | |||
| 297 | uint32_t cmd_len; | |||
| 298 | uint16_t idx, cmd_desc_idx; | |||
| 299 | uint8_t ds; | |||
| 300 | off_t offset; | |||
| 301 | ssize_t sz; | |||
| 302 | int is_write, notify, i; | |||
| 303 | char *vr; | |||
| 304 | struct vring_desc *table, *desc; | |||
| 305 | struct vring_avail *avail; | |||
| 306 | struct vring_used *used; | |||
| 307 | struct virtio_blk_req_hdr *cmd; | |||
| 308 | struct virtio_vq_info *vq_info; | |||
| 309 | ||||
| 310 | /* Invalid queue? */ | |||
| 311 | if (dev->cfg.queue_notify > 0) | |||
| 312 | return (0); | |||
| 313 | ||||
| 314 | vq_info = &dev->vq[dev->cfg.queue_notify]; | |||
| 315 | idx = vq_info->last_avail; | |||
| 316 | vr = vq_info->q_hva; | |||
| 317 | if (vr == NULL((void *)0)) | |||
| 318 | fatalx("%s: null vring", __func__); | |||
| 319 | ||||
| 320 | /* Compute offsets in table of descriptors, avail ring, and used ring */ | |||
| 321 | table = (struct vring_desc *)(vr); | |||
| 322 | avail = (struct vring_avail *)(vr + vq_info->vq_availoffset); | |||
| 323 | used = (struct vring_used *)(vr + vq_info->vq_usedoffset); | |||
| 324 | ||||
| 325 | while (idx != avail->idx) { | |||
| 326 | /* Retrieve Command descriptor. */ | |||
| 327 | cmd_desc_idx = avail->ring[idx & VIOBLK_QUEUE_MASK(128 - 1)]; | |||
| 328 | desc = &table[cmd_desc_idx]; | |||
| 329 | cmd_len = desc->len; | |||
| 330 | ||||
| 331 | /* | |||
| 332 | * Validate Command descriptor. It should be chained to another | |||
| 333 | * descriptor and not be itself writable. | |||
| 334 | */ | |||
| 335 | if ((desc->flags & VRING_DESC_F_NEXT1) == 0) { | |||
| 336 | log_warnx("%s: unchained cmd descriptor", __func__); | |||
| 337 | goto reset; | |||
| 338 | } | |||
| 339 | if (DESC_WRITABLE(desc)(((desc)->flags & 2) ? 1 : 0)) { | |||
| 340 | log_warnx("%s: invalid cmd descriptor state", __func__); | |||
| 341 | goto reset; | |||
| 342 | } | |||
| 343 | ||||
| 344 | /* Retrieve the vioblk command request. */ | |||
| 345 | cmd = hvaddr_mem(desc->addr, sizeof(*cmd)); | |||
| 346 | if (cmd == NULL((void *)0)) | |||
| 347 | goto reset; | |||
| 348 | ||||
| 349 | /* Advance to the 2nd descriptor. */ | |||
| 350 | desc = &table[desc->next & VIOBLK_QUEUE_MASK(128 - 1)]; | |||
| 351 | ||||
| 352 | /* Process each available command & chain. */ | |||
| 353 | switch (cmd->type) { | |||
| 354 | case VIRTIO_BLK_T_IN0: | |||
| 355 | case VIRTIO_BLK_T_OUT1: | |||
| 356 | /* Read (IN) & Write (OUT) */ | |||
| 357 | is_write = (cmd->type == VIRTIO_BLK_T_OUT1) ? 1 : 0; | |||
| 358 | offset = cmd->sector * VIRTIO_BLK_SECTOR_SIZE512; | |||
| 359 | sz = vioblk_rw(dev, is_write, offset, table, &desc); | |||
| 360 | if (sz == -1) | |||
| 361 | ds = VIRTIO_BLK_S_IOERR1; | |||
| 362 | else | |||
| 363 | ds = VIRTIO_BLK_S_OK0; | |||
| 364 | break; | |||
| 365 | case VIRTIO_BLK_T_GET_ID8: | |||
| 366 | /* | |||
| 367 | * We don't support this command yet. While it's not | |||
| 368 | * officially part of the virtio spec (will be in v1.2) | |||
| 369 | * there's no feature to negotiate. Linux drivers will | |||
| 370 | * often send this command regardless. | |||
| 371 | */ | |||
| 372 | ds = VIRTIO_BLK_S_UNSUPP2; | |||
| 373 | default: | |||
| 374 | log_warnx("%s: unsupported vioblk command %d", __func__, | |||
| 375 | cmd->type); | |||
| 376 | ds = VIRTIO_BLK_S_UNSUPP2; | |||
| 377 | break; | |||
| 378 | } | |||
| 379 | ||||
| 380 | /* Advance to the end of the chain, if needed. */ | |||
| 381 | i = 0; | |||
| 382 | while (desc->flags & VRING_DESC_F_NEXT1) { | |||
| 383 | desc = &table[desc->next & VIOBLK_QUEUE_MASK(128 - 1)]; | |||
| 384 | if (++i >= VIOBLK_QUEUE_SIZE128) { | |||
| 385 | /* | |||
| 386 | * If we encounter an infinite/looping chain, | |||
| 387 | * not much we can do but say we need a reset. | |||
| 388 | */ | |||
| 389 | log_warnx("%s: descriptor chain overflow", | |||
| 390 | __func__); | |||
| 391 | goto reset; | |||
| 392 | } | |||
| 393 | } | |||
| 394 | ||||
| 395 | /* Provide the status of our command processing. */ | |||
| 396 | if (!DESC_WRITABLE(desc)(((desc)->flags & 2) ? 1 : 0)) { | |||
| 397 | log_warnx("%s: status descriptor unwritable", __func__); | |||
| 398 | goto reset; | |||
| 399 | } | |||
| 400 | /* Overkill as ds is 1 byte, but validates gpa. */ | |||
| 401 | if (write_mem(desc->addr, &ds, sizeof(ds))) | |||
| 402 | log_warnx("%s: can't write device status data " | |||
| 403 | "@ 0x%llx",__func__, desc->addr); | |||
| 404 | ||||
| 405 | dev->cfg.isr_status |= 1; | |||
| 406 | notify = 1; | |||
| 407 | ||||
| 408 | used->ring[used->idx & VIOBLK_QUEUE_MASK(128 - 1)].id = cmd_desc_idx; | |||
| 409 | used->ring[used->idx & VIOBLK_QUEUE_MASK(128 - 1)].len = cmd_len; | |||
| 410 | ||||
| 411 | __sync_synchronize(); | |||
| 412 | used->idx++; | |||
| 413 | idx++; | |||
| 414 | } | |||
| 415 | ||||
| 416 | vq_info->last_avail = idx; | |||
| 417 | return (notify); | |||
| 418 | ||||
| 419 | reset: | |||
| 420 | /* | |||
| 421 | * When setting the "needs reset" flag, the driver is notified | |||
| 422 | * via a configuration change interrupt. | |||
| 423 | */ | |||
| 424 | dev->cfg.device_status |= DEVICE_NEEDS_RESET64; | |||
| 425 | dev->cfg.isr_status |= VIRTIO_CONFIG_ISR_CONFIG_CHANGE2; | |||
| 426 | return (1); | |||
| 427 | } | |||
| 428 | ||||
| 429 | static void | |||
| 430 | dev_dispatch_vm(int fd, short event, void *arg) | |||
| 431 | { | |||
| 432 | struct virtio_dev *dev = (struct virtio_dev *)arg; | |||
| 433 | struct imsgev *iev = &dev->async_iev; | |||
| 434 | struct imsgbuf *ibuf = &iev->ibuf; | |||
| 435 | struct imsg imsg; | |||
| 436 | ssize_t n = 0; | |||
| 437 | int verbose; | |||
| 438 | ||||
| 439 | if (event & EV_READ0x02) { | |||
| 440 | if ((n = imsg_read(ibuf)) == -1 && errno(*__errno()) != EAGAIN35) | |||
| 441 | fatal("%s: imsg_read", __func__); | |||
| 442 | if (n == 0) { | |||
| 443 | /* this pipe is dead, so remove the event handler */ | |||
| 444 | log_debug("%s: pipe dead (EV_READ)", __func__); | |||
| 445 | event_del(&iev->ev); | |||
| 446 | event_loopexit(NULL((void *)0)); | |||
| 447 | return; | |||
| 448 | } | |||
| 449 | } | |||
| 450 | ||||
| 451 | if (event & EV_WRITE0x04) { | |||
| 452 | if ((n = msgbuf_write(&ibuf->w)) == -1 && errno(*__errno()) != EAGAIN35) | |||
| 453 | fatal("%s: msgbuf_write", __func__); | |||
| 454 | if (n == 0) { | |||
| 455 | /* this pipe is dead, so remove the event handler */ | |||
| 456 | log_debug("%s: pipe dead (EV_WRITE)", __func__); | |||
| 457 | event_del(&iev->ev); | |||
| 458 | event_loopbreak(); | |||
| 459 | return; | |||
| 460 | } | |||
| 461 | } | |||
| 462 | ||||
| 463 | for (;;) { | |||
| 464 | if ((n = imsg_get(ibuf, &imsg)) == -1) | |||
| 465 | fatal("%s: imsg_get", __func__); | |||
| 466 | if (n == 0) | |||
| 467 | break; | |||
| 468 | ||||
| 469 | switch (imsg.hdr.type) { | |||
| 470 | case IMSG_VMDOP_PAUSE_VM: | |||
| 471 | log_debug("%s: pausing", __func__); | |||
| 472 | break; | |||
| 473 | case IMSG_VMDOP_UNPAUSE_VM: | |||
| 474 | log_debug("%s: unpausing", __func__); | |||
| 475 | break; | |||
| 476 | case IMSG_CTL_VERBOSE: | |||
| 477 | IMSG_SIZE_CHECK(&imsg, &verbose)do { if (((&imsg)->hdr.len - sizeof(struct imsg_hdr)) < sizeof(*&verbose)) fatalx("bad length imsg received (%s)" , "&verbose"); } while (0); | |||
| 478 | memcpy(&verbose, imsg.data, sizeof(verbose)); | |||
| 479 | log_setverbose(verbose); | |||
| 480 | break; | |||
| 481 | default: | |||
| 482 | log_warnx("%s: unhandled imsg type %d", __func__, | |||
| 483 | imsg.hdr.type); | |||
| 484 | break; | |||
| 485 | } | |||
| 486 | imsg_free(&imsg); | |||
| 487 | } | |||
| 488 | imsg_event_add(iev); | |||
| 489 | } | |||
| 490 | ||||
| 491 | /* | |||
| 492 | * Synchronous IO handler. | |||
| 493 | * | |||
| 494 | */ | |||
| 495 | static void | |||
| 496 | handle_sync_io(int fd, short event, void *arg) | |||
| 497 | { | |||
| 498 | struct virtio_dev *dev = (struct virtio_dev *)arg; | |||
| 499 | struct imsgev *iev = &dev->sync_iev; | |||
| 500 | struct imsgbuf *ibuf = &iev->ibuf; | |||
| 501 | struct viodev_msg msg; | |||
| 502 | struct imsg imsg; | |||
| 503 | ssize_t n; | |||
| 504 | int8_t intr = INTR_STATE_NOOP0; | |||
| 505 | ||||
| 506 | if (event & EV_READ0x02) { | |||
| 507 | if ((n = imsg_read(ibuf)) == -1 && errno(*__errno()) != EAGAIN35) | |||
| 508 | fatal("%s: imsg_read", __func__); | |||
| 509 | if (n == 0) { | |||
| 510 | /* this pipe is dead, so remove the event handler */ | |||
| 511 | log_debug("%s: vioblk pipe dead (EV_READ)", __func__); | |||
| 512 | event_del(&iev->ev); | |||
| 513 | event_loopexit(NULL((void *)0)); | |||
| 514 | return; | |||
| 515 | } | |||
| 516 | } | |||
| 517 | ||||
| 518 | if (event & EV_WRITE0x04) { | |||
| 519 | if ((n = msgbuf_write(&ibuf->w)) == -1 && errno(*__errno()) != EAGAIN35) | |||
| 520 | fatal("%s: msgbuf_write", __func__); | |||
| 521 | if (n == 0) { | |||
| 522 | /* this pipe is dead, so remove the event handler */ | |||
| 523 | log_debug("%s: vioblk pipe dead (EV_WRITE)", __func__); | |||
| 524 | event_del(&iev->ev); | |||
| 525 | event_loopexit(NULL((void *)0)); | |||
| 526 | return; | |||
| 527 | } | |||
| 528 | } | |||
| 529 | ||||
| 530 | for (;;) { | |||
| 531 | if ((n = imsg_get(ibuf, &imsg)) == -1) | |||
| 532 | fatalx("%s: imsg_get (n=%ld)", __func__, n); | |||
| 533 | if (n == 0) | |||
| 534 | break; | |||
| 535 | ||||
| 536 | /* Unpack our message. They ALL should be dev messeges! */ | |||
| 537 | IMSG_SIZE_CHECK(&imsg, &msg)do { if (((&imsg)->hdr.len - sizeof(struct imsg_hdr)) < sizeof(*&msg)) fatalx("bad length imsg received (%s)", "&msg" ); } while (0); | |||
| 538 | memcpy(&msg, imsg.data, sizeof(msg)); | |||
| 539 | imsg_free(&imsg); | |||
| 540 | ||||
| 541 | switch (msg.type) { | |||
| 542 | case VIODEV_MSG_DUMP6: | |||
| 543 | /* Dump device */ | |||
| 544 | n = atomicio(vwrite(ssize_t (*)(int, void *, size_t))write, dev->sync_fd, dev, sizeof(*dev)); | |||
| 545 | if (n != sizeof(*dev)) { | |||
| 546 | log_warnx("%s: failed to dump vioblk device", | |||
| 547 | __func__); | |||
| 548 | break; | |||
| 549 | } | |||
| 550 | case VIODEV_MSG_IO_READ4: | |||
| 551 | /* Read IO: make sure to send a reply */ | |||
| 552 | msg.data = handle_io_read(&msg, dev, &intr); | |||
| 553 | msg.data_valid = 1; | |||
| 554 | msg.state = intr; | |||
| 555 | imsg_compose_event(iev, IMSG_DEVOP_MSG, 0, 0, -1, &msg, | |||
| 556 | sizeof(msg)); | |||
| 557 | break; | |||
| 558 | case VIODEV_MSG_IO_WRITE5: | |||
| 559 | /* Write IO: no reply needed */ | |||
| 560 | if (handle_io_write(&msg, dev) == 1) | |||
| 561 | virtio_assert_pic_irq(dev, 0); | |||
| 562 | break; | |||
| 563 | case VIODEV_MSG_SHUTDOWN7: | |||
| 564 | event_del(&dev->sync_iev.ev); | |||
| 565 | event_loopbreak(); | |||
| 566 | return; | |||
| 567 | default: | |||
| 568 | fatalx("%s: invalid msg type %d", __func__, msg.type); | |||
| 569 | } | |||
| 570 | } | |||
| 571 | imsg_event_add(iev); | |||
| 572 | } | |||
| 573 | ||||
| 574 | static int | |||
| 575 | handle_io_write(struct viodev_msg *msg, struct virtio_dev *dev) | |||
| 576 | { | |||
| 577 | struct vioblk_dev *vioblk = &dev->vioblk; | |||
| 578 | uint32_t data = msg->data; | |||
| 579 | int intr = 0; | |||
| 580 | ||||
| 581 | switch (msg->reg) { | |||
| 582 | case VIRTIO_CONFIG_DEVICE_FEATURES0: | |||
| 583 | case VIRTIO_CONFIG_QUEUE_SIZE12: | |||
| 584 | case VIRTIO_CONFIG_ISR_STATUS19: | |||
| 585 | log_warnx("%s: illegal write %x to %s", __progname, data, | |||
| 586 | virtio_reg_name(msg->reg)); | |||
| 587 | break; | |||
| 588 | case VIRTIO_CONFIG_GUEST_FEATURES4: | |||
| 589 | vioblk->cfg.guest_feature = data; | |||
| 590 | break; | |||
| 591 | case VIRTIO_CONFIG_QUEUE_PFN8: | |||
| 592 | vioblk->cfg.queue_pfn = data; | |||
| 593 | vioblk_update_qa(vioblk); | |||
| 594 | break; | |||
| 595 | case VIRTIO_CONFIG_QUEUE_SELECT14: | |||
| 596 | vioblk->cfg.queue_select = data; | |||
| 597 | vioblk_update_qs(vioblk); | |||
| 598 | break; | |||
| 599 | case VIRTIO_CONFIG_QUEUE_NOTIFY16: | |||
| 600 | /* XXX We should be stricter about status checks. */ | |||
| 601 | if (!(vioblk->cfg.device_status & DEVICE_NEEDS_RESET64)) { | |||
| 602 | vioblk->cfg.queue_notify = data; | |||
| 603 | if (vioblk_notifyq(vioblk)) | |||
| 604 | intr = 1; | |||
| 605 | } | |||
| 606 | break; | |||
| 607 | case VIRTIO_CONFIG_DEVICE_STATUS18: | |||
| 608 | vioblk->cfg.device_status = data; | |||
| 609 | if (vioblk->cfg.device_status == 0) { | |||
| 610 | vioblk->cfg.guest_feature = 0; | |||
| 611 | vioblk->cfg.queue_pfn = 0; | |||
| 612 | vioblk_update_qa(vioblk); | |||
| 613 | vioblk->cfg.queue_size = 0; | |||
| 614 | vioblk_update_qs(vioblk); | |||
| 615 | vioblk->cfg.queue_select = 0; | |||
| 616 | vioblk->cfg.queue_notify = 0; | |||
| 617 | vioblk->cfg.isr_status = 0; | |||
| 618 | vioblk->vq[0].last_avail = 0; | |||
| 619 | vioblk->vq[0].notified_avail = 0; | |||
| 620 | virtio_deassert_pic_irq(dev, msg->vcpu); | |||
| 621 | } | |||
| 622 | break; | |||
| 623 | default: | |||
| 624 | break; | |||
| 625 | } | |||
| 626 | return (intr); | |||
| 627 | } | |||
| 628 | ||||
| 629 | static uint32_t | |||
| 630 | handle_io_read(struct viodev_msg *msg, struct virtio_dev *dev, int8_t *intr) | |||
| 631 | { | |||
| 632 | struct vioblk_dev *vioblk = &dev->vioblk; | |||
| 633 | uint8_t sz = msg->io_sz; | |||
| 634 | uint32_t data; | |||
| 635 | ||||
| 636 | if (msg->data_valid) | |||
| 637 | data = msg->data; | |||
| 638 | else | |||
| 639 | data = 0; | |||
| 640 | ||||
| 641 | switch (msg->reg) { | |||
| 642 | case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI20: | |||
| 643 | switch (sz) { | |||
| 644 | case 4: | |||
| 645 | data = (uint32_t)(vioblk->capacity); | |||
| 646 | break; | |||
| 647 | case 2: | |||
| 648 | data &= 0xFFFF0000; | |||
| 649 | data |= (uint32_t)(vioblk->capacity) & 0xFFFF; | |||
| 650 | break; | |||
| 651 | case 1: | |||
| 652 | data &= 0xFFFFFF00; | |||
| 653 | data |= (uint32_t)(vioblk->capacity) & 0xFF; | |||
| 654 | break; | |||
| 655 | } | |||
| 656 | /* XXX handle invalid sz */ | |||
| 657 | break; | |||
| 658 | case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI20 + 1: | |||
| 659 | if (sz == 1) { | |||
| 660 | data &= 0xFFFFFF00; | |||
| 661 | data |= (uint32_t)(vioblk->capacity >> 8) & 0xFF; | |||
| 662 | } | |||
| 663 | /* XXX handle invalid sz */ | |||
| 664 | break; | |||
| 665 | case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI20 + 2: | |||
| 666 | if (sz == 1) { | |||
| 667 | data &= 0xFFFFFF00; | |||
| 668 | data |= (uint32_t)(vioblk->capacity >> 16) & 0xFF; | |||
| 669 | } else if (sz == 2) { | |||
| 670 | data &= 0xFFFF0000; | |||
| 671 | data |= (uint32_t)(vioblk->capacity >> 16) & 0xFFFF; | |||
| 672 | } | |||
| 673 | /* XXX handle invalid sz */ | |||
| 674 | break; | |||
| 675 | case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI20 + 3: | |||
| 676 | if (sz == 1) { | |||
| 677 | data &= 0xFFFFFF00; | |||
| 678 | data |= (uint32_t)(vioblk->capacity >> 24) & 0xFF; | |||
| 679 | } | |||
| 680 | /* XXX handle invalid sz */ | |||
| 681 | break; | |||
| 682 | case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI20 + 4: | |||
| 683 | switch (sz) { | |||
| 684 | case 4: | |||
| 685 | data = (uint32_t)(vioblk->capacity >> 32); | |||
| 686 | break; | |||
| 687 | case 2: | |||
| 688 | data &= 0xFFFF0000; | |||
| 689 | data |= (uint32_t)(vioblk->capacity >> 32) & 0xFFFF; | |||
| 690 | break; | |||
| 691 | case 1: | |||
| 692 | data &= 0xFFFFFF00; | |||
| 693 | data |= (uint32_t)(vioblk->capacity >> 32) & 0xFF; | |||
| 694 | break; | |||
| 695 | } | |||
| 696 | /* XXX handle invalid sz */ | |||
| 697 | break; | |||
| 698 | case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI20 + 5: | |||
| 699 | if (sz == 1) { | |||
| 700 | data &= 0xFFFFFF00; | |||
| 701 | data |= (uint32_t)(vioblk->capacity >> 40) & 0xFF; | |||
| 702 | } | |||
| 703 | /* XXX handle invalid sz */ | |||
| 704 | break; | |||
| 705 | case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI20 + 6: | |||
| 706 | if (sz == 1) { | |||
| 707 | data &= 0xFFFFFF00; | |||
| 708 | data |= (uint32_t)(vioblk->capacity >> 48) & 0xFF; | |||
| 709 | } else if (sz == 2) { | |||
| 710 | data &= 0xFFFF0000; | |||
| 711 | data |= (uint32_t)(vioblk->capacity >> 48) & 0xFFFF; | |||
| 712 | } | |||
| 713 | /* XXX handle invalid sz */ | |||
| 714 | break; | |||
| 715 | case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI20 + 7: | |||
| 716 | if (sz == 1) { | |||
| 717 | data &= 0xFFFFFF00; | |||
| 718 | data |= (uint32_t)(vioblk->capacity >> 56) & 0xFF; | |||
| 719 | } | |||
| 720 | /* XXX handle invalid sz */ | |||
| 721 | break; | |||
| 722 | case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI20 + 12: | |||
| 723 | switch (sz) { | |||
| 724 | case 4: | |||
| 725 | data = (uint32_t)(vioblk->seg_max); | |||
| 726 | break; | |||
| 727 | case 2: | |||
| 728 | data &= 0xFFFF0000; | |||
| 729 | data |= (uint32_t)(vioblk->seg_max) & 0xFFFF; | |||
| 730 | break; | |||
| 731 | case 1: | |||
| 732 | data &= 0xFFFFFF00; | |||
| 733 | data |= (uint32_t)(vioblk->seg_max) & 0xFF; | |||
| 734 | break; | |||
| 735 | } | |||
| 736 | /* XXX handle invalid sz */ | |||
| 737 | break; | |||
| 738 | case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI20 + 13: | |||
| 739 | if (sz == 1) { | |||
| 740 | data &= 0xFFFFFF00; | |||
| 741 | data |= (uint32_t)(vioblk->seg_max >> 8) & 0xFF; | |||
| 742 | } | |||
| 743 | /* XXX handle invalid sz */ | |||
| 744 | break; | |||
| 745 | case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI20 + 14: | |||
| 746 | if (sz == 1) { | |||
| 747 | data &= 0xFFFFFF00; | |||
| 748 | data |= (uint32_t)(vioblk->seg_max >> 16) & 0xFF; | |||
| 749 | } else if (sz == 2) { | |||
| 750 | data &= 0xFFFF0000; | |||
| 751 | data |= (uint32_t)(vioblk->seg_max >> 16) | |||
| 752 | & 0xFFFF; | |||
| 753 | } | |||
| 754 | /* XXX handle invalid sz */ | |||
| 755 | break; | |||
| 756 | case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI20 + 15: | |||
| 757 | if (sz == 1) { | |||
| 758 | data &= 0xFFFFFF00; | |||
| 759 | data |= (uint32_t)(vioblk->seg_max >> 24) & 0xFF; | |||
| 760 | } | |||
| 761 | /* XXX handle invalid sz */ | |||
| 762 | break; | |||
| 763 | case VIRTIO_CONFIG_DEVICE_FEATURES0: | |||
| 764 | data = vioblk->cfg.device_feature; | |||
| 765 | break; | |||
| 766 | case VIRTIO_CONFIG_GUEST_FEATURES4: | |||
| 767 | data = vioblk->cfg.guest_feature; | |||
| 768 | break; | |||
| 769 | case VIRTIO_CONFIG_QUEUE_PFN8: | |||
| 770 | data = vioblk->cfg.queue_pfn; | |||
| 771 | break; | |||
| 772 | case VIRTIO_CONFIG_QUEUE_SIZE12: | |||
| 773 | data = vioblk->cfg.queue_size; | |||
| 774 | break; | |||
| 775 | case VIRTIO_CONFIG_QUEUE_SELECT14: | |||
| 776 | data = vioblk->cfg.queue_select; | |||
| 777 | break; | |||
| 778 | case VIRTIO_CONFIG_QUEUE_NOTIFY16: | |||
| 779 | data = vioblk->cfg.queue_notify; | |||
| 780 | break; | |||
| 781 | case VIRTIO_CONFIG_DEVICE_STATUS18: | |||
| 782 | data = vioblk->cfg.device_status; | |||
| 783 | break; | |||
| 784 | case VIRTIO_CONFIG_ISR_STATUS19: | |||
| 785 | data = vioblk->cfg.isr_status; | |||
| 786 | vioblk->cfg.isr_status = 0; | |||
| 787 | if (intr != NULL((void *)0)) | |||
| 788 | *intr = INTR_STATE_DEASSERT-1; | |||
| 789 | break; | |||
| 790 | default: | |||
| 791 | return (0xFFFFFFFF); | |||
| 792 | } | |||
| 793 | ||||
| 794 | return (data); | |||
| 795 | } | |||
| 796 | ||||
| 797 | /* | |||
| 798 | * Emulate read/write io. Walks the descriptor chain, collecting io work and | |||
| 799 | * then emulates the read or write. | |||
| 800 | * | |||
| 801 | * On success, returns bytes read/written. | |||
| 802 | * On error, returns -1 and descriptor (desc) remains at its current position. | |||
| 803 | */ | |||
| 804 | static ssize_t | |||
| 805 | vioblk_rw(struct vioblk_dev *dev, int is_write, off_t offset, | |||
| 806 | struct vring_desc *desc_tbl, struct vring_desc **desc) | |||
| 807 | { | |||
| 808 | struct iovec *iov = NULL((void *)0); | |||
| 809 | ssize_t sz = 0; | |||
| 810 | size_t io_idx = 0; /* Index into iovec workqueue. */ | |||
| 811 | size_t xfer_sz = 0; /* Total accumulated io bytes. */ | |||
| 812 | ||||
| 813 | do { | |||
| 814 | iov = &io_v[io_idx]; | |||
| 815 | ||||
| 816 | /* | |||
| 817 | * Reads require writable descriptors. Writes require | |||
| 818 | * non-writeable descriptors. | |||
| 819 | */ | |||
| 820 | if ((!is_write) ^ DESC_WRITABLE(*desc)(((*desc)->flags & 2) ? 1 : 0)) { | |||
| 821 | log_warnx("%s: invalid descriptor for %s command", | |||
| 822 | __func__, is_write ? "write" : "read"); | |||
| 823 | return (-1); | |||
| 824 | } | |||
| 825 | ||||
| 826 | /* Collect the IO segment information. */ | |||
| 827 | iov->iov_len = (size_t)(*desc)->len; | |||
| 828 | iov->iov_base = hvaddr_mem((*desc)->addr, iov->iov_len); | |||
| 829 | if (iov->iov_base == NULL((void *)0)) | |||
| 830 | return (-1); | |||
| 831 | ||||
| 832 | /* Move our counters. */ | |||
| 833 | xfer_sz += iov->iov_len; | |||
| 834 | io_idx++; | |||
| 835 | ||||
| 836 | /* Guard against infinite chains */ | |||
| 837 | if (io_idx >= nitems(io_v)(sizeof((io_v)) / sizeof((io_v)[0]))) { | |||
| 838 | log_warnx("%s: descriptor table " | |||
| 839 | "invalid", __func__); | |||
| 840 | return (-1); | |||
| 841 | } | |||
| 842 | ||||
| 843 | /* Advance to the next descriptor. */ | |||
| 844 | *desc = &desc_tbl[(*desc)->next & VIOBLK_QUEUE_MASK(128 - 1)]; | |||
| 845 | } while ((*desc)->flags & VRING_DESC_F_NEXT1); | |||
| 846 | ||||
| 847 | /* | |||
| 848 | * Validate the requested block io operation alignment and size. | |||
| 849 | * Checking offset is just an extra caution as it is derived from | |||
| 850 | * a disk sector and is done for completeness in bounds checking. | |||
| 851 | */ | |||
| 852 | if (offset % VIRTIO_BLK_SECTOR_SIZE512 != 0 && | |||
| 853 | xfer_sz % VIRTIO_BLK_SECTOR_SIZE512 != 0) { | |||
| 854 | log_warnx("%s: unaligned read", __func__); | |||
| 855 | return (-1); | |||
| 856 | } | |||
| 857 | if (xfer_sz > SSIZE_MAX0x7fffffffffffffffL) { /* iovec_copyin limit */ | |||
| 858 | log_warnx("%s: invalid %s size: %zu", __func__, | |||
| 859 | is_write ? "write" : "read", xfer_sz); | |||
| 860 | return (-1); | |||
| 861 | } | |||
| 862 | ||||
| 863 | /* Emulate the Read or Write operation. */ | |||
| 864 | if (is_write) | |||
| 865 | sz = dev->file.pwritev(dev->file.p, io_v, io_idx, offset); | |||
| 866 | else | |||
| 867 | sz = dev->file.preadv(dev->file.p, io_v, io_idx, offset); | |||
| 868 | if (sz != (ssize_t)xfer_sz) { | |||
| 869 | log_warnx("%s: %s failure at offset 0x%llx, xfer_sz=%zu, " | |||
| 870 | "sz=%ld", __func__, (is_write ? "write" : "read"), offset, | |||
| 871 | xfer_sz, sz); | |||
| 872 | return (-1); | |||
| 873 | } | |||
| 874 | ||||
| 875 | return (sz); | |||
| 876 | } |