Bug Summary

File: src/usr.sbin/vmd/vm.c
Warning: line 1191, column 3
Potential leak of memory pointed to by 'tid'
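
The analyzer's path below shows run_vm() in vm.c returning ENOMEM when the calloc() for 'vrp' fails after the earlier calloc() for 'tid' has already succeeded, so 'tid' is never released. The listing's own comments note that the caller exits on these error paths, so the leak is largely cosmetic, but freeing both buffers before returning would silence the warning. A minimal sketch of such a fix follows (an illustration only, not necessarily the change adopted upstream); free(NULL) is defined to be a no-op, so it is safe regardless of which allocation failed:

	tid = calloc(vcp->vcp_ncpus, sizeof(pthread_t));
	vrp = calloc(vcp->vcp_ncpus, sizeof(struct vm_run_params *));
	if (tid == NULL || vrp == NULL) {
		log_warn("%s: memory allocation error - exiting.",
		    __progname);
		free(tid);	/* free(NULL) is a no-op */
		free(vrp);
		return (ENOMEM);
	}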

Annotated Source Code


clang -cc1 -cc1 -triple amd64-unknown-openbsd7.0 -analyze -disable-free -disable-llvm-verifier -discard-value-names -main-file-name vm.c -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -mrelocation-model pic -pic-level 1 -pic-is-pie -mframe-pointer=all -relaxed-aliasing -fno-rounding-math -mconstructor-aliases -munwind-tables -target-cpu x86-64 -target-feature +retpoline-indirect-calls -target-feature +retpoline-indirect-branches -tune-cpu generic -debugger-tuning=gdb -fcoverage-compilation-dir=/usr/src/usr.sbin/vmd/obj -resource-dir /usr/local/lib/clang/13.0.0 -I /usr/src/usr.sbin/vmd -internal-isystem /usr/local/lib/clang/13.0.0/include -internal-externc-isystem /usr/include -O2 -fdebug-compilation-dir=/usr/src/usr.sbin/vmd/obj -ferror-limit 19 -fwrapv -D_RET_PROTECTOR -ret-protector -fgnuc-version=4.2.1 -vectorize-loops -vectorize-slp -fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc -fno-builtin-valloc -fno-builtin-free -fno-builtin-strdup -fno-builtin-strndup -analyzer-output=html -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /home/ben/Projects/vmm/scan-build/2022-01-12-194120-40624-1 -x c /usr/src/usr.sbin/vmd/vm.c
1/* $OpenBSD: vm.c,v 1.67 2021/12/30 08:12:23 claudio Exp $ */
2
3/*
4 * Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org>
5 *
6 * Permission to use, copy, modify, and distribute this software for any
7 * purpose with or without fee is hereby granted, provided that the above
8 * copyright notice and this permission notice appear in all copies.
9 *
10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17 */
18
19#include <sys/param.h> /* PAGE_SIZE */
20#include <sys/types.h>
21#include <sys/ioctl.h>
22#include <sys/queue.h>
23#include <sys/wait.h>
24#include <sys/uio.h>
25#include <sys/stat.h>
26#include <sys/socket.h>
27#include <sys/time.h>
28#include <sys/mman.h>
29
30#include <dev/ic/i8253reg.h>
31#include <dev/isa/isareg.h>
32#include <dev/pci/pcireg.h>
33
34#include <machine/psl.h>
35#include <machine/pte.h>
36#include <machine/specialreg.h>
37#include <machine/vmmvar.h>
38
39#include <net/if.h>
40
41#include <errno.h>
42#include <event.h>
43#include <fcntl.h>
44#include <imsg.h>
45#include <limits.h>
46#include <poll.h>
47#include <pthread.h>
48#include <stddef.h>
49#include <stdio.h>
50#include <stdlib.h>
51#include <string.h>
52#include <unistd.h>
53#include <util.h>
54
55#include "atomicio.h"
56#include "fw_cfg.h"
57#include "i8253.h"
58#include "i8259.h"
59#include "loadfile.h"
60#include "mc146818.h"
61#include "ns8250.h"
62#include "pci.h"
63#include "virtio.h"
64#include "vmd.h"
65#include "vmm.h"
66
67io_fn_t ioports_map[MAX_PORTS65536];
68
69int run_vm(int, int[][VM_MAX_BASE_PER_DISK4], int *,
70 struct vmop_create_params *, struct vcpu_reg_state *);
71void vm_dispatch_vmm(int, short, void *);
72void *event_thread(void *);
73void *vcpu_run_loop(void *);
74int vcpu_exit(struct vm_run_params *);
75int vcpu_reset(uint32_t, uint32_t, struct vcpu_reg_state *);
76void create_memory_map(struct vm_create_params *);
77int alloc_guest_mem(struct vm_create_params *);
78int vmm_create_vm(struct vm_create_params *);
79void init_emulated_hw(struct vmop_create_params *, int,
80 int[][VM_MAX_BASE_PER_DISK4], int *);
81void restore_emulated_hw(struct vm_create_params *, int, int *,
82 int[][VM_MAX_BASE_PER_DISK4],int);
83void vcpu_exit_inout(struct vm_run_params *);
84int vcpu_exit_eptviolation(struct vm_run_params *);
85uint8_t vcpu_exit_pci(struct vm_run_params *);
86int vcpu_pic_intr(uint32_t, uint32_t, uint8_t);
87int loadfile_bios(gzFile, off_t, struct vcpu_reg_state *);
88int send_vm(int, struct vm_create_params *);
89int dump_send_header(int);
90int dump_vmr(int , struct vm_mem_range *);
91int dump_mem(int, struct vm_create_params *);
92void restore_vmr(int, struct vm_mem_range *);
93void restore_mem(int, struct vm_create_params *);
94int restore_vm_params(int, struct vm_create_params *);
95void pause_vm(struct vm_create_params *);
96void unpause_vm(struct vm_create_params *);
97
98int translate_gva(struct vm_exit*, uint64_t, uint64_t *, int);
99
100static struct vm_mem_range *find_gpa_range(struct vm_create_params *, paddr_t,
101 size_t);
102
103int con_fd;
104struct vmd_vm *current_vm;
105
106extern struct vmd *env;
107
108extern char *__progname;
109
110pthread_mutex_t threadmutex;
111pthread_cond_t threadcond;
112
113pthread_cond_t vcpu_run_cond[VMM_MAX_VCPUS_PER_VM64];
114pthread_mutex_t vcpu_run_mtx[VMM_MAX_VCPUS_PER_VM64];
115pthread_barrier_t vm_pause_barrier;
116pthread_cond_t vcpu_unpause_cond[VMM_MAX_VCPUS_PER_VM64];
117pthread_mutex_t vcpu_unpause_mtx[VMM_MAX_VCPUS_PER_VM64];
118uint8_t vcpu_hlt[VMM_MAX_VCPUS_PER_VM64];
119uint8_t vcpu_done[VMM_MAX_VCPUS_PER_VM64];
120
121/*
122 * Represents a standard register set for an OS to be booted
123 * as a flat 64 bit address space.
124 *
125 * NOT set here are:
126 * RIP
127 * RSP
128 * GDTR BASE
129 *
130 * Specific bootloaders should clone this structure and override
131 * those fields as needed.
132 *
133 * Note - CR3 and various bits in CR0 may be overridden by vmm(4) based on
134 * features of the CPU in use.
135 */
136static const struct vcpu_reg_state vcpu_init_flat64 = {
137 .vrs_gprs[VCPU_REGS_RFLAGS17] = 0x2,
138 .vrs_gprs[VCPU_REGS_RIP16] = 0x0,
139 .vrs_gprs[VCPU_REGS_RSP14] = 0x0,
140 .vrs_crs[VCPU_REGS_CR00] = CR0_ET0x00000010 | CR0_PE0x00000001 | CR0_PG0x80000000,
141 .vrs_crs[VCPU_REGS_CR32] = PML4_PAGE0x11000,
142 .vrs_crs[VCPU_REGS_CR43] = CR4_PAE0x00000020 | CR4_PSE0x00000010,
143 .vrs_crs[VCPU_REGS_PDPTE06] = 0ULL,
144 .vrs_crs[VCPU_REGS_PDPTE17] = 0ULL,
145 .vrs_crs[VCPU_REGS_PDPTE28] = 0ULL,
146 .vrs_crs[VCPU_REGS_PDPTE39] = 0ULL,
147 .vrs_sregs[VCPU_REGS_CS0] = { 0x8, 0xFFFFFFFF, 0xC09F, 0x0},
148 .vrs_sregs[VCPU_REGS_DS1] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
149 .vrs_sregs[VCPU_REGS_ES2] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
150 .vrs_sregs[VCPU_REGS_FS3] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
151 .vrs_sregs[VCPU_REGS_GS4] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
152 .vrs_sregs[VCPU_REGS_SS5] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
153 .vrs_gdtr = { 0x0, 0xFFFF, 0x0, 0x0},
154 .vrs_idtr = { 0x0, 0xFFFF, 0x0, 0x0},
155 .vrs_sregs[VCPU_REGS_LDTR6] = { 0x0, 0xFFFF, 0x0082, 0x0},
156 .vrs_sregs[VCPU_REGS_TR7] = { 0x0, 0xFFFF, 0x008B, 0x0},
157 .vrs_msrs[VCPU_REGS_EFER0] = EFER_LME0x00000100 | EFER_LMA0x00000400,
158 .vrs_drs[VCPU_REGS_DR00] = 0x0,
159 .vrs_drs[VCPU_REGS_DR11] = 0x0,
160 .vrs_drs[VCPU_REGS_DR22] = 0x0,
161 .vrs_drs[VCPU_REGS_DR33] = 0x0,
162 .vrs_drs[VCPU_REGS_DR64] = 0xFFFF0FF0,
163 .vrs_drs[VCPU_REGS_DR75] = 0x400,
164 .vrs_msrs[VCPU_REGS_STAR1] = 0ULL,
165 .vrs_msrs[VCPU_REGS_LSTAR2] = 0ULL,
166 .vrs_msrs[VCPU_REGS_CSTAR3] = 0ULL,
167 .vrs_msrs[VCPU_REGS_SFMASK4] = 0ULL,
168 .vrs_msrs[VCPU_REGS_KGSBASE5] = 0ULL,
169 .vrs_msrs[VCPU_REGS_MISC_ENABLE6] = 0ULL,
170 .vrs_crs[VCPU_REGS_XCR05] = XCR0_X870x00000001
171};
172
173/*
174 * Represents a standard register set for a BIOS to be booted
175 * as a flat 16 bit address space.
176 */
177static const struct vcpu_reg_state vcpu_init_flat16 = {
178 .vrs_gprs[VCPU_REGS_RFLAGS17] = 0x2,
179 .vrs_gprs[VCPU_REGS_RIP16] = 0xFFF0,
180 .vrs_gprs[VCPU_REGS_RSP14] = 0x0,
181 .vrs_crs[VCPU_REGS_CR00] = 0x60000010,
182 .vrs_crs[VCPU_REGS_CR32] = 0,
183 .vrs_sregs[VCPU_REGS_CS0] = { 0xF000, 0xFFFF, 0x809F, 0xF0000},
184 .vrs_sregs[VCPU_REGS_DS1] = { 0x0, 0xFFFF, 0x8093, 0x0},
185 .vrs_sregs[VCPU_REGS_ES2] = { 0x0, 0xFFFF, 0x8093, 0x0},
186 .vrs_sregs[VCPU_REGS_FS3] = { 0x0, 0xFFFF, 0x8093, 0x0},
187 .vrs_sregs[VCPU_REGS_GS4] = { 0x0, 0xFFFF, 0x8093, 0x0},
188 .vrs_sregs[VCPU_REGS_SS5] = { 0x0, 0xFFFF, 0x8093, 0x0},
189 .vrs_gdtr = { 0x0, 0xFFFF, 0x0, 0x0},
190 .vrs_idtr = { 0x0, 0xFFFF, 0x0, 0x0},
191 .vrs_sregs[VCPU_REGS_LDTR6] = { 0x0, 0xFFFF, 0x0082, 0x0},
192 .vrs_sregs[VCPU_REGS_TR7] = { 0x0, 0xFFFF, 0x008B, 0x0},
193 .vrs_msrs[VCPU_REGS_EFER0] = 0ULL,
194 .vrs_drs[VCPU_REGS_DR00] = 0x0,
195 .vrs_drs[VCPU_REGS_DR11] = 0x0,
196 .vrs_drs[VCPU_REGS_DR22] = 0x0,
197 .vrs_drs[VCPU_REGS_DR33] = 0x0,
198 .vrs_drs[VCPU_REGS_DR64] = 0xFFFF0FF0,
199 .vrs_drs[VCPU_REGS_DR75] = 0x400,
200 .vrs_msrs[VCPU_REGS_STAR1] = 0ULL,
201 .vrs_msrs[VCPU_REGS_LSTAR2] = 0ULL,
202 .vrs_msrs[VCPU_REGS_CSTAR3] = 0ULL,
203 .vrs_msrs[VCPU_REGS_SFMASK4] = 0ULL,
204 .vrs_msrs[VCPU_REGS_KGSBASE5] = 0ULL,
205 .vrs_crs[VCPU_REGS_XCR05] = XCR0_X870x00000001
206};
207
208/*
209 * loadfile_bios
210 *
211 * Alternatively to loadfile_elf, this function loads a non-ELF BIOS image
212 * directly into memory.
213 *
214 * Parameters:
215 * fp: file of a kernel file to load
216 * size: uncompressed size of the image
217 * (out) vrs: register state to set on init for this kernel
218 *
219 * Return values:
220 * 0 if successful
221 * various error codes returned from read(2) or loadelf functions
222 */
223int
224loadfile_bios(gzFile fp, off_t size, struct vcpu_reg_state *vrs)
225{
226 off_t off;
227
228 /* Set up a "flat 16 bit" register state for BIOS */
229 memcpy(vrs, &vcpu_init_flat16, sizeof(*vrs));
230
231 /* Seek to the beginning of the BIOS image */
232 if (gzseek(fp, 0, SEEK_SET0) == -1)
233 return (-1);
234
235 /* The BIOS image must end at 1M */
236 if ((off = 1048576 - size) < 0)
237 return (-1);
238
239 /* Read BIOS image into memory */
240 if (mread(fp, off, size) != (size_t)size) {
241 errno(*__errno()) = EIO5;
242 return (-1);
243 }
244
245 log_debug("%s: loaded BIOS image", __func__);
246
247 return (0);
248}
249
250/*
251 * start_vm
252 *
253 * After forking a new VM process, starts the new VM with the creation
254 * parameters supplied (in the incoming vm->vm_params field). This
255 * function performs a basic sanity check on the incoming parameters
256 * and then performs the following steps to complete the creation of the VM:
257 *
258 * 1. validates and creates the new VM
259 * 2. opens the imsg control channel to the parent and drops more privilege
260 * 3. drops additional privileges by calling pledge(2)
261 * 4. loads the kernel from the disk image or file descriptor
262 * 5. runs the VM's VCPU loops.
263 *
264 * Parameters:
265 * vm: The VM data structure that includes the VM create parameters.
266 * fd: The imsg socket that is connected to the parent process.
267 *
268 * Return values:
269 * 0: success
270 * !0 : failure - typically an errno indicating the source of the failure
271 */
272int
273start_vm(struct vmd_vm *vm, int fd)
274{
275 struct vmop_create_params *vmc = &vm->vm_params;
276 struct vm_create_params *vcp = &vmc->vmc_params;
277 struct vcpu_reg_state vrs;
278 int nicfds[VMM_MAX_NICS_PER_VM4];
279 int ret;
280 gzFile fp;
281 size_t i;
282 struct vm_rwregs_params vrp;
283 struct stat sb;
284
285 /* Child */
286 setproctitle("%s", vcp->vcp_name);
287 log_procinit(vcp->vcp_name);
288
289 if (!(vm->vm_state & VM_STATE_RECEIVED0x08))
290 create_memory_map(vcp);
291
292 ret = alloc_guest_mem(vcp);
293
294 if (ret) {
295 errno(*__errno()) = ret;
296 fatal("could not allocate guest memory - exiting");
297 }
298
299 ret = vmm_create_vm(vcp);
300 current_vm = vm;
301
302 /* send back the kernel-generated vm id (0 on error) */
303 if (write(fd, &vcp->vcp_id, sizeof(vcp->vcp_id)) !=
304 sizeof(vcp->vcp_id))
305 fatal("write vcp id");
306
307 if (ret) {
308 errno(*__errno()) = ret;
309 fatal("create vmm ioctl failed - exiting");
310 }
311
312 /*
313 * pledge in the vm processes:
314 * stdio - for malloc and basic I/O including events.
315 * recvfd - for send/recv.
316 * vmm - for the vmm ioctls and operations.
317 */
318 if (pledge("stdio vmm recvfd", NULL((void*)0)) == -1)
319 fatal("pledge");
320
321 if (vm->vm_state & VM_STATE_RECEIVED0x08) {
322 ret = read(vm->vm_receive_fd, &vrp, sizeof(vrp));
323 if (ret != sizeof(vrp)) {
324 fatal("received incomplete vrp - exiting");
325 }
326 vrs = vrp.vrwp_regs;
327 } else {
328 /*
329 * Set up default "flat 64 bit" register state - RIP,
330 * RSP, and GDT info will be set in bootloader
331 */
332 memcpy(&vrs, &vcpu_init_flat64, sizeof(vrs));
333
334 /* Find and open kernel image */
335 if ((fp = gzdopen(vm->vm_kernel, "r")) == NULL((void*)0))
336 fatalx("failed to open kernel - exiting");
337
338 /* Load kernel image */
339 ret = loadfile_elf(fp, vcp, &vrs, vmc->vmc_bootdevice);
340
341 /*
342 * Try BIOS as a fallback (only if it was provided as an image
343 * with vm->vm_kernel and the file is not compressed)
344 */
345 if (ret && errno(*__errno()) == ENOEXEC8 && vm->vm_kernel != -1 &&
346 gzdirect(fp) && (ret = fstat(vm->vm_kernel, &sb)) == 0)
347 ret = loadfile_bios(fp, sb.st_size, &vrs);
348
349 if (ret)
350 fatal("failed to load kernel or BIOS - exiting");
351
352 gzclose(fp);
353 }
354
355 if (vm->vm_kernel != -1)
356 close(vm->vm_kernel);
357
358 con_fd = vm->vm_tty;
359 if (fcntl(con_fd, F_SETFL4, O_NONBLOCK0x0004) == -1)
360 fatal("failed to set nonblocking mode on console");
361
362 for (i = 0; i < VMM_MAX_NICS_PER_VM4; i++)
363 nicfds[i] = vm->vm_ifs[i].vif_fd;
364
365 event_init();
366
367 if (vm->vm_state & VM_STATE_RECEIVED0x08) {
368 restore_emulated_hw(vcp, vm->vm_receive_fd, nicfds,
369 vm->vm_disks, vm->vm_cdrom);
370 restore_mem(vm->vm_receive_fd, vcp);
371 if (restore_vm_params(vm->vm_receive_fd, vcp))
372 fatal("restore vm params failed");
373 unpause_vm(vcp);
374 }
375
376 if (vmm_pipe(vm, fd, vm_dispatch_vmm) == -1)
377 fatal("setup vm pipe");
378
379 /* Execute the vcpu run loop(s) for this VM */
380 ret = run_vm(vm->vm_cdrom, vm->vm_disks, nicfds, &vm->vm_params, &vrs);
381
382 /* Ensure that any in-flight data is written back */
383 virtio_shutdown(vm);
384
385 return (ret);
386}
387
388/*
389 * vm_dispatch_vmm
390 *
391 * imsg callback for messages that are received from the vmm parent process.
392 */
393void
394vm_dispatch_vmm(int fd, short event, void *arg)
395{
396 struct vmd_vm *vm = arg;
397 struct vmop_result vmr;
398 struct vmop_addr_result var;
399 struct imsgev *iev = &vm->vm_iev;
400 struct imsgbuf *ibuf = &iev->ibuf;
401 struct imsg imsg;
402 ssize_t n;
403 int verbose;
404
405 if (event & EV_READ0x02) {
406 if ((n = imsg_read(ibuf)) == -1 && errno(*__errno()) != EAGAIN35)
407 fatal("%s: imsg_read", __func__);
408 if (n == 0)
409 _exit(0);
410 }
411
412 if (event & EV_WRITE0x04) {
413 if ((n = msgbuf_write(&ibuf->w)) == -1 && errno(*__errno()) != EAGAIN35)
414 fatal("%s: msgbuf_write fd %d", __func__, ibuf->fd);
415 if (n == 0)
416 _exit(0);
417 }
418
419 for (;;) {
420 if ((n = imsg_get(ibuf, &imsg)) == -1)
421 fatal("%s: imsg_get", __func__);
422 if (n == 0)
423 break;
424
425#if DEBUG > 1
426 log_debug("%s: got imsg %d from %s",
427 __func__, imsg.hdr.type,
428 vm->vm_params.vmc_params.vcp_name);
429#endif
430
431 switch (imsg.hdr.type) {
432 case IMSG_CTL_VERBOSE:
433 IMSG_SIZE_CHECK(&imsg, &verbose);
434 memcpy(&verbose, imsg.data, sizeof(verbose));
435 log_setverbose(verbose);
436 break;
437 case IMSG_VMDOP_VM_SHUTDOWN:
438 if (vmmci_ctl(VMMCI_SHUTDOWN) == -1)
439 _exit(0);
440 break;
441 case IMSG_VMDOP_VM_REBOOT:
442 if (vmmci_ctl(VMMCI_REBOOT) == -1)
443 _exit(0);
444 break;
445 case IMSG_VMDOP_PAUSE_VM:
446 vmr.vmr_result = 0;
447 vmr.vmr_id = vm->vm_vmid;
448 pause_vm(&vm->vm_params.vmc_params);
449 imsg_compose_event(&vm->vm_iev,
450 IMSG_VMDOP_PAUSE_VM_RESPONSE,
451 imsg.hdr.peerid, imsg.hdr.pid, -1, &vmr,
452 sizeof(vmr));
453 break;
454 case IMSG_VMDOP_UNPAUSE_VM:
455 vmr.vmr_result = 0;
456 vmr.vmr_id = vm->vm_vmid;
457 unpause_vm(&vm->vm_params.vmc_params);
458 imsg_compose_event(&vm->vm_iev,
459 IMSG_VMDOP_UNPAUSE_VM_RESPONSE,
460 imsg.hdr.peerid, imsg.hdr.pid, -1, &vmr,
461 sizeof(vmr));
462 break;
463 case IMSG_VMDOP_SEND_VM_REQUEST:
464 vmr.vmr_id = vm->vm_vmid;
465 vmr.vmr_result = send_vm(imsg.fd,
466 &vm->vm_params.vmc_params);
467 imsg_compose_event(&vm->vm_iev,
468 IMSG_VMDOP_SEND_VM_RESPONSE,
469 imsg.hdr.peerid, imsg.hdr.pid, -1, &vmr,
470 sizeof(vmr));
471 if (!vmr.vmr_result) {
472 imsg_flush(&current_vm->vm_iev.ibuf);
473 _exit(0);
474 }
475 break;
476 case IMSG_VMDOP_PRIV_GET_ADDR_RESPONSE:
477 IMSG_SIZE_CHECK(&imsg, &var);
478 memcpy(&var, imsg.data, sizeof(var));
479
480 log_debug("%s: received tap addr %s for nic %d",
481 vm->vm_params.vmc_params.vcp_name,
482 ether_ntoa((void *)var.var_addr), var.var_nic_idx);
483
484 vionet_set_hostmac(vm, var.var_nic_idx, var.var_addr);
485 break;
486 default:
487 fatalx("%s: got invalid imsg %d from %s",
488 __func__, imsg.hdr.type,
489 vm->vm_params.vmc_params.vcp_name);
490 }
491 imsg_free(&imsg);
492 }
493 imsg_event_add(iev);
494}
495
496/*
497 * vm_shutdown
498 *
499 * Tell the vmm parent process to shutdown or reboot the VM and exit.
500 */
501__dead__attribute__((__noreturn__)) void
502vm_shutdown(unsigned int cmd)
503{
504 switch (cmd) {
505 case VMMCI_NONE:
506 case VMMCI_SHUTDOWN:
507 (void)imsg_compose_event(&current_vm->vm_iev,
508 IMSG_VMDOP_VM_SHUTDOWN, 0, 0, -1, NULL((void*)0), 0);
509 break;
510 case VMMCI_REBOOT:
511 (void)imsg_compose_event(&current_vm->vm_iev,
512 IMSG_VMDOP_VM_REBOOT, 0, 0, -1, NULL((void*)0), 0);
513 break;
514 default:
515 fatalx("invalid vm ctl command: %d", cmd);
516 }
517 imsg_flush(&current_vm->vm_iev.ibuf);
518
519 _exit(0);
520}
521
522int
523send_vm(int fd, struct vm_create_params *vcp)
524{
525 struct vm_rwregs_params vrp;
526 struct vm_rwvmparams_params vpp;
527 struct vmop_create_params *vmc;
528 struct vm_terminate_params vtp;
529 unsigned int flags = 0;
530 unsigned int i;
531 int ret = 0;
532 size_t sz;
533
534 if (dump_send_header(fd)) {
535 log_info("%s: failed to send vm dump header", __func__);
536 goto err;
537 }
538
539 pause_vm(vcp);
540
541 vmc = calloc(1, sizeof(struct vmop_create_params));
542 if (vmc == NULL((void*)0)) {
543 log_warn("%s: calloc error getting vmc", __func__);
544 ret = -1;
545 goto err;
546 }
547
548 flags |= VMOP_CREATE_MEMORY0x04;
549 memcpy(&vmc->vmc_params, &current_vm->vm_params, sizeof(struct
550 vmop_create_params));
551 vmc->vmc_flags = flags;
552 vrp.vrwp_vm_id = vcp->vcp_id;
553 vrp.vrwp_mask = VM_RWREGS_ALL(0x1 | 0x2 | 0x4 | 0x8 | 0x10);
554 vpp.vpp_mask = VM_RWVMPARAMS_ALL(0x1 | 0x2);
555 vpp.vpp_vm_id = vcp->vcp_id;
556
557 sz = atomicio(vwrite(ssize_t (*)(int, void *, size_t))write, fd, vmc,sizeof(struct vmop_create_params));
558 if (sz != sizeof(struct vmop_create_params)) {
559 ret = -1;
560 goto err;
561 }
562
563 for (i = 0; i < vcp->vcp_ncpus; i++) {
564 vrp.vrwp_vcpu_id = i;
565 if ((ret = ioctl(env->vmd_fd, VMM_IOC_READREGS, &vrp))) {
566 log_warn("%s: readregs failed", __func__);
567 goto err;
568 }
569
570 sz = atomicio(vwrite(ssize_t (*)(int, void *, size_t))write, fd, &vrp,
571 sizeof(struct vm_rwregs_params));
572 if (sz != sizeof(struct vm_rwregs_params)) {
573 log_warn("%s: dumping registers failed", __func__);
574 ret = -1;
575 goto err;
576 }
577 }
578
579 if ((ret = i8253_dump(fd)))
580 goto err;
581 if ((ret = i8259_dump(fd)))
582 goto err;
583 if ((ret = ns8250_dump(fd)))
584 goto err;
585 if ((ret = mc146818_dump(fd)))
586 goto err;
587 if ((ret = fw_cfg_dump(fd)))
588 goto err;
589 if ((ret = pci_dump(fd)))
590 goto err;
591 if ((ret = virtio_dump(fd)))
592 goto err;
593 if ((ret = dump_mem(fd, vcp)))
594 goto err;
595
596 for (i = 0; i < vcp->vcp_ncpus; i++) {
597 vpp.vpp_vcpu_id = i;
598 if ((ret = ioctl(env->vmd_fd, VMM_IOC_READVMPARAMS, &vpp))) {
599 log_warn("%s: readvmparams failed", __func__);
600 goto err;
601 }
602
603 sz = atomicio(vwrite(ssize_t (*)(int, void *, size_t))write, fd, &vpp,
604 sizeof(struct vm_rwvmparams_params));
605 if (sz != sizeof(struct vm_rwvmparams_params)) {
606 log_warn("%s: dumping vm params failed", __func__);
607 ret = -1;
608 goto err;
609 }
610 }
611
612 vtp.vtp_vm_id = vcp->vcp_id;
613 if (ioctl(env->vmd_fd, VMM_IOC_TERM, &vtp) == -1) {
614 log_warnx("%s: term IOC error: %d, %d", __func__,
615 errno(*__errno()), ENOENT2);
616 }
617err:
618 close(fd);
619 if (ret)
620 unpause_vm(vcp);
621 return ret;
622}
623
624int
625dump_send_header(int fd) {
626 struct vm_dump_header vmh;
627 int i;
628
629 memcpy(&vmh.vmh_signature, VM_DUMP_SIGNATURE"OpenBSDVMM58",
630 sizeof(vmh.vmh_signature));
631
632 vmh.vmh_cpuids[0].code = 0x00;
633 vmh.vmh_cpuids[0].leaf = 0x00;
634
635 vmh.vmh_cpuids[1].code = 0x01;
636 vmh.vmh_cpuids[1].leaf = 0x00;
637
638 vmh.vmh_cpuids[2].code = 0x07;
639 vmh.vmh_cpuids[2].leaf = 0x00;
640
641 vmh.vmh_cpuids[3].code = 0x0d;
642 vmh.vmh_cpuids[3].leaf = 0x00;
643
644 vmh.vmh_cpuids[4].code = 0x80000001;
645 vmh.vmh_cpuids[4].leaf = 0x00;
646
647 vmh.vmh_version = VM_DUMP_VERSION7;
648
649 for (i=0; i < VM_DUMP_HEADER_CPUID_COUNT5; i++) {
650 CPUID_LEAF(vmh.vmh_cpuids[i].code,
651 vmh.vmh_cpuids[i].leaf,
652 vmh.vmh_cpuids[i].a,
653 vmh.vmh_cpuids[i].b,
654 vmh.vmh_cpuids[i].c,
655 vmh.vmh_cpuids[i].d);
656 }
657
658 if (atomicio(vwrite(ssize_t (*)(int, void *, size_t))write, fd, &vmh, sizeof(vmh)) != sizeof(vmh))
659 return (-1);
660
661 return (0);
662}
663
664int
665dump_mem(int fd, struct vm_create_params *vcp)
666{
667 unsigned int i;
668 int ret;
669 struct vm_mem_range *vmr;
670
671 for (i = 0; i < vcp->vcp_nmemranges; i++) {
672 vmr = &vcp->vcp_memranges[i];
673 ret = dump_vmr(fd, vmr);
674 if (ret)
675 return ret;
676 }
677 return (0);
678}
679
680int
681restore_vm_params(int fd, struct vm_create_params *vcp) {
682 unsigned int i;
683 struct vm_rwvmparams_params vpp;
684
685 for (i = 0; i < vcp->vcp_ncpus; i++) {
686 if (atomicio(read, fd, &vpp, sizeof(vpp)) != sizeof(vpp)) {
687 log_warn("%s: error restoring vm params", __func__);
688 return (-1);
689 }
690 vpp.vpp_vm_id = vcp->vcp_id;
691 vpp.vpp_vcpu_id = i;
692 if (ioctl(env->vmd_fd, VMM_IOC_WRITEVMPARAMS, &vpp) < 0) {
693 log_debug("%s: writing vm params failed", __func__);
694 return (-1);
695 }
696 }
697 return (0);
698}
699
700void
701restore_mem(int fd, struct vm_create_params *vcp)
702{
703 unsigned int i;
704 struct vm_mem_range *vmr;
705
706 for (i = 0; i < vcp->vcp_nmemranges; i++) {
707 vmr = &vcp->vcp_memranges[i];
708 restore_vmr(fd, vmr);
709 }
710}
711
712int
713dump_vmr(int fd, struct vm_mem_range *vmr)
714{
715 size_t rem = vmr->vmr_size, read=0;
716 char buf[PAGE_SIZE(1 << 12)];
717
718 while (rem > 0) {
719 if (read_mem(vmr->vmr_gpa + read, buf, PAGE_SIZE(1 << 12))) {
720 log_warn("failed to read vmr");
721 return (-1);
722 }
723 if (atomicio(vwrite(ssize_t (*)(int, void *, size_t))write, fd, buf, sizeof(buf)) != sizeof(buf)) {
724 log_warn("failed to dump vmr");
725 return (-1);
726 }
727 rem = rem - PAGE_SIZE(1 << 12);
728 read = read + PAGE_SIZE(1 << 12);
729 }
730 return (0);
731}
732
733void
734restore_vmr(int fd, struct vm_mem_range *vmr)
735{
736 size_t rem = vmr->vmr_size, wrote=0;
737 char buf[PAGE_SIZE(1 << 12)];
738
739 while (rem > 0) {
740 if (atomicio(read, fd, buf, sizeof(buf)) != sizeof(buf))
741 fatal("failed to restore vmr");
742 if (write_mem(vmr->vmr_gpa + wrote, buf, PAGE_SIZE(1 << 12)))
743 fatal("failed to write vmr");
744 rem = rem - PAGE_SIZE(1 << 12);
745 wrote = wrote + PAGE_SIZE(1 << 12);
746 }
747}
748
749void
750pause_vm(struct vm_create_params *vcp)
751{
752 unsigned int n;
753 int ret;
754 if (current_vm->vm_state & VM_STATE_PAUSED0x10)
755 return;
756
757 current_vm->vm_state |= VM_STATE_PAUSED0x10;
758
759 ret = pthread_barrier_init(&vm_pause_barrier, NULL((void*)0), vcp->vcp_ncpus + 1);
760 if (ret) {
761 log_warnx("%s: cannot initialize pause barrier (%d)",
762 __progname, ret);
763 return;
764 }
765
766 for (n = 0; n < vcp->vcp_ncpus; n++) {
767 ret = pthread_cond_broadcast(&vcpu_run_cond[n]);
768 if (ret) {
769 log_warnx("%s: can't broadcast vcpu run cond (%d)",
770 __func__, (int)ret);
771 return;
772 }
773 }
774 ret = pthread_barrier_wait(&vm_pause_barrier);
775 if (ret != 0 && ret != PTHREAD_BARRIER_SERIAL_THREAD-1) {
776 log_warnx("%s: could not wait on pause barrier (%d)",
777 __func__, (int)ret);
778 return;
779 }
780
781 ret = pthread_barrier_destroy(&vm_pause_barrier);
782 if (ret) {
783 log_warnx("%s: could not destroy pause barrier (%d)",
784 __progname, ret);
785 return;
786 }
787
788 i8253_stop();
789 mc146818_stop();
790 ns8250_stop();
791 virtio_stop(vcp);
792}
793
794void
795unpause_vm(struct vm_create_params *vcp)
796{
797 unsigned int n;
798 int ret;
799 if (!(current_vm->vm_state & VM_STATE_PAUSED0x10))
800 return;
801
802 current_vm->vm_state &= ~VM_STATE_PAUSED0x10;
803 for (n = 0; n < vcp->vcp_ncpus; n++) {
804 ret = pthread_cond_broadcast(&vcpu_unpause_cond[n]);
805 if (ret) {
806 log_warnx("%s: can't broadcast vcpu unpause cond (%d)",
807 __func__, (int)ret);
808 return;
809 }
810 }
811
812 i8253_start();
813 mc146818_start();
814 ns8250_start();
815 virtio_start(vcp);
816}
817
818/*
819 * vcpu_reset
820 *
821 * Requests vmm(4) to reset the VCPUs in the indicated VM to
822 * the register state provided
823 *
824 * Parameters
825 * vmid: VM ID to reset
826 * vcpu_id: VCPU ID to reset
827 * vrs: the register state to initialize
828 *
829 * Return values:
830 * 0: success
831 * !0 : ioctl to vmm(4) failed (eg, ENOENT if the supplied VM ID is not
832 * valid)
833 */
834int
835vcpu_reset(uint32_t vmid, uint32_t vcpu_id, struct vcpu_reg_state *vrs)
836{
837 struct vm_resetcpu_params vrp;
838
839 memset(&vrp, 0, sizeof(vrp));
840 vrp.vrp_vm_id = vmid;
841 vrp.vrp_vcpu_id = vcpu_id;
842 memcpy(&vrp.vrp_init_state, vrs, sizeof(struct vcpu_reg_state));
843
844 log_debug("%s: resetting vcpu %d for vm %d", __func__, vcpu_id, vmid);
845
846 if (ioctl(env->vmd_fd, VMM_IOC_RESETCPU, &vrp) == -1)
847 return (errno(*__errno()));
848
849 return (0);
850}
851
852/*
853 * create_memory_map
854 *
855 * Sets up the guest physical memory ranges that the VM can access.
856 *
857 * Parameters:
858 * vcp: VM create parameters describing the VM whose memory map
859 * is being created
860 *
861 * Return values:
862 * nothing
863 */
864void
865create_memory_map(struct vm_create_params *vcp)
866{
867 size_t len, mem_bytes, mem_mb;
868
869 mem_mb = vcp->vcp_memranges[0].vmr_size;
870 vcp->vcp_nmemranges = 0;
871 if (mem_mb < 1 || mem_mb > VMM_MAX_VM_MEM_SIZE32768)
872 return;
873
874 mem_bytes = mem_mb * 1024 * 1024;
875
876 /* First memory region: 0 - LOWMEM_KB (DOS low mem) */
877 len = LOWMEM_KB640 * 1024;
878 vcp->vcp_memranges[0].vmr_gpa = 0x0;
879 vcp->vcp_memranges[0].vmr_size = len;
880 mem_bytes -= len;
881
882 /*
883 * Second memory region: LOWMEM_KB - 1MB.
884 *
885 * N.B. - Normally ROMs or parts of video RAM are mapped here.
886 * We have to add this region, because some systems
887 * unconditionally write to 0xb8000 (VGA RAM), and
888 * we need to make sure that vmm(4) permits accesses
889 * to it. So allocate guest memory for it.
890 */
891 len = 0x100000 - LOWMEM_KB640 * 1024;
892 vcp->vcp_memranges[1].vmr_gpa = LOWMEM_KB640 * 1024;
893 vcp->vcp_memranges[1].vmr_size = len;
894 mem_bytes -= len;
895
896 /* Make sure that we do not place physical memory into MMIO ranges. */
897 if (mem_bytes > VMM_PCI_MMIO_BAR_BASE0xF0000000ULL - 0x100000)
898 len = VMM_PCI_MMIO_BAR_BASE0xF0000000ULL - 0x100000;
899 else
900 len = mem_bytes;
901
902 /* Third memory region: 1MB - (1MB + len) */
903 vcp->vcp_memranges[2].vmr_gpa = 0x100000;
904 vcp->vcp_memranges[2].vmr_size = len;
905 mem_bytes -= len;
906
907 if (mem_bytes > 0) {
908 /* Fourth memory region for the remaining memory (if any) */
909 vcp->vcp_memranges[3].vmr_gpa = VMM_PCI_MMIO_BAR_END0xFFFFFFFFULL + 1;
910 vcp->vcp_memranges[3].vmr_size = mem_bytes;
911 vcp->vcp_nmemranges = 4;
912 } else
913 vcp->vcp_nmemranges = 3;
914}
915
916/*
917 * alloc_guest_mem
918 *
919 * Allocates memory for the guest.
920 * Instead of doing a single allocation with one mmap(), we allocate memory
921 * separately for every range for the following reasons:
922 * - ASLR for the individual ranges
923 * - to reduce memory consumption in the UVM subsystem: if vmm(4) had to
924 * map the single mmap'd userspace memory to the individual guest physical
925 * memory ranges, the underlying amap of the single mmap'd range would have
926 * to allocate per-page reference counters. The reason is that the
927 * individual guest physical ranges would reference the single mmap'd region
928 * only partially. However, if every guest physical range has its own
929 * corresponding mmap'd userspace allocation, there are no partial
930 * references: every guest physical range fully references an mmap'd
931 * range => no per-page reference counters have to be allocated.
932 *
933 * Return values:
934 * 0: success
935 * !0: failure - errno indicating the source of the failure
936 */
937int
938alloc_guest_mem(struct vm_create_params *vcp)
939{
940 void *p;
941 int ret;
942 size_t i, j;
943 struct vm_mem_range *vmr;
944
945 for (i = 0; i < vcp->vcp_nmemranges; i++) {
946 vmr = &vcp->vcp_memranges[i];
947 p = mmap(NULL((void*)0), vmr->vmr_size, PROT_READ0x01 | PROT_WRITE0x02,
948 MAP_PRIVATE0x0002 | MAP_ANON0x1000, -1, 0);
949 if (p == MAP_FAILED((void *)-1)) {
950 ret = errno(*__errno());
951 for (j = 0; j < i; j++) {
952 vmr = &vcp->vcp_memranges[j];
953 munmap((void *)vmr->vmr_va, vmr->vmr_size);
954 }
955
956 return (ret);
957 }
958
959 vmr->vmr_va = (vaddr_t)p;
960 }
961
962 return (0);
963}
964
965/*
966 * vmm_create_vm
967 *
968 * Requests vmm(4) to create a new VM using the supplied creation
969 * parameters. This operation results in the creation of the in-kernel
970 * structures for the VM, but does not start the VM's vcpu(s).
971 *
972 * Parameters:
973 * vcp: vm_create_params struct containing the VM's desired creation
974 * configuration
975 *
976 * Return values:
977 * 0: success
978 * !0 : ioctl to vmm(4) failed
979 */
980int
981vmm_create_vm(struct vm_create_params *vcp)
982{
983 /* Sanity check arguments */
984 if (vcp->vcp_ncpus > VMM_MAX_VCPUS_PER_VM64)
985 return (EINVAL22);
986
987 if (vcp->vcp_nmemranges == 0 ||
988 vcp->vcp_nmemranges > VMM_MAX_MEM_RANGES16)
989 return (EINVAL22);
990
991 if (vcp->vcp_ndisks > VMM_MAX_DISKS_PER_VM4)
992 return (EINVAL22);
993
994 if (vcp->vcp_nnics > VMM_MAX_NICS_PER_VM4)
995 return (EINVAL22);
996
997 if (ioctl(env->vmd_fd, VMM_IOC_CREATE, vcp) == -1)
998 return (errno(*__errno()));
999
1000 return (0);
1001}
1002
1003/*
1004 * init_emulated_hw
1005 *
1006 * Initializes the userspace hardware emulation
1007 */
1008void
1009init_emulated_hw(struct vmop_create_params *vmc, int child_cdrom,
1010 int child_disks[][VM_MAX_BASE_PER_DISK4], int *child_taps)
1011{
1012 struct vm_create_params *vcp = &vmc->vmc_params;
1013 int i;
1014 uint64_t memlo, memhi;
1015
1016 /* Calculate memory size for NVRAM registers */
1017 memlo = memhi = 0;
1018 if (vcp->vcp_nmemranges > 2)
1019 memlo = vcp->vcp_memranges[2].vmr_size - 15 * 0x100000;
1020
1021 if (vcp->vcp_nmemranges > 3)
1022 memhi = vcp->vcp_memranges[3].vmr_size;
1023
1024 /* Reset the IO port map */
1025 memset(&ioports_map, 0, sizeof(io_fn_t) * MAX_PORTS65536);
1026
1027 /* Init i8253 PIT */
1028 i8253_init(vcp->vcp_id);
1029 ioports_map[TIMER_CTRL0x43] = vcpu_exit_i8253;
1030 ioports_map[TIMER_BASE0x40 + TIMER_CNTR00] = vcpu_exit_i8253;
1031 ioports_map[TIMER_BASE0x40 + TIMER_CNTR11] = vcpu_exit_i8253;
1032 ioports_map[TIMER_BASE0x40 + TIMER_CNTR22] = vcpu_exit_i8253;
1033 ioports_map[PCKBC_AUX0x61] = vcpu_exit_i8253_misc;
1034
1035 /* Init mc146818 RTC */
1036 mc146818_init(vcp->vcp_id, memlo, memhi);
1037 ioports_map[IO_RTC0x070] = vcpu_exit_mc146818;
1038 ioports_map[IO_RTC0x070 + 1] = vcpu_exit_mc146818;
1039
1040 /* Init master and slave PICs */
1041 i8259_init();
1042 ioports_map[IO_ICU10x020] = vcpu_exit_i8259;
1043 ioports_map[IO_ICU10x020 + 1] = vcpu_exit_i8259;
1044 ioports_map[IO_ICU20x0A0] = vcpu_exit_i8259;
1045 ioports_map[IO_ICU20x0A0 + 1] = vcpu_exit_i8259;
1046 ioports_map[ELCR00x4D0] = vcpu_exit_elcr;
1047 ioports_map[ELCR10x4D1] = vcpu_exit_elcr;
1048
1049 /* Init ns8250 UART */
1050 ns8250_init(con_fd, vcp->vcp_id);
1051 for (i = COM1_DATA0x3f8 +0; i <= COM1_SCR0x3f8 +7; i++)
1052 ioports_map[i] = vcpu_exit_com;
1053
1054 /* Init QEMU fw_cfg interface */
1055 fw_cfg_init(vmc);
1056 ioports_map[FW_CFG_IO_SELECT0x510] = vcpu_exit_fw_cfg;
1057 ioports_map[FW_CFG_IO_DATA0x511] = vcpu_exit_fw_cfg;
1058 ioports_map[FW_CFG_IO_DMA_ADDR_HIGH0x514] = vcpu_exit_fw_cfg_dma;
1059 ioports_map[FW_CFG_IO_DMA_ADDR_LOW0x518] = vcpu_exit_fw_cfg_dma;
1060
1061 /* Initialize PCI */
1062 for (i = VMM_PCI_IO_BAR_BASE0x1000; i <= VMM_PCI_IO_BAR_END0xFFFF; i++)
1063 ioports_map[i] = vcpu_exit_pci;
1064
1065 ioports_map[PCI_MODE1_ADDRESS_REG0x0cf8] = vcpu_exit_pci;
1066 ioports_map[PCI_MODE1_DATA_REG0x0cfc] = vcpu_exit_pci;
1067 ioports_map[PCI_MODE1_DATA_REG0x0cfc + 1] = vcpu_exit_pci;
1068 ioports_map[PCI_MODE1_DATA_REG0x0cfc + 2] = vcpu_exit_pci;
1069 ioports_map[PCI_MODE1_DATA_REG0x0cfc + 3] = vcpu_exit_pci;
1070 pci_init();
1071
1072 /* Initialize virtio devices */
1073 virtio_init(current_vm, child_cdrom, child_disks, child_taps);
1074}
1075/*
1076 * restore_emulated_hw
1077 *
1078 * Restores the userspace hardware emulation from fd
1079 */
1080void
1081restore_emulated_hw(struct vm_create_params *vcp, int fd,
1082 int *child_taps, int child_disks[][VM_MAX_BASE_PER_DISK4], int child_cdrom)
1083{
1084 /* struct vm_create_params *vcp = &vmc->vmc_params; */
1085 int i;
1086 memset(&ioports_map, 0, sizeof(io_fn_t) * MAX_PORTS65536);
1087
1088 /* Init i8253 PIT */
1089 i8253_restore(fd, vcp->vcp_id);
1090 ioports_map[TIMER_CTRL0x43] = vcpu_exit_i8253;
1091 ioports_map[TIMER_BASE0x40 + TIMER_CNTR00] = vcpu_exit_i8253;
1092 ioports_map[TIMER_BASE0x40 + TIMER_CNTR11] = vcpu_exit_i8253;
1093 ioports_map[TIMER_BASE0x40 + TIMER_CNTR22] = vcpu_exit_i8253;
1094
1095 /* Init master and slave PICs */
1096 i8259_restore(fd);
1097 ioports_map[IO_ICU10x020] = vcpu_exit_i8259;
1098 ioports_map[IO_ICU10x020 + 1] = vcpu_exit_i8259;
1099 ioports_map[IO_ICU20x0A0] = vcpu_exit_i8259;
1100 ioports_map[IO_ICU20x0A0 + 1] = vcpu_exit_i8259;
1101
1102 /* Init ns8250 UART */
1103 ns8250_restore(fd, con_fd, vcp->vcp_id);
1104 for (i = COM1_DATA0x3f8 +0; i <= COM1_SCR0x3f8 +7; i++)
1105 ioports_map[i] = vcpu_exit_com;
1106
1107 /* Init mc146818 RTC */
1108 mc146818_restore(fd, vcp->vcp_id);
1109 ioports_map[IO_RTC0x070] = vcpu_exit_mc146818;
1110 ioports_map[IO_RTC0x070 + 1] = vcpu_exit_mc146818;
1111
1112 /* Init QEMU fw_cfg interface */
1113 fw_cfg_restore(fd);
1114 ioports_map[FW_CFG_IO_SELECT0x510] = vcpu_exit_fw_cfg;
1115 ioports_map[FW_CFG_IO_DATA0x511] = vcpu_exit_fw_cfg;
1116 ioports_map[FW_CFG_IO_DMA_ADDR_HIGH0x514] = vcpu_exit_fw_cfg_dma;
1117 ioports_map[FW_CFG_IO_DMA_ADDR_LOW0x518] = vcpu_exit_fw_cfg_dma;
1118
1119 /* Initialize PCI */
1120 for (i = VMM_PCI_IO_BAR_BASE0x1000; i <= VMM_PCI_IO_BAR_END0xFFFF; i++)
1121 ioports_map[i] = vcpu_exit_pci;
1122
1123 ioports_map[PCI_MODE1_ADDRESS_REG0x0cf8] = vcpu_exit_pci;
1124 ioports_map[PCI_MODE1_DATA_REG0x0cfc] = vcpu_exit_pci;
1125 ioports_map[PCI_MODE1_DATA_REG0x0cfc + 1] = vcpu_exit_pci;
1126 ioports_map[PCI_MODE1_DATA_REG0x0cfc + 2] = vcpu_exit_pci;
1127 ioports_map[PCI_MODE1_DATA_REG0x0cfc + 3] = vcpu_exit_pci;
1128 pci_restore(fd);
1129 virtio_restore(fd, current_vm, child_cdrom, child_disks, child_taps);
1130}
1131
1132/*
1133 * run_vm
1134 *
1135 * Runs the VM whose creation parameters are specified in vcp
1136 *
1137 * Parameters:
1138 * child_cdrom: previously-opened child ISO disk file descriptor
1139 * child_disks: previously-opened child VM disk file file descriptors
1140 * child_taps: previously-opened child tap file descriptors
1141 * vmc: vmop_create_params struct containing the VM's desired creation
1142 * configuration
1143 * vrs: VCPU register state to initialize
1144 *
1145 * Return values:
1146 * 0: the VM exited normally
1147 * !0 : the VM exited abnormally or failed to start
1148 */
1149int
1150run_vm(int child_cdrom, int child_disks[][VM_MAX_BASE_PER_DISK4],
1151 int *child_taps, struct vmop_create_params *vmc,
1152 struct vcpu_reg_state *vrs)
1153{
1154 struct vm_create_params *vcp = &vmc->vmc_params;
1155 struct vm_rwregs_params vregsp;
1156 uint8_t evdone = 0;
1157 size_t i;
1158 int ret;
1159 pthread_t *tid, evtid;
1160 struct vm_run_params **vrp;
1161 void *exit_status;
1162
1163 if (vcp == NULL((void*)0))
1. Assuming 'vcp' is not equal to NULL
2. Taking false branch
1164 return (EINVAL22);
1165
1166 if (child_cdrom == -1 && strlen(vcp->vcp_cdrom))
3. Assuming the condition is false
1167 return (EINVAL22);
1168
1169 if (child_disks == NULL((void*)0) && vcp->vcp_ndisks != 0)
4. Assuming 'child_disks' is not equal to NULL
1170 return (EINVAL22);
1171
1172 if (child_taps == NULL((void*)0) && vcp->vcp_nnics != 0)
5. Assuming 'child_taps' is not equal to NULL
1173 return (EINVAL22);
1174
1175 if (vcp->vcp_ncpus > VMM_MAX_VCPUS_PER_VM64)
6. Assuming field 'vcp_ncpus' is <= VMM_MAX_VCPUS_PER_VM
7. Taking false branch
1176 return (EINVAL22);
1177
1178 if (vcp->vcp_ndisks > VMM_MAX_DISKS_PER_VM4)
8. Assuming field 'vcp_ndisks' is <= VMM_MAX_DISKS_PER_VM
9. Taking false branch
1179 return (EINVAL22);
1180
1181 if (vcp->vcp_nnics > VMM_MAX_NICS_PER_VM4)
10. Assuming field 'vcp_nnics' is <= VMM_MAX_NICS_PER_VM
11. Taking false branch
1182 return (EINVAL22);
1183
1184 if (vcp->vcp_nmemranges == 0 ||
12. Assuming field 'vcp_nmemranges' is not equal to 0
14. Taking false branch
1185 vcp->vcp_nmemranges > VMM_MAX_MEM_RANGES16)
13. Assuming field 'vcp_nmemranges' is <= VMM_MAX_MEM_RANGES
1186 return (EINVAL22);
1187
1188 tid = calloc(vcp->vcp_ncpus, sizeof(pthread_t));
15. Memory is allocated
1189 vrp = calloc(vcp->vcp_ncpus, sizeof(struct vm_run_params *));
1190 if (tid == NULL((void*)0) || vrp == NULL((void*)0)) {
16. Assuming 'tid' is not equal to NULL
17. Assuming 'vrp' is equal to NULL
18. Taking true branch
1191 log_warn("%s: memory allocation error - exiting.",
19. Potential leak of memory pointed to by 'tid'
1192 __progname);
1193 return (ENOMEM12);
1194 }
1195
1196 log_debug("%s: initializing hardware for vm %s", __func__,
1197 vcp->vcp_name);
1198
1199 if (!(current_vm->vm_state & VM_STATE_RECEIVED0x08))
1200 init_emulated_hw(vmc, child_cdrom, child_disks, child_taps);
1201
1202 ret = pthread_mutex_init(&threadmutex, NULL((void*)0));
1203 if (ret) {
1204 log_warn("%s: could not initialize thread state mutex",
1205 __func__);
1206 return (ret);
1207 }
1208 ret = pthread_cond_init(&threadcond, NULL((void*)0));
1209 if (ret) {
1210 log_warn("%s: could not initialize thread state "
1211 "condition variable", __func__);
1212 return (ret);
1213 }
1214
1215 mutex_lock(&threadmutex);
1216
1217 log_debug("%s: starting vcpu threads for vm %s", __func__,
1218 vcp->vcp_name);
1219
1220 /*
1221 * Create and launch one thread for each VCPU. These threads may
1222 * migrate between PCPUs over time; the need to reload CPU state
1223 * in such situations is detected and performed by vmm(4) in the
1224 * kernel.
1225 */
1226 for (i = 0 ; i < vcp->vcp_ncpus; i++) {
1227 vrp[i] = malloc(sizeof(struct vm_run_params));
1228 if (vrp[i] == NULL((void*)0)) {
1229 log_warn("%s: memory allocation error - "
1230 "exiting.", __progname);
1231 /* caller will exit, so skip freeing */
1232 return (ENOMEM12);
1233 }
1234 vrp[i]->vrp_exit = malloc(sizeof(struct vm_exit));
1235 if (vrp[i]->vrp_exit == NULL((void*)0)) {
1236 log_warn("%s: memory allocation error - "
1237 "exiting.", __progname);
1238 /* caller will exit, so skip freeing */
1239 return (ENOMEM12);
1240 }
1241 vrp[i]->vrp_vm_id = vcp->vcp_id;
1242 vrp[i]->vrp_vcpu_id = i;
1243
1244 if (vcpu_reset(vcp->vcp_id, i, vrs)) {
1245 log_warnx("%s: cannot reset VCPU %zu - exiting.",
1246 __progname, i);
1247 return (EIO5);
1248 }
1249
1250 /* once more because reset_cpu changes regs */
1251 if (current_vm->vm_state & VM_STATE_RECEIVED0x08) {
1252 vregsp.vrwp_vm_id = vcp->vcp_id;
1253 vregsp.vrwp_vcpu_id = i;
1254 vregsp.vrwp_regs = *vrs;
1255 vregsp.vrwp_mask = VM_RWREGS_ALL(0x1 | 0x2 | 0x4 | 0x8 | 0x10);
1256 if ((ret = ioctl(env->vmd_fd, VMM_IOC_WRITEREGS,
1257 &vregsp)) == -1) {
1258 log_warn("%s: writeregs failed", __func__);
1259 return (ret);
1260 }
1261 }
1262
1263 ret = pthread_cond_init(&vcpu_run_cond[i], NULL((void*)0));
1264 if (ret) {
1265 log_warnx("%s: cannot initialize cond var (%d)",
1266 __progname, ret);
1267 return (ret);
1268 }
1269
1270 ret = pthread_mutex_init(&vcpu_run_mtx[i], NULL((void*)0));
1271 if (ret) {
1272 log_warnx("%s: cannot initialize mtx (%d)",
1273 __progname, ret);
1274 return (ret);
1275 }
1276
1277 ret = pthread_cond_init(&vcpu_unpause_cond[i], NULL((void*)0));
1278 if (ret) {
1279 log_warnx("%s: cannot initialize unpause var (%d)",
1280 __progname, ret);
1281 return (ret);
1282 }
1283
1284 ret = pthread_mutex_init(&vcpu_unpause_mtx[i], NULL((void*)0));
1285 if (ret) {
1286 log_warnx("%s: cannot initialize unpause mtx (%d)",
1287 __progname, ret);
1288 return (ret);
1289 }
1290
1291 vcpu_hlt[i] = 0;
1292
1293 /* Start each VCPU run thread at vcpu_run_loop */
1294 ret = pthread_create(&tid[i], NULL((void*)0), vcpu_run_loop, vrp[i]);
1295 if (ret) {
1296 /* caller will _exit after this return */
1297 ret = errno(*__errno());
1298 log_warn("%s: could not create vcpu thread %zu",
1299 __func__, i);
1300 return (ret);
1301 }
1302 }
1303
1304 log_debug("%s: waiting on events for VM %s", __func__, vcp->vcp_name);
1305 ret = pthread_create(&evtid, NULL((void*)0), event_thread, &evdone);
1306 if (ret) {
1307 errno(*__errno()) = ret;
1308 log_warn("%s: could not create event thread", __func__);
1309 return (ret);
1310 }
1311
1312 for (;;) {
1313 ret = pthread_cond_wait(&threadcond, &threadmutex);
1314 if (ret) {
1315 log_warn("%s: waiting on thread state condition "
1316 "variable failed", __func__);
1317 return (ret);
1318 }
1319
1320 /*
1321 * Did a VCPU thread exit with an error? => return the first one
1322 */
1323 for (i = 0; i < vcp->vcp_ncpus; i++) {
1324 if (vcpu_done[i] == 0)
1325 continue;
1326
1327 if (pthread_join(tid[i], &exit_status)) {
1328 log_warn("%s: failed to join thread %zd - "
1329 "exiting", __progname, i);
1330 return (EIO5);
1331 }
1332
1333 ret = (intptr_t)exit_status;
1334 }
1335
1336 /* Did the event thread exit? => return with an error */
1337 if (evdone) {
1338 if (pthread_join(evtid, &exit_status)) {
1339 log_warn("%s: failed to join event thread - "
1340 "exiting", __progname);
1341 return (EIO5);
1342 }
1343
1344 log_warnx("%s: vm %d event thread exited "
1345 "unexpectedly", __progname, vcp->vcp_id);
1346 return (EIO5);
1347 }
1348
1349 /* Did all VCPU threads exit successfully? => return */
1350 for (i = 0; i < vcp->vcp_ncpus; i++) {
1351 if (vcpu_done[i] == 0)
1352 break;
1353 }
1354 if (i == vcp->vcp_ncpus)
1355 return (ret);
1356
1357 /* Some more threads to wait for, start over */
1358 }
1359
1360 return (ret);
1361}
1362
1363void *
1364event_thread(void *arg)
1365{
1366 uint8_t *donep = arg;
1367 intptr_t ret;
1368
1369 ret = event_dispatch();
1370
1371 mutex_lock(&threadmutex);
1372 *donep = 1;
1373 pthread_cond_signal(&threadcond);
1374 mutex_unlock(&threadmutex);
1375
1376 return (void *)ret;
1377 }
1378
1379/*
1380 * vcpu_run_loop
1381 *
1382 * Runs a single VCPU until vmm(4) requires help handling an exit,
1383 * or the VM terminates.
1384 *
1385 * Parameters:
1386 * arg: vcpu_run_params for the VCPU being run by this thread
1387 *
1388 * Return values:
1389 * NULL: the VCPU shutdown properly
1390 * !NULL: error processing VCPU run, or the VCPU shutdown abnormally
1391 */
1392void *
1393vcpu_run_loop(void *arg)
1394{
1395 struct vm_run_params *vrp = (struct vm_run_params *)arg;
1396 intptr_t ret = 0;
1397 int irq;
1398 uint32_t n;
1399
1400 vrp->vrp_continue = 0;
1401 n = vrp->vrp_vcpu_id;
1402
1403 for (;;) {
1404 ret = pthread_mutex_lock(&vcpu_run_mtx[n]);
1405
1406 if (ret) {
1407 log_warnx("%s: can't lock vcpu run mtx (%d)",
1408 __func__, (int)ret);
1409 return ((void *)ret);
1410 }
1411
1412 /* If we are halted and need to pause, pause */
1413 if (vcpu_hlt[n] && (current_vm->vm_state & VM_STATE_PAUSED0x10)) {
1414 ret = pthread_barrier_wait(&vm_pause_barrier);
1415 if (ret != 0 && ret != PTHREAD_BARRIER_SERIAL_THREAD-1) {
1416 log_warnx("%s: could not wait on pause barrier (%d)",
1417 __func__, (int)ret);
1418 return ((void *)ret);
1419 }
1420
1421 ret = pthread_mutex_lock(&vcpu_unpause_mtx[n]);
1422 if (ret) {
1423 log_warnx("%s: can't lock vcpu unpause mtx (%d)",
1424 __func__, (int)ret);
1425 return ((void *)ret);
1426 }
1427
1428 ret = pthread_cond_wait(&vcpu_unpause_cond[n],
1429 &vcpu_unpause_mtx[n]);
1430 if (ret) {
1431 log_warnx(
1432 "%s: can't wait on unpause cond (%d)",
1433 __func__, (int)ret);
1434 break;
1435 }
1436 ret = pthread_mutex_unlock(&vcpu_unpause_mtx[n]);
1437 if (ret) {
1438 log_warnx("%s: can't unlock unpause mtx (%d)",
1439 __func__, (int)ret);
1440 break;
1441 }
1442 }
1443
1444 /* If we are halted and not paused, wait */
1445 if (vcpu_hlt[n]) {
1446 ret = pthread_cond_wait(&vcpu_run_cond[n],
1447 &vcpu_run_mtx[n]);
1448
1449 if (ret) {
1450 log_warnx(
1451 "%s: can't wait on cond (%d)",
1452 __func__, (int)ret);
1453 (void)pthread_mutex_unlock(
1454 &vcpu_run_mtx[n]);
1455 break;
1456 }
1457 }
1458
1459 ret = pthread_mutex_unlock(&vcpu_run_mtx[n]);
1460
1461 if (ret) {
1462 log_warnx("%s: can't unlock mutex on cond (%d)",
1463 __func__, (int)ret);
1464 break;
1465 }
1466
1467 if (vrp->vrp_irqready && i8259_is_pending()) {
1468 irq = i8259_ack();
1469 vrp->vrp_irq = irq;
1470 } else
1471 vrp->vrp_irq = 0xFFFF;
1472
1473 /* Still more pending? */
1474 if (i8259_is_pending()) {
1475 /*
1476 * XXX can probably avoid ioctls here by providing intr
1477 * in vrp
1478 */
1479 if (vcpu_pic_intr(vrp->vrp_vm_id,
1480 vrp->vrp_vcpu_id, 1)) {
1481 fatal("can't set INTR");
1482 }
1483 } else {
1484 if (vcpu_pic_intr(vrp->vrp_vm_id,
1485 vrp->vrp_vcpu_id, 0)) {
1486 fatal("can't clear INTR");
1487 }
1488 }
1489
1490 if (ioctl(env->vmd_fd, VMM_IOC_RUN, vrp) == -1) {
1491 /* If run ioctl failed, exit */
1492 ret = errno(*__errno());
1493 log_warn("%s: vm %d / vcpu %d run ioctl failed",
1494 __func__, vrp->vrp_vm_id, n);
1495 break;
1496 }
1497
1498 /* If the VM is terminating, exit normally */
1499 if (vrp->vrp_exit_reason == VM_EXIT_TERMINATED0xFFFE) {
1500 ret = (intptr_t)NULL((void*)0);
1501 break;
1502 }
1503
1504 if (vrp->vrp_exit_reason != VM_EXIT_NONE0xFFFF) {
1505 /*
1506 * vmm(4) needs help handling an exit, handle in
1507 * vcpu_exit.
1508 */
1509 ret = vcpu_exit(vrp);
1510 if (ret)
1511 break;
1512 }
1513 }
1514
1515 mutex_lock(&threadmutex);
1516 vcpu_done[n] = 1;
1517 pthread_cond_signal(&threadcond);
1518 mutex_unlock(&threadmutex);
1519
1520 return ((void *)ret);
1521}
1522
1523int
1524vcpu_pic_intr(uint32_t vm_id, uint32_t vcpu_id, uint8_t intr)
1525{
1526 struct vm_intr_params vip;
1527
1528 memset(&vip, 0, sizeof(vip));
1529
1530 vip.vip_vm_id = vm_id;
1531 vip.vip_vcpu_id = vcpu_id; /* XXX always 0? */
1532 vip.vip_intr = intr;
1533
1534 if (ioctl(env->vmd_fd, VMM_IOC_INTR, &vip) == -1)
1535 return (errno(*__errno()));
1536
1537 return (0);
1538}
1539
1540/*
1541 * vcpu_exit_pci
1542 *
1543 * Handle all I/O to the emulated PCI subsystem.
1544 *
1545 * Parameters:
1546 * vrp: vcpu run parameters containing guest state for this exit
1547 *
1548 * Return value:
1549 * Interrupt to inject to the guest VM, or 0xFF if no interrupt should
1550 * be injected.
1551 */
1552uint8_t
1553vcpu_exit_pci(struct vm_run_params *vrp)
1554{
1555 struct vm_exit *vei = vrp->vrp_exit;
1556 uint8_t intr;
1557
1558 intr = 0xFF;
1559
1560 switch (vei->vei.vei_port) {
1561 case PCI_MODE1_ADDRESS_REG0x0cf8:
1562 pci_handle_address_reg(vrp);
1563 break;
1564 case PCI_MODE1_DATA_REG0x0cfc:
1565 case PCI_MODE1_DATA_REG0x0cfc + 1:
1566 case PCI_MODE1_DATA_REG0x0cfc + 2:
1567 case PCI_MODE1_DATA_REG0x0cfc + 3:
1568 pci_handle_data_reg(vrp);
1569 break;
1570 case VMM_PCI_IO_BAR_BASE0x1000 ... VMM_PCI_IO_BAR_END0xFFFF:
1571 intr = pci_handle_io(vrp);
1572 break;
1573 default:
1574 log_warnx("%s: unknown PCI register 0x%llx",
1575 __progname, (uint64_t)vei->vei.vei_port);
1576 break;
1577 }
1578
1579 return (intr);
1580}
1581
1582/*
1583 * vcpu_exit_inout
1584 *
1585 * Handle all I/O exits that need to be emulated in vmd. This includes the
1586 * i8253 PIT, the com1 ns8250 UART, and the MC146818 RTC/NVRAM device.
1587 *
1588 * Parameters:
1589 * vrp: vcpu run parameters containing guest state for this exit
1590 */
1591void
1592vcpu_exit_inout(struct vm_run_params *vrp)
1593{
1594 struct vm_exit *vei = vrp->vrp_exit;
1595 uint8_t intr = 0xFF;
1596
1597 if (ioports_map[vei->vei.vei_port] != NULL((void*)0))
1598 intr = ioports_map[vei->vei.vei_port](vrp);
1599 else if (vei->vei.vei_dir == VEI_DIR_IN)
1600 set_return_data(vei, 0xFFFFFFFF);
1601
1602 if (intr != 0xFF)
1603 vcpu_assert_pic_irq(vrp->vrp_vm_id, vrp->vrp_vcpu_id, intr);
1604}
1605
1606/*
1607 * vcpu_exit_eptviolation
1608 *
1609 * handle an EPT Violation
1610 *
1611 * Parameters:
1612 * vrp: vcpu run parameters containing guest state for this exit
1613 *
1614 * Return values:
1615 * 0: no action required
1616 * EAGAIN: a protection fault occurred, kill the vm.
1617 */
1618int
1619vcpu_exit_eptviolation(struct vm_run_params *vrp)
1620{
1621 struct vm_exit *ve = vrp->vrp_exit;
1622
1623 /*
1624 * vmd may be exiting to vmd to handle a pending interrupt
1625 * but last exit type may have been VMX_EXIT_EPT_VIOLATION,
1626 * check the fault_type to ensure we really are processing
1627 * a VMX_EXIT_EPT_VIOLATION.
1628 */
1629 if (ve->vee.vee_fault_type == VEE_FAULT_PROTECT) {
1630 log_debug("%s: EPT Violation: rip=0x%llx",
1631 __progname, vrp->vrp_exit->vrs.vrs_gprs[VCPU_REGS_RIP16]);
1632 return (EAGAIN35);
1633 }
1634
1635 return (0);
1636}
1637
1638/*
1639 * vcpu_exit
1640 *
1641 * Handle a vcpu exit. This function is called when it is determined that
1642 * vmm(4) requires the assistance of vmd to support a particular guest
1643 * exit type (eg, accessing an I/O port or device). Guest state is contained
1644 * in 'vrp', and will be resent to vmm(4) on exit completion.
1645 *
1646 * Upon conclusion of handling the exit, the function determines if any
1647 * interrupts should be injected into the guest, and asserts the proper
1648 * IRQ line whose interrupt should be vectored.
1649 *
1650 * Parameters:
1651 * vrp: vcpu run parameters containing guest state for this exit
1652 *
1653 * Return values:
1654 * 0: the exit was handled successfully
1655 * 1: an error occurred (eg, unknown exit reason passed in 'vrp')
1656 */
1657int
1658vcpu_exit(struct vm_run_params *vrp)
1659{
1660 int ret;
1661
1662 switch (vrp->vrp_exit_reason) {
1663 case VMX_EXIT_INT_WINDOW7:
1664 case SVM_VMEXIT_VINTR0x64:
1665 case VMX_EXIT_CPUID10:
1666 case VMX_EXIT_EXTINT1:
1667 case SVM_VMEXIT_INTR0x60:
1668 case SVM_VMEXIT_NPF0x400:
1669 case SVM_VMEXIT_MSR0x7C:
1670 case SVM_VMEXIT_CPUID0x72:
1671 /*
1672 * We may be exiting to vmd to handle a pending interrupt but
1673 * at the same time the last exit type may have been one of
1674 * these. In this case, there's nothing extra to be done
1675 * here (and falling through to the default case below results
1676 * in more vmd log spam).
1677 */
1678 break;
1679 case VMX_EXIT_EPT_VIOLATION48:
1680 ret = vcpu_exit_eptviolation(vrp);
1681 if (ret)
1682 return (ret);
1683
1684 break;
1685 case VMX_EXIT_IO30:
1686 case SVM_VMEXIT_IOIO0x7B:
1687 vcpu_exit_inout(vrp);
1688 break;
1689 case VMX_EXIT_HLT12:
1690 case SVM_VMEXIT_HLT0x78:
1691 ret = pthread_mutex_lock(&vcpu_run_mtx[vrp->vrp_vcpu_id]);
1692 if (ret) {
1693 log_warnx("%s: can't lock vcpu mutex (%d)",
1694 __func__, ret);
1695 return (ret);
1696 }
1697 vcpu_hlt[vrp->vrp_vcpu_id] = 1;
1698 ret = pthread_mutex_unlock(&vcpu_run_mtx[vrp->vrp_vcpu_id]);
1699 if (ret) {
1700 log_warnx("%s: can't unlock vcpu mutex (%d)",
1701 __func__, ret);
1702 return (ret);
1703 }
1704 break;
1705 case VMX_EXIT_TRIPLE_FAULT2:
1706 case SVM_VMEXIT_SHUTDOWN0x7F:
1707 /* reset VM */
1708 return (EAGAIN35);
1709 default:
1710 log_debug("%s: unknown exit reason 0x%x",
1711 __progname, vrp->vrp_exit_reason);
1712 }
1713
1714 vrp->vrp_continue = 1;
1715
1716 return (0);
1717}
1718
1719/*
1720 * find_gpa_range
1721 *
1722 * Search for a contiguous guest physical mem range.
1723 *
1724 * Parameters:
1725 * vcp: VM create parameters that contain the memory map to search in
1726 * gpa: the starting guest physical address
1727 * len: the length of the memory range
1728 *
1729 * Return values:
1730 * NULL: on failure if there is no memory range as described by the parameters
1731 * Pointer to vm_mem_range that contains the start of the range otherwise.
1732 */
1733static struct vm_mem_range *
1734find_gpa_range(struct vm_create_params *vcp, paddr_t gpa, size_t len)
1735{
1736 size_t i, n;
1737 struct vm_mem_range *vmr;
1738
1739 /* Find the first vm_mem_range that contains gpa */
1740 for (i = 0; i < vcp->vcp_nmemranges; i++) {
1741 vmr = &vcp->vcp_memranges[i];
1742 if (vmr->vmr_gpa + vmr->vmr_size >= gpa)
1743 break;
1744 }
1745
1746 /* No range found. */
1747 if (i == vcp->vcp_nmemranges)
1748 return (NULL((void*)0));
1749
1750 /*
1751 * vmr may cover the range [gpa, gpa + len) only partly. Make
1752 * sure that the following vm_mem_ranges are contiguous and
1753 * cover the rest.
1754 */
1755 n = vmr->vmr_size - (gpa - vmr->vmr_gpa);
1756 if (len < n)
1757 len = 0;
1758 else
1759 len -= n;
1760 gpa = vmr->vmr_gpa + vmr->vmr_size;
1761 for (i = i + 1; len != 0 && i < vcp->vcp_nmemranges; i++) {
1762 vmr = &vcp->vcp_memranges[i];
1763 if (gpa != vmr->vmr_gpa)
1764 return (NULL);
1765 if (len <= vmr->vmr_size)
1766 len = 0;
1767 else
1768 len -= vmr->vmr_size;
1769
1770 gpa = vmr->vmr_gpa + vmr->vmr_size;
1771 }
1772
1773 if (len != 0)
1774 return (NULL);
1775
1776 return (vmr);
1777}
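/*
 * Editor's note: a minimal sketch of the contiguity rule enforced above,
 * using a hypothetical two-range memory map. The helper name, the range
 * layout and the probed span are invented for illustration only; they are
 * not part of vm.c.
 */
static struct vm_mem_range *
example_probe_gpa_span(struct vm_create_params *vcp)
{
	/*
	 * Assume vcp describes two adjacent ranges, e.g.
	 *   [0x00000, 0xc0000) and [0xc0000, 0x100000).
	 * A 0x2000-byte span straddling the boundary is still resolved,
	 * because the second range starts exactly where the first ends;
	 * a gap between the ranges would make find_gpa_range() return NULL.
	 */
	return (find_gpa_range(vcp, 0xbf000, 0x2000));
}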
1778
1779/*
1780 * write_mem
1781 *
1782 * Copies data from 'buf' into the guest VM's memory at paddr 'dst'.
1783 *
1784 * Parameters:
1785 * dst: the destination paddr_t in the guest VM
1786 * buf: data to copy (or NULL to zero the data)
1787 * len: number of bytes to copy
1788 *
1789 * Return values:
1790 * 0: success
1791 * EINVAL: if the guest physical memory range [dst, dst + len) does not
1792 * exist in the guest.
1793 */
1794int
1795write_mem(paddr_t dst, const void *buf, size_t len)
1796{
1797 const char *from = buf;
1798 char *to;
1799 size_t n, off;
1800 struct vm_mem_range *vmr;
1801
1802 vmr = find_gpa_range(&current_vm->vm_params.vmc_params, dst, len);
1803 if (vmr == NULL) {
1804 errno = EINVAL;
1805 log_warn("%s: failed - invalid memory range dst = 0x%lx, "
1806 "len = 0x%zx", __func__, dst, len);
1807 return (EINVAL);
1808 }
1809
1810 off = dst - vmr->vmr_gpa;
1811 while (len != 0) {
1812 n = vmr->vmr_size - off;
1813 if (len < n)
1814 n = len;
1815
1816 to = (char *)vmr->vmr_va + off;
1817 if (buf == NULL)
1818 memset(to, 0, n);
1819 else {
1820 memcpy(to, from, n);
1821 from += n;
1822 }
1823 len -= n;
1824 off = 0;
1825 vmr++;
1826 }
1827
1828 return (0);
1829}
1830
1831/*
1832 * read_mem
1833 *
1834 * Reads memory at guest paddr 'src' into 'buf'.
1835 *
1836 * Parameters:
1837 * src: the source paddr_t in the guest VM to read from.
1838 * buf: destination (local) buffer
1839 * len: number of bytes to read
1840 *
1841 * Return values:
1842 * 0: success
1843 * EINVAL: if the guest physical memory range [dst, dst + len) does not
1844 * exist in the guest.
1845 */
1846int
1847read_mem(paddr_t src, void *buf, size_t len)
1848{
1849 char *from, *to = buf;
1850 size_t n, off;
1851 struct vm_mem_range *vmr;
1852
1853 vmr = find_gpa_range(&current_vm->vm_params.vmc_params, src, len);
1854 if (vmr == NULL) {
1855 errno = EINVAL;
1856 log_warn("%s: failed - invalid memory range src = 0x%lx, "
1857 "len = 0x%zx", __func__, src, len);
1858 return (EINVAL);
1859 }
1860
1861 off = src - vmr->vmr_gpa;
1862 while (len != 0) {
1863 n = vmr->vmr_size - off;
1864 if (len < n)
1865 n = len;
1866
1867 from = (char *)vmr->vmr_va + off;
1868 memcpy(to, from, n);
1869
1870 to += n;
1871 len -= n;
1872 off = 0;
1873 vmr++;
1874 }
1875
1876 return (0);
1877}
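/*
 * Editor's note: a hedged usage sketch pairing read_mem() and write_mem().
 * The helper name, the 4-byte fetch and the 16-byte zeroing are invented
 * for illustration; only the two calls and the buf == NULL "zero fill"
 * behaviour come from the functions above.
 */
static int
example_fetch_and_clear(paddr_t gpa)
{
	uint32_t word;

	/* Copy 4 bytes out of guest memory into a local buffer. */
	if (read_mem(gpa, &word, sizeof(word)))
		return (EINVAL);

	/* Zero the 16 guest bytes that follow (NULL buffer means memset 0). */
	if (write_mem(gpa + sizeof(word), NULL, 16))
		return (EINVAL);

	return (0);
}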
1878
1879/*
1880 * vcpu_assert_pic_irq
1881 *
1882 * Injects the specified IRQ on the supplied vcpu/vm
1883 *
1884 * Parameters:
1885 * vm_id: VM ID to inject to
1886 * vcpu_id: VCPU ID to inject to
1887 * irq: IRQ to inject
1888 */
1889void
1890vcpu_assert_pic_irq(uint32_t vm_id, uint32_t vcpu_id, int irq)
1891{
1892 int ret;
1893
1894 i8259_assert_irq(irq);
1895
1896 if (i8259_is_pending()) {
1897 if (vcpu_pic_intr(vm_id, vcpu_id, 1))
1898 fatalx("%s: can't assert INTR", __func__);
1899
1900 ret = pthread_mutex_lock(&vcpu_run_mtx[vcpu_id]);
1901 if (ret)
1902 fatalx("%s: can't lock vcpu mtx (%d)", __func__, ret);
1903
1904 vcpu_hlt[vcpu_id] = 0;
1905 ret = pthread_cond_signal(&vcpu_run_cond[vcpu_id]);
1906 if (ret)
1907 fatalx("%s: can't signal (%d)", __func__, ret);
1908 ret = pthread_mutex_unlock(&vcpu_run_mtx[vcpu_id]);
1909 if (ret)
1910 fatalx("%s: can't unlock vcpu mtx (%d)", __func__, ret);
1911 }
1912}
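/*
 * Editor's note: the cond_signal above only matters to a vcpu thread that
 * parked itself after a HLT exit. The sketch below shows the generic wait
 * side of that handshake, using the same mutex/condvar/flag triple; it is
 * an illustration of the pattern, not the actual vcpu run-loop code.
 */
static void
example_vcpu_wait(uint32_t vcpu_id)
{
	int ret;

	mutex_lock(&vcpu_run_mtx[vcpu_id]);
	while (vcpu_hlt[vcpu_id]) {
		/* The mutex is dropped while waiting and retaken on wakeup. */
		ret = pthread_cond_wait(&vcpu_run_cond[vcpu_id],
		    &vcpu_run_mtx[vcpu_id]);
		if (ret)
			fatalx("example_vcpu_wait: cond_wait failed (%d)", ret);
	}
	mutex_unlock(&vcpu_run_mtx[vcpu_id]);
}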
1913
1914/*
1915 * vcpu_deassert_pic_irq
1916 *
1917 * Clears the specified IRQ on the supplied vcpu/vm
1918 *
1919 * Parameters:
1920 * vm_id: VM ID to clear in
1921 * vcpu_id: VCPU ID to clear in
1922 * irq: IRQ to clear
1923 */
1924void
1925vcpu_deassert_pic_irq(uint32_t vm_id, uint32_t vcpu_id, int irq)
1926{
1927 i8259_deassert_irq(irq);
1928
1929 if (!i8259_is_pending()) {
1930 if (vcpu_pic_intr(vm_id, vcpu_id, 0))
1931 fatalx("%s: can't deassert INTR for vm_id %d, "
1932 "vcpu_id %d", __func__, vm_id, vcpu_id);
1933 }
1934}
1935
1936/*
1937 * fd_hasdata
1938 *
1939 * Determines if data can be read from a file descriptor.
1940 *
1941 * Parameters:
1942 * fd: the fd to check
1943 *
1944 * Return values:
1945 * 1 if data can be read from an fd, or 0 otherwise.
1946 */
1947int
1948fd_hasdata(int fd)
1949{
1950 struct pollfd pfd[1];
1951 int nready, hasdata = 0;
1952
1953 pfd[0].fd = fd;
1954 pfd[0].events = POLLIN;
1955 nready = poll(pfd, 1, 0);
1956 if (nready == -1)
1957 log_warn("checking file descriptor for data failed");
1958 else if (nready == 1 && pfd[0].revents & POLLIN)
1959 hasdata = 1;
1960 return (hasdata);
1961}
1962
1963/*
1964 * mutex_lock
1965 *
1966 * Wrapper function for pthread_mutex_lock that does error checking and that
1967 * exits on failure
1968 */
1969void
1970mutex_lock(pthread_mutex_t *m)
1971{
1972 int ret;
1973
1974 ret = pthread_mutex_lock(m);
1975 if (ret) {
1976 errno = ret;
1977 fatal("could not acquire mutex");
1978 }
1979}
1980
1981/*
1982 * mutex_unlock
1983 *
1984 * Wrapper function for pthread_mutex_unlock that does error checking and that
1985 * exits on failure
1986 */
1987void
1988mutex_unlock(pthread_mutex_t *m)
1989{
1990 int ret;
1991
1992 ret = pthread_mutex_unlock(m);
1993 if (ret) {
1994 errno = ret;
1995 fatal("could not release mutex");
1996 }
1997}
1998
1999/*
2000 * set_return_data
2001 *
2002 * Utility function for manipulating register data in vm exit info structs. This
2003 * function ensures that the data is copied to the vei->vei.vei_data field with
2004 * the proper size for the operation being performed.
2005 *
2006 * Parameters:
2007 * vei: exit information
2008 * data: return data
2009 */
2010void
2011set_return_data(struct vm_exit *vei, uint32_t data)
2012{
2013 switch (vei->vei.vei_size) {
2014 case 1:
2015 vei->vei.vei_data &= ~0xFF;
2016 vei->vei.vei_data |= (uint8_t)data;
2017 break;
2018 case 2:
2019 vei->vei.vei_data &= ~0xFFFF;
2020 vei->vei.vei_data |= (uint16_t)data;
2021 break;
2022 case 4:
2023 vei->vei.vei_data = data;
2024 break;
2025 }
2026}
2027
2028/*
2029 * get_input_data
2030 *
2031 * Utility function for manipulating register data in vm exit info
2032 * structs. This function ensures that the data is copied from the
2033 * vei->vei.vei_data field with the proper size for the operation being
2034 * performed.
2035 *
2036 * Parameters:
2037 * vei: exit information
2038 * data: location to store the result
2039 */
2040void
2041get_input_data(struct vm_exit *vei, uint32_t *data)
2042{
2043 switch (vei->vei.vei_size) {
2044 case 1:
2045 *data &= 0xFFFFFF00;
2046 *data |= (uint8_t)vei->vei.vei_data;
2047 break;
2048 case 2:
2049 *data &= 0xFFFF0000;
2050 *data |= (uint16_t)vei->vei.vei_data;
2051 break;
2052 case 4:
2053 *data = vei->vei.vei_data;
2054 break;
2055 default:
2056 log_warnx("%s: invalid i/o size %d", __func__,
2057 vei->vei.vei_size);
2058 }
2059
2060}
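/*
 * Editor's note: a minimal sketch of how an emulated 32-bit device register
 * might use the two helpers above, sized by vei_size. The register variable,
 * the handler name and the is_write flag are hypothetical; only
 * set_return_data() and get_input_data() come from vm.c.
 */
static uint32_t example_reg;

static void
example_reg_access(struct vm_exit *vei, int is_write)
{
	if (is_write)
		/* Guest OUT: merge only vei_size bytes into the register. */
		get_input_data(vei, &example_reg);
	else
		/* Guest IN: place the register value in vei_data, sized. */
		set_return_data(vei, example_reg);
}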
2061
2062/*
2063 * translate_gva
2064 *
2065 * Translates a guest virtual address to a guest physical address by walking
2066 * the currently active page table (if needed).
2067 *
2068 * Note - this function can possibly alter the supplied VCPU state.
2069 * Specifically, it may inject exceptions depending on the current VCPU
2070 * configuration, and may alter %cr2 on #PF. Consequently, this function
2071 * should only be used as part of instruction emulation.
2072 *
2073 * Parameters:
2074 * exit: The VCPU this translation should be performed for (guest MMU settings
2075 * are gathered from this VCPU)
2076 * va: virtual address to translate
2077 * pa: pointer to paddr_t variable that will receive the translated physical
2078 * address. 'pa' is unchanged on error.
2079 * mode: one of PROT_READ, PROT_WRITE, PROT_EXEC indicating the mode in which
2080 * the address should be translated
2081 *
2082 * Return values:
2083 * 0: the address was successfully translated - 'pa' contains the physical
2084 * address currently mapped by 'va'.
2085 * EFAULT: the PTE for 'VA' is unmapped. A #PF will be injected in this case
2086 * and %cr2 set in the vcpu structure.
2087 * EINVAL: an error occurred reading paging table structures
2088 */
2089int
2090translate_gva(struct vm_exit* exit, uint64_t va, uint64_t* pa, int mode)
2091{
2092 int level, shift, pdidx;
2093 uint64_t pte, pt_paddr, pte_paddr, mask, low_mask, high_mask;
2094 uint64_t shift_width, pte_size;
2095 struct vcpu_reg_state *vrs;
2096
2097 vrs = &exit->vrs;
2098
2099 if (!pa)
2100 return (EINVAL);
2101
2102 if (!(vrs->vrs_crs[VCPU_REGS_CR0] & CR0_PG)) {
2103 log_debug("%s: unpaged, va=pa=0x%llx", __func__, va);
2104 *pa = va;
2105 return (0);
2106 }
2107
2108 pt_paddr = vrs->vrs_crs[VCPU_REGS_CR3];
2109
2110 log_debug("%s: guest %%cr0=0x%llx, %%cr3=0x%llx", __func__,
2111 vrs->vrs_crs[VCPU_REGS_CR0], vrs->vrs_crs[VCPU_REGS_CR3]);
2112
2113 if (vrs->vrs_crs[VCPU_REGS_CR0] & CR0_PE) {
2114 if (vrs->vrs_crs[VCPU_REGS_CR4] & CR4_PAE) {
2115 pte_size = sizeof(uint64_t);
2116 shift_width = 9;
2117
2118 if (vrs->vrs_msrs[VCPU_REGS_EFER] & EFER_LMA) {
2119 /* 4 level paging */
2120 level = 4;
2121 mask = L4_MASK;
2122 shift = L4_SHIFT;
2123 } else {
2124 /* 32 bit with PAE paging */
2125 level = 3;
2126 mask = L3_MASK;
2127 shift = L3_SHIFT;
2128 }
2129 } else {
2130 /* 32 bit paging */
2131 level = 2;
2132 shift_width = 10;
2133 mask = 0xFFC00000;
2134 shift = 22;
2135 pte_size = sizeof(uint32_t);
2136 }
2137 } else
2138 return (EINVAL);
2139
2140 /* XXX: Check for R bit in segment selector and set A bit */
2141
2142 for (;level > 0; level--) {
2143 pdidx = (va & mask) >> shift;
2144 pte_paddr = (pt_paddr) + (pdidx * pte_size);
2145
2146 log_debug("%s: read pte level %d @ GPA 0x%llx", __func__,
2147 level, pte_paddr);
2148 if (read_mem(pte_paddr, &pte, pte_size)) {
2149 log_warn("%s: failed to read pte", __func__);
2150 return (EFAULT);
2151 }
2152
2153 log_debug("%s: PTE @ 0x%llx = 0x%llx", __func__, pte_paddr,
2154 pte);
2155
2156 /* XXX: Set CR2 */
2157 if (!(pte & PG_V))
2158 return (EFAULT);
2159
2160 /* XXX: Check for SMAP */
2161 if ((mode == PROT_WRITE) && !(pte & PG_RW))
2162 return (EPERM);
2163
2164 if ((exit->cpl > 0) && !(pte & PG_u))
2165 return (EPERM);
2166
2167 pte = pte | PG_U;
2168 if (mode == PROT_WRITE)
2169 pte = pte | PG_M;
2170 if (write_mem(pte_paddr, &pte, pte_size)) {
2171 log_warn("%s: failed to write back flags to pte",
2172 __func__);
2173 return (EIO);
2174 }
2175
2176 /* XXX: EINVAL if in 32bit and PG_PS is 1 but CR4.PSE is 0 */
2177 if (pte & PG_PS)
2178 break;
2179
2180 if (level > 1) {
2181 pt_paddr = pte & PG_FRAME;
2182 shift -= shift_width;
2183 mask = mask >> shift_width;
2184 }
2185 }
2186
2187 low_mask = (1 << shift) - 1;
2188 high_mask = (((uint64_t)1ULL << ((pte_size * 8) - 1)) - 1) ^ low_mask;
2189 *pa = (pte & high_mask) | (va & low_mask);
2190
2191 log_debug("%s: final GPA for GVA 0x%llx = 0x%llx\n", __func__, va, *pa);
2192
2193 return (0);
2194}
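/*
 * Editor's note: a worked example of the walk above for 4-level paging
 * (level = 4, shift_width = 9, pte_size = 8). The virtual address is
 * arbitrary; the table contents would come from the guest at run time.
 *
 *   va = 0x00007f2a81234567
 *   level 4 index = (va >> 39) & 0x1ff = 0x0fe
 *   level 3 index = (va >> 30) & 0x1ff = 0x0aa
 *   level 2 index = (va >> 21) & 0x1ff = 0x009
 *   level 1 index = (va >> 12) & 0x1ff = 0x034
 *
 * Each index selects one 8-byte PTE in the current table; the next table's
 * address is that PTE masked with PG_FRAME. After the last level,
 *   pa = (pte & high_mask) | (va & 0xfff)
 * i.e. the page frame from the final PTE plus the low 12 bits of 'va'.
 */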
2195
2196/*
2197 * vm_pipe_init
2198 *
2199 * Initialize a vm_dev_pipe, setting up its file descriptors and its
2200 * event structure with the given callback.
2201 *
2202 * Parameters:
2203 * p: pointer to vm_dev_pipe struct to initialize
2204 * cb: callback to use for READ events on the read end of the pipe
2205 */
2206void
2207vm_pipe_init(struct vm_dev_pipe *p, void (*cb)(int, short, void *))
2208{
2209 int ret;
2210 int fds[2];
2211
2212 memset(p, 0, sizeof(struct vm_dev_pipe));
2213
2214 ret = pipe(fds);
2215 if (ret)
2216 fatal("failed to create vm_dev_pipe pipe");
2217
2218 p->read = fds[0];
2219 p->write = fds[1];
2220
2221 event_set(&p->read_ev, p->read, EV_READ | EV_PERSIST, cb, NULL);
2222}
2223
2224/*
2225 * vm_pipe_send
2226 *
2227 * Send a message to an emulated device via the provided vm_dev_pipe.
2228 *
2229 * Parameters:
2230 * p: pointer to initialized vm_dev_pipe
2231 * msg: message to send in the channel
2232 */
2233void
2234vm_pipe_send(struct vm_dev_pipe *p, enum pipe_msg_type msg)
2235{
2236 size_t n;
2237 n = write(p->write, &msg, sizeof(msg));
2238 if (n != sizeof(msg))
2239 fatal("failed to write to device pipe");
2240}
2241
2242/*
2243 * vm_pipe_recv
2244 *
2245 * Receive a message for an emulated device via the provided vm_dev_pipe.
2246 * Returns the message value, otherwise will exit on failure.
2247 *
2248 * Parameters:
2249 * p: pointer to initialized vm_dev_pipe
2250 *
2251 * Return values:
2252 * a value of enum pipe_msg_type or fatal exit on read(2) error
2253 */
2254enum pipe_msg_type
2255vm_pipe_recv(struct vm_dev_pipe *p)
2256{
2257 size_t n;
2258 enum pipe_msg_type msg;
2259 n = read(p->read, &msg, sizeof(msg));
2260 if (n != sizeof(msg))
2261 fatal("failed to read from device pipe");
2262
2263 return msg;
2264}