File: | src/usr.sbin/vmd/vm.c |
Warning: | line 1191, column 3 Potential leak of memory pointed to by 'tid' |
Press '?' to see keyboard shortcuts
Keyboard shortcuts:
1 | /* $OpenBSD: vm.c,v 1.67 2021/12/30 08:12:23 claudio Exp $ */ | |||
2 | ||||
3 | /* | |||
4 | * Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org> | |||
5 | * | |||
6 | * Permission to use, copy, modify, and distribute this software for any | |||
7 | * purpose with or without fee is hereby granted, provided that the above | |||
8 | * copyright notice and this permission notice appear in all copies. | |||
9 | * | |||
10 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES | |||
11 | * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF | |||
12 | * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR | |||
13 | * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES | |||
14 | * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN | |||
15 | * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF | |||
16 | * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. | |||
17 | */ | |||
18 | ||||
19 | #include <sys/param.h> /* PAGE_SIZE */ | |||
20 | #include <sys/types.h> | |||
21 | #include <sys/ioctl.h> | |||
22 | #include <sys/queue.h> | |||
23 | #include <sys/wait.h> | |||
24 | #include <sys/uio.h> | |||
25 | #include <sys/stat.h> | |||
26 | #include <sys/socket.h> | |||
27 | #include <sys/time.h> | |||
28 | #include <sys/mman.h> | |||
29 | ||||
30 | #include <dev/ic/i8253reg.h> | |||
31 | #include <dev/isa/isareg.h> | |||
32 | #include <dev/pci/pcireg.h> | |||
33 | ||||
34 | #include <machine/psl.h> | |||
35 | #include <machine/pte.h> | |||
36 | #include <machine/specialreg.h> | |||
37 | #include <machine/vmmvar.h> | |||
38 | ||||
39 | #include <net/if.h> | |||
40 | ||||
41 | #include <errno(*__errno()).h> | |||
42 | #include <event.h> | |||
43 | #include <fcntl.h> | |||
44 | #include <imsg.h> | |||
45 | #include <limits.h> | |||
46 | #include <poll.h> | |||
47 | #include <pthread.h> | |||
48 | #include <stddef.h> | |||
49 | #include <stdio.h> | |||
50 | #include <stdlib.h> | |||
51 | #include <string.h> | |||
52 | #include <unistd.h> | |||
53 | #include <util.h> | |||
54 | ||||
55 | #include "atomicio.h" | |||
56 | #include "fw_cfg.h" | |||
57 | #include "i8253.h" | |||
58 | #include "i8259.h" | |||
59 | #include "loadfile.h" | |||
60 | #include "mc146818.h" | |||
61 | #include "ns8250.h" | |||
62 | #include "pci.h" | |||
63 | #include "virtio.h" | |||
64 | #include "vmd.h" | |||
65 | #include "vmm.h" | |||
66 | ||||
67 | io_fn_t ioports_map[MAX_PORTS65536]; | |||
68 | ||||
69 | int run_vm(int, int[][VM_MAX_BASE_PER_DISK4], int *, | |||
70 | struct vmop_create_params *, struct vcpu_reg_state *); | |||
71 | void vm_dispatch_vmm(int, short, void *); | |||
72 | void *event_thread(void *); | |||
73 | void *vcpu_run_loop(void *); | |||
74 | int vcpu_exit(struct vm_run_params *); | |||
75 | int vcpu_reset(uint32_t, uint32_t, struct vcpu_reg_state *); | |||
76 | void create_memory_map(struct vm_create_params *); | |||
77 | int alloc_guest_mem(struct vm_create_params *); | |||
78 | int vmm_create_vm(struct vm_create_params *); | |||
79 | void init_emulated_hw(struct vmop_create_params *, int, | |||
80 | int[][VM_MAX_BASE_PER_DISK4], int *); | |||
81 | void restore_emulated_hw(struct vm_create_params *, int, int *, | |||
82 | int[][VM_MAX_BASE_PER_DISK4],int); | |||
83 | void vcpu_exit_inout(struct vm_run_params *); | |||
84 | int vcpu_exit_eptviolation(struct vm_run_params *); | |||
85 | uint8_t vcpu_exit_pci(struct vm_run_params *); | |||
86 | int vcpu_pic_intr(uint32_t, uint32_t, uint8_t); | |||
87 | int loadfile_bios(gzFile, off_t, struct vcpu_reg_state *); | |||
88 | int send_vm(int, struct vm_create_params *); | |||
89 | int dump_send_header(int); | |||
90 | int dump_vmr(int , struct vm_mem_range *); | |||
91 | int dump_mem(int, struct vm_create_params *); | |||
92 | void restore_vmr(int, struct vm_mem_range *); | |||
93 | void restore_mem(int, struct vm_create_params *); | |||
94 | int restore_vm_params(int, struct vm_create_params *); | |||
95 | void pause_vm(struct vm_create_params *); | |||
96 | void unpause_vm(struct vm_create_params *); | |||
97 | ||||
98 | int translate_gva(struct vm_exit*, uint64_t, uint64_t *, int); | |||
99 | ||||
100 | static struct vm_mem_range *find_gpa_range(struct vm_create_params *, paddr_t, | |||
101 | size_t); | |||
102 | ||||
103 | int con_fd; | |||
104 | struct vmd_vm *current_vm; | |||
105 | ||||
106 | extern struct vmd *env; | |||
107 | ||||
108 | extern char *__progname; | |||
109 | ||||
110 | pthread_mutex_t threadmutex; | |||
111 | pthread_cond_t threadcond; | |||
112 | ||||
113 | pthread_cond_t vcpu_run_cond[VMM_MAX_VCPUS_PER_VM64]; | |||
114 | pthread_mutex_t vcpu_run_mtx[VMM_MAX_VCPUS_PER_VM64]; | |||
115 | pthread_barrier_t vm_pause_barrier; | |||
116 | pthread_cond_t vcpu_unpause_cond[VMM_MAX_VCPUS_PER_VM64]; | |||
117 | pthread_mutex_t vcpu_unpause_mtx[VMM_MAX_VCPUS_PER_VM64]; | |||
118 | uint8_t vcpu_hlt[VMM_MAX_VCPUS_PER_VM64]; | |||
119 | uint8_t vcpu_done[VMM_MAX_VCPUS_PER_VM64]; | |||
120 | ||||
121 | /* | |||
122 | * Represents a standard register set for an OS to be booted | |||
123 | * as a flat 64 bit address space. | |||
124 | * | |||
125 | * NOT set here are: | |||
126 | * RIP | |||
127 | * RSP | |||
128 | * GDTR BASE | |||
129 | * | |||
130 | * Specific bootloaders should clone this structure and override | |||
131 | * those fields as needed. | |||
132 | * | |||
133 | * Note - CR3 and various bits in CR0 may be overridden by vmm(4) based on | |||
134 | * features of the CPU in use. | |||
135 | */ | |||
136 | static const struct vcpu_reg_state vcpu_init_flat64 = { | |||
137 | .vrs_gprs[VCPU_REGS_RFLAGS17] = 0x2, | |||
138 | .vrs_gprs[VCPU_REGS_RIP16] = 0x0, | |||
139 | .vrs_gprs[VCPU_REGS_RSP14] = 0x0, | |||
140 | .vrs_crs[VCPU_REGS_CR00] = CR0_ET0x00000010 | CR0_PE0x00000001 | CR0_PG0x80000000, | |||
141 | .vrs_crs[VCPU_REGS_CR32] = PML4_PAGE0x11000, | |||
142 | .vrs_crs[VCPU_REGS_CR43] = CR4_PAE0x00000020 | CR4_PSE0x00000010, | |||
143 | .vrs_crs[VCPU_REGS_PDPTE06] = 0ULL, | |||
144 | .vrs_crs[VCPU_REGS_PDPTE17] = 0ULL, | |||
145 | .vrs_crs[VCPU_REGS_PDPTE28] = 0ULL, | |||
146 | .vrs_crs[VCPU_REGS_PDPTE39] = 0ULL, | |||
147 | .vrs_sregs[VCPU_REGS_CS0] = { 0x8, 0xFFFFFFFF, 0xC09F, 0x0}, | |||
148 | .vrs_sregs[VCPU_REGS_DS1] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0}, | |||
149 | .vrs_sregs[VCPU_REGS_ES2] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0}, | |||
150 | .vrs_sregs[VCPU_REGS_FS3] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0}, | |||
151 | .vrs_sregs[VCPU_REGS_GS4] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0}, | |||
152 | .vrs_sregs[VCPU_REGS_SS5] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0}, | |||
153 | .vrs_gdtr = { 0x0, 0xFFFF, 0x0, 0x0}, | |||
154 | .vrs_idtr = { 0x0, 0xFFFF, 0x0, 0x0}, | |||
155 | .vrs_sregs[VCPU_REGS_LDTR6] = { 0x0, 0xFFFF, 0x0082, 0x0}, | |||
156 | .vrs_sregs[VCPU_REGS_TR7] = { 0x0, 0xFFFF, 0x008B, 0x0}, | |||
157 | .vrs_msrs[VCPU_REGS_EFER0] = EFER_LME0x00000100 | EFER_LMA0x00000400, | |||
158 | .vrs_drs[VCPU_REGS_DR00] = 0x0, | |||
159 | .vrs_drs[VCPU_REGS_DR11] = 0x0, | |||
160 | .vrs_drs[VCPU_REGS_DR22] = 0x0, | |||
161 | .vrs_drs[VCPU_REGS_DR33] = 0x0, | |||
162 | .vrs_drs[VCPU_REGS_DR64] = 0xFFFF0FF0, | |||
163 | .vrs_drs[VCPU_REGS_DR75] = 0x400, | |||
164 | .vrs_msrs[VCPU_REGS_STAR1] = 0ULL, | |||
165 | .vrs_msrs[VCPU_REGS_LSTAR2] = 0ULL, | |||
166 | .vrs_msrs[VCPU_REGS_CSTAR3] = 0ULL, | |||
167 | .vrs_msrs[VCPU_REGS_SFMASK4] = 0ULL, | |||
168 | .vrs_msrs[VCPU_REGS_KGSBASE5] = 0ULL, | |||
169 | .vrs_msrs[VCPU_REGS_MISC_ENABLE6] = 0ULL, | |||
170 | .vrs_crs[VCPU_REGS_XCR05] = XCR0_X870x00000001 | |||
171 | }; | |||
172 | ||||
173 | /* | |||
174 | * Represents a standard register set for an BIOS to be booted | |||
175 | * as a flat 16 bit address space. | |||
176 | */ | |||
177 | static const struct vcpu_reg_state vcpu_init_flat16 = { | |||
178 | .vrs_gprs[VCPU_REGS_RFLAGS17] = 0x2, | |||
179 | .vrs_gprs[VCPU_REGS_RIP16] = 0xFFF0, | |||
180 | .vrs_gprs[VCPU_REGS_RSP14] = 0x0, | |||
181 | .vrs_crs[VCPU_REGS_CR00] = 0x60000010, | |||
182 | .vrs_crs[VCPU_REGS_CR32] = 0, | |||
183 | .vrs_sregs[VCPU_REGS_CS0] = { 0xF000, 0xFFFF, 0x809F, 0xF0000}, | |||
184 | .vrs_sregs[VCPU_REGS_DS1] = { 0x0, 0xFFFF, 0x8093, 0x0}, | |||
185 | .vrs_sregs[VCPU_REGS_ES2] = { 0x0, 0xFFFF, 0x8093, 0x0}, | |||
186 | .vrs_sregs[VCPU_REGS_FS3] = { 0x0, 0xFFFF, 0x8093, 0x0}, | |||
187 | .vrs_sregs[VCPU_REGS_GS4] = { 0x0, 0xFFFF, 0x8093, 0x0}, | |||
188 | .vrs_sregs[VCPU_REGS_SS5] = { 0x0, 0xFFFF, 0x8093, 0x0}, | |||
189 | .vrs_gdtr = { 0x0, 0xFFFF, 0x0, 0x0}, | |||
190 | .vrs_idtr = { 0x0, 0xFFFF, 0x0, 0x0}, | |||
191 | .vrs_sregs[VCPU_REGS_LDTR6] = { 0x0, 0xFFFF, 0x0082, 0x0}, | |||
192 | .vrs_sregs[VCPU_REGS_TR7] = { 0x0, 0xFFFF, 0x008B, 0x0}, | |||
193 | .vrs_msrs[VCPU_REGS_EFER0] = 0ULL, | |||
194 | .vrs_drs[VCPU_REGS_DR00] = 0x0, | |||
195 | .vrs_drs[VCPU_REGS_DR11] = 0x0, | |||
196 | .vrs_drs[VCPU_REGS_DR22] = 0x0, | |||
197 | .vrs_drs[VCPU_REGS_DR33] = 0x0, | |||
198 | .vrs_drs[VCPU_REGS_DR64] = 0xFFFF0FF0, | |||
199 | .vrs_drs[VCPU_REGS_DR75] = 0x400, | |||
200 | .vrs_msrs[VCPU_REGS_STAR1] = 0ULL, | |||
201 | .vrs_msrs[VCPU_REGS_LSTAR2] = 0ULL, | |||
202 | .vrs_msrs[VCPU_REGS_CSTAR3] = 0ULL, | |||
203 | .vrs_msrs[VCPU_REGS_SFMASK4] = 0ULL, | |||
204 | .vrs_msrs[VCPU_REGS_KGSBASE5] = 0ULL, | |||
205 | .vrs_crs[VCPU_REGS_XCR05] = XCR0_X870x00000001 | |||
206 | }; | |||
207 | ||||
208 | /* | |||
209 | * loadfile_bios | |||
210 | * | |||
211 | * Alternatively to loadfile_elf, this function loads a non-ELF BIOS image | |||
212 | * directly into memory. | |||
213 | * | |||
214 | * Parameters: | |||
215 | * fp: file of a kernel file to load | |||
216 | * size: uncompressed size of the image | |||
217 | * (out) vrs: register state to set on init for this kernel | |||
218 | * | |||
219 | * Return values: | |||
220 | * 0 if successful | |||
221 | * various error codes returned from read(2) or loadelf functions | |||
222 | */ | |||
223 | int | |||
224 | loadfile_bios(gzFile fp, off_t size, struct vcpu_reg_state *vrs) | |||
225 | { | |||
226 | off_t off; | |||
227 | ||||
228 | /* Set up a "flat 16 bit" register state for BIOS */ | |||
229 | memcpy(vrs, &vcpu_init_flat16, sizeof(*vrs)); | |||
230 | ||||
231 | /* Seek to the beginning of the BIOS image */ | |||
232 | if (gzseek(fp, 0, SEEK_SET0) == -1) | |||
233 | return (-1); | |||
234 | ||||
235 | /* The BIOS image must end at 1M */ | |||
236 | if ((off = 1048576 - size) < 0) | |||
237 | return (-1); | |||
238 | ||||
239 | /* Read BIOS image into memory */ | |||
240 | if (mread(fp, off, size) != (size_t)size) { | |||
241 | errno(*__errno()) = EIO5; | |||
242 | return (-1); | |||
243 | } | |||
244 | ||||
245 | log_debug("%s: loaded BIOS image", __func__); | |||
246 | ||||
247 | return (0); | |||
248 | } | |||
249 | ||||
250 | /* | |||
251 | * start_vm | |||
252 | * | |||
253 | * After forking a new VM process, starts the new VM with the creation | |||
254 | * parameters supplied (in the incoming vm->vm_params field). This | |||
255 | * function performs a basic sanity check on the incoming parameters | |||
256 | * and then performs the following steps to complete the creation of the VM: | |||
257 | * | |||
258 | * 1. validates and create the new VM | |||
259 | * 2. opens the imsg control channel to the parent and drops more privilege | |||
260 | * 3. drops additional privleges by calling pledge(2) | |||
261 | * 4. loads the kernel from the disk image or file descriptor | |||
262 | * 5. runs the VM's VCPU loops. | |||
263 | * | |||
264 | * Parameters: | |||
265 | * vm: The VM data structure that is including the VM create parameters. | |||
266 | * fd: The imsg socket that is connected to the parent process. | |||
267 | * | |||
268 | * Return values: | |||
269 | * 0: success | |||
270 | * !0 : failure - typically an errno indicating the source of the failure | |||
271 | */ | |||
272 | int | |||
273 | start_vm(struct vmd_vm *vm, int fd) | |||
274 | { | |||
275 | struct vmop_create_params *vmc = &vm->vm_params; | |||
276 | struct vm_create_params *vcp = &vmc->vmc_params; | |||
277 | struct vcpu_reg_state vrs; | |||
278 | int nicfds[VMM_MAX_NICS_PER_VM4]; | |||
279 | int ret; | |||
280 | gzFile fp; | |||
281 | size_t i; | |||
282 | struct vm_rwregs_params vrp; | |||
283 | struct stat sb; | |||
284 | ||||
285 | /* Child */ | |||
286 | setproctitle("%s", vcp->vcp_name); | |||
287 | log_procinit(vcp->vcp_name); | |||
288 | ||||
289 | if (!(vm->vm_state & VM_STATE_RECEIVED0x08)) | |||
290 | create_memory_map(vcp); | |||
291 | ||||
292 | ret = alloc_guest_mem(vcp); | |||
293 | ||||
294 | if (ret) { | |||
295 | errno(*__errno()) = ret; | |||
296 | fatal("could not allocate guest memory - exiting"); | |||
297 | } | |||
298 | ||||
299 | ret = vmm_create_vm(vcp); | |||
300 | current_vm = vm; | |||
301 | ||||
302 | /* send back the kernel-generated vm id (0 on error) */ | |||
303 | if (write(fd, &vcp->vcp_id, sizeof(vcp->vcp_id)) != | |||
304 | sizeof(vcp->vcp_id)) | |||
305 | fatal("write vcp id"); | |||
306 | ||||
307 | if (ret) { | |||
308 | errno(*__errno()) = ret; | |||
309 | fatal("create vmm ioctl failed - exiting"); | |||
310 | } | |||
311 | ||||
312 | /* | |||
313 | * pledge in the vm processes: | |||
314 | * stdio - for malloc and basic I/O including events. | |||
315 | * recvfd - for send/recv. | |||
316 | * vmm - for the vmm ioctls and operations. | |||
317 | */ | |||
318 | if (pledge("stdio vmm recvfd", NULL((void*)0)) == -1) | |||
319 | fatal("pledge"); | |||
320 | ||||
321 | if (vm->vm_state & VM_STATE_RECEIVED0x08) { | |||
322 | ret = read(vm->vm_receive_fd, &vrp, sizeof(vrp)); | |||
323 | if (ret != sizeof(vrp)) { | |||
324 | fatal("received incomplete vrp - exiting"); | |||
325 | } | |||
326 | vrs = vrp.vrwp_regs; | |||
327 | } else { | |||
328 | /* | |||
329 | * Set up default "flat 64 bit" register state - RIP, | |||
330 | * RSP, and GDT info will be set in bootloader | |||
331 | */ | |||
332 | memcpy(&vrs, &vcpu_init_flat64, sizeof(vrs)); | |||
333 | ||||
334 | /* Find and open kernel image */ | |||
335 | if ((fp = gzdopen(vm->vm_kernel, "r")) == NULL((void*)0)) | |||
336 | fatalx("failed to open kernel - exiting"); | |||
337 | ||||
338 | /* Load kernel image */ | |||
339 | ret = loadfile_elf(fp, vcp, &vrs, vmc->vmc_bootdevice); | |||
340 | ||||
341 | /* | |||
342 | * Try BIOS as a fallback (only if it was provided as an image | |||
343 | * with vm->vm_kernel and the file is not compressed) | |||
344 | */ | |||
345 | if (ret && errno(*__errno()) == ENOEXEC8 && vm->vm_kernel != -1 && | |||
346 | gzdirect(fp) && (ret = fstat(vm->vm_kernel, &sb)) == 0) | |||
347 | ret = loadfile_bios(fp, sb.st_size, &vrs); | |||
348 | ||||
349 | if (ret) | |||
350 | fatal("failed to load kernel or BIOS - exiting"); | |||
351 | ||||
352 | gzclose(fp); | |||
353 | } | |||
354 | ||||
355 | if (vm->vm_kernel != -1) | |||
356 | close(vm->vm_kernel); | |||
357 | ||||
358 | con_fd = vm->vm_tty; | |||
359 | if (fcntl(con_fd, F_SETFL4, O_NONBLOCK0x0004) == -1) | |||
360 | fatal("failed to set nonblocking mode on console"); | |||
361 | ||||
362 | for (i = 0; i < VMM_MAX_NICS_PER_VM4; i++) | |||
363 | nicfds[i] = vm->vm_ifs[i].vif_fd; | |||
364 | ||||
365 | event_init(); | |||
366 | ||||
367 | if (vm->vm_state & VM_STATE_RECEIVED0x08) { | |||
368 | restore_emulated_hw(vcp, vm->vm_receive_fd, nicfds, | |||
369 | vm->vm_disks, vm->vm_cdrom); | |||
370 | restore_mem(vm->vm_receive_fd, vcp); | |||
371 | if (restore_vm_params(vm->vm_receive_fd, vcp)) | |||
372 | fatal("restore vm params failed"); | |||
373 | unpause_vm(vcp); | |||
374 | } | |||
375 | ||||
376 | if (vmm_pipe(vm, fd, vm_dispatch_vmm) == -1) | |||
377 | fatal("setup vm pipe"); | |||
378 | ||||
379 | /* Execute the vcpu run loop(s) for this VM */ | |||
380 | ret = run_vm(vm->vm_cdrom, vm->vm_disks, nicfds, &vm->vm_params, &vrs); | |||
381 | ||||
382 | /* Ensure that any in-flight data is written back */ | |||
383 | virtio_shutdown(vm); | |||
384 | ||||
385 | return (ret); | |||
386 | } | |||
387 | ||||
388 | /* | |||
389 | * vm_dispatch_vmm | |||
390 | * | |||
391 | * imsg callback for messages that are received from the vmm parent process. | |||
392 | */ | |||
393 | void | |||
394 | vm_dispatch_vmm(int fd, short event, void *arg) | |||
395 | { | |||
396 | struct vmd_vm *vm = arg; | |||
397 | struct vmop_result vmr; | |||
398 | struct vmop_addr_result var; | |||
399 | struct imsgev *iev = &vm->vm_iev; | |||
400 | struct imsgbuf *ibuf = &iev->ibuf; | |||
401 | struct imsg imsg; | |||
402 | ssize_t n; | |||
403 | int verbose; | |||
404 | ||||
405 | if (event & EV_READ0x02) { | |||
406 | if ((n = imsg_read(ibuf)) == -1 && errno(*__errno()) != EAGAIN35) | |||
407 | fatal("%s: imsg_read", __func__); | |||
408 | if (n == 0) | |||
409 | _exit(0); | |||
410 | } | |||
411 | ||||
412 | if (event & EV_WRITE0x04) { | |||
413 | if ((n = msgbuf_write(&ibuf->w)) == -1 && errno(*__errno()) != EAGAIN35) | |||
414 | fatal("%s: msgbuf_write fd %d", __func__, ibuf->fd); | |||
415 | if (n == 0) | |||
416 | _exit(0); | |||
417 | } | |||
418 | ||||
419 | for (;;) { | |||
420 | if ((n = imsg_get(ibuf, &imsg)) == -1) | |||
421 | fatal("%s: imsg_get", __func__); | |||
422 | if (n == 0) | |||
423 | break; | |||
424 | ||||
425 | #if DEBUG > 1 | |||
426 | log_debug("%s: got imsg %d from %s", | |||
427 | __func__, imsg.hdr.type, | |||
428 | vm->vm_params.vmc_params.vcp_name); | |||
429 | #endif | |||
430 | ||||
431 | switch (imsg.hdr.type) { | |||
432 | case IMSG_CTL_VERBOSE: | |||
433 | IMSG_SIZE_CHECK(&imsg, &verbose)do { if (((&imsg)->hdr.len - sizeof(struct imsg_hdr)) < sizeof(*&verbose)) fatalx("bad length imsg received (%s)" , "&verbose"); } while (0); | |||
434 | memcpy(&verbose, imsg.data, sizeof(verbose)); | |||
435 | log_setverbose(verbose); | |||
436 | break; | |||
437 | case IMSG_VMDOP_VM_SHUTDOWN: | |||
438 | if (vmmci_ctl(VMMCI_SHUTDOWN) == -1) | |||
439 | _exit(0); | |||
440 | break; | |||
441 | case IMSG_VMDOP_VM_REBOOT: | |||
442 | if (vmmci_ctl(VMMCI_REBOOT) == -1) | |||
443 | _exit(0); | |||
444 | break; | |||
445 | case IMSG_VMDOP_PAUSE_VM: | |||
446 | vmr.vmr_result = 0; | |||
447 | vmr.vmr_id = vm->vm_vmid; | |||
448 | pause_vm(&vm->vm_params.vmc_params); | |||
449 | imsg_compose_event(&vm->vm_iev, | |||
450 | IMSG_VMDOP_PAUSE_VM_RESPONSE, | |||
451 | imsg.hdr.peerid, imsg.hdr.pid, -1, &vmr, | |||
452 | sizeof(vmr)); | |||
453 | break; | |||
454 | case IMSG_VMDOP_UNPAUSE_VM: | |||
455 | vmr.vmr_result = 0; | |||
456 | vmr.vmr_id = vm->vm_vmid; | |||
457 | unpause_vm(&vm->vm_params.vmc_params); | |||
458 | imsg_compose_event(&vm->vm_iev, | |||
459 | IMSG_VMDOP_UNPAUSE_VM_RESPONSE, | |||
460 | imsg.hdr.peerid, imsg.hdr.pid, -1, &vmr, | |||
461 | sizeof(vmr)); | |||
462 | break; | |||
463 | case IMSG_VMDOP_SEND_VM_REQUEST: | |||
464 | vmr.vmr_id = vm->vm_vmid; | |||
465 | vmr.vmr_result = send_vm(imsg.fd, | |||
466 | &vm->vm_params.vmc_params); | |||
467 | imsg_compose_event(&vm->vm_iev, | |||
468 | IMSG_VMDOP_SEND_VM_RESPONSE, | |||
469 | imsg.hdr.peerid, imsg.hdr.pid, -1, &vmr, | |||
470 | sizeof(vmr)); | |||
471 | if (!vmr.vmr_result) { | |||
472 | imsg_flush(¤t_vm->vm_iev.ibuf); | |||
473 | _exit(0); | |||
474 | } | |||
475 | break; | |||
476 | case IMSG_VMDOP_PRIV_GET_ADDR_RESPONSE: | |||
477 | IMSG_SIZE_CHECK(&imsg, &var)do { if (((&imsg)->hdr.len - sizeof(struct imsg_hdr)) < sizeof(*&var)) fatalx("bad length imsg received (%s)", "&var" ); } while (0); | |||
478 | memcpy(&var, imsg.data, sizeof(var)); | |||
479 | ||||
480 | log_debug("%s: received tap addr %s for nic %d", | |||
481 | vm->vm_params.vmc_params.vcp_name, | |||
482 | ether_ntoa((void *)var.var_addr), var.var_nic_idx); | |||
483 | ||||
484 | vionet_set_hostmac(vm, var.var_nic_idx, var.var_addr); | |||
485 | break; | |||
486 | default: | |||
487 | fatalx("%s: got invalid imsg %d from %s", | |||
488 | __func__, imsg.hdr.type, | |||
489 | vm->vm_params.vmc_params.vcp_name); | |||
490 | } | |||
491 | imsg_free(&imsg); | |||
492 | } | |||
493 | imsg_event_add(iev); | |||
494 | } | |||
495 | ||||
496 | /* | |||
497 | * vm_shutdown | |||
498 | * | |||
499 | * Tell the vmm parent process to shutdown or reboot the VM and exit. | |||
500 | */ | |||
501 | __dead__attribute__((__noreturn__)) void | |||
502 | vm_shutdown(unsigned int cmd) | |||
503 | { | |||
504 | switch (cmd) { | |||
505 | case VMMCI_NONE: | |||
506 | case VMMCI_SHUTDOWN: | |||
507 | (void)imsg_compose_event(¤t_vm->vm_iev, | |||
508 | IMSG_VMDOP_VM_SHUTDOWN, 0, 0, -1, NULL((void*)0), 0); | |||
509 | break; | |||
510 | case VMMCI_REBOOT: | |||
511 | (void)imsg_compose_event(¤t_vm->vm_iev, | |||
512 | IMSG_VMDOP_VM_REBOOT, 0, 0, -1, NULL((void*)0), 0); | |||
513 | break; | |||
514 | default: | |||
515 | fatalx("invalid vm ctl command: %d", cmd); | |||
516 | } | |||
517 | imsg_flush(¤t_vm->vm_iev.ibuf); | |||
518 | ||||
519 | _exit(0); | |||
520 | } | |||
521 | ||||
522 | int | |||
523 | send_vm(int fd, struct vm_create_params *vcp) | |||
524 | { | |||
525 | struct vm_rwregs_params vrp; | |||
526 | struct vm_rwvmparams_params vpp; | |||
527 | struct vmop_create_params *vmc; | |||
528 | struct vm_terminate_params vtp; | |||
529 | unsigned int flags = 0; | |||
530 | unsigned int i; | |||
531 | int ret = 0; | |||
532 | size_t sz; | |||
533 | ||||
534 | if (dump_send_header(fd)) { | |||
535 | log_info("%s: failed to send vm dump header", __func__); | |||
536 | goto err; | |||
537 | } | |||
538 | ||||
539 | pause_vm(vcp); | |||
540 | ||||
541 | vmc = calloc(1, sizeof(struct vmop_create_params)); | |||
542 | if (vmc == NULL((void*)0)) { | |||
543 | log_warn("%s: calloc error geting vmc", __func__); | |||
544 | ret = -1; | |||
545 | goto err; | |||
546 | } | |||
547 | ||||
548 | flags |= VMOP_CREATE_MEMORY0x04; | |||
549 | memcpy(&vmc->vmc_params, ¤t_vm->vm_params, sizeof(struct | |||
550 | vmop_create_params)); | |||
551 | vmc->vmc_flags = flags; | |||
552 | vrp.vrwp_vm_id = vcp->vcp_id; | |||
553 | vrp.vrwp_mask = VM_RWREGS_ALL(0x1 | 0x2 | 0x4 | 0x8 | 0x10); | |||
554 | vpp.vpp_mask = VM_RWVMPARAMS_ALL(0x1 | 0x2); | |||
555 | vpp.vpp_vm_id = vcp->vcp_id; | |||
556 | ||||
557 | sz = atomicio(vwrite(ssize_t (*)(int, void *, size_t))write, fd, vmc,sizeof(struct vmop_create_params)); | |||
558 | if (sz != sizeof(struct vmop_create_params)) { | |||
559 | ret = -1; | |||
560 | goto err; | |||
561 | } | |||
562 | ||||
563 | for (i = 0; i < vcp->vcp_ncpus; i++) { | |||
564 | vrp.vrwp_vcpu_id = i; | |||
565 | if ((ret = ioctl(env->vmd_fd, VMM_IOC_READREGS(((unsigned long)0x80000000|(unsigned long)0x40000000) | ((sizeof (struct vm_rwregs_params) & 0x1fff) << 16) | ((('V' )) << 8) | ((7))), &vrp))) { | |||
566 | log_warn("%s: readregs failed", __func__); | |||
567 | goto err; | |||
568 | } | |||
569 | ||||
570 | sz = atomicio(vwrite(ssize_t (*)(int, void *, size_t))write, fd, &vrp, | |||
571 | sizeof(struct vm_rwregs_params)); | |||
572 | if (sz != sizeof(struct vm_rwregs_params)) { | |||
573 | log_warn("%s: dumping registers failed", __func__); | |||
574 | ret = -1; | |||
575 | goto err; | |||
576 | } | |||
577 | } | |||
578 | ||||
579 | if ((ret = i8253_dump(fd))) | |||
580 | goto err; | |||
581 | if ((ret = i8259_dump(fd))) | |||
582 | goto err; | |||
583 | if ((ret = ns8250_dump(fd))) | |||
584 | goto err; | |||
585 | if ((ret = mc146818_dump(fd))) | |||
586 | goto err; | |||
587 | if ((ret = fw_cfg_dump(fd))) | |||
588 | goto err; | |||
589 | if ((ret = pci_dump(fd))) | |||
590 | goto err; | |||
591 | if ((ret = virtio_dump(fd))) | |||
592 | goto err; | |||
593 | if ((ret = dump_mem(fd, vcp))) | |||
594 | goto err; | |||
595 | ||||
596 | for (i = 0; i < vcp->vcp_ncpus; i++) { | |||
597 | vpp.vpp_vcpu_id = i; | |||
598 | if ((ret = ioctl(env->vmd_fd, VMM_IOC_READVMPARAMS(((unsigned long)0x80000000|(unsigned long)0x40000000) | ((sizeof (struct vm_rwvmparams_params) & 0x1fff) << 16) | (( ('V')) << 8) | ((9))), &vpp))) { | |||
599 | log_warn("%s: readvmparams failed", __func__); | |||
600 | goto err; | |||
601 | } | |||
602 | ||||
603 | sz = atomicio(vwrite(ssize_t (*)(int, void *, size_t))write, fd, &vpp, | |||
604 | sizeof(struct vm_rwvmparams_params)); | |||
605 | if (sz != sizeof(struct vm_rwvmparams_params)) { | |||
606 | log_warn("%s: dumping vm params failed", __func__); | |||
607 | ret = -1; | |||
608 | goto err; | |||
609 | } | |||
610 | } | |||
611 | ||||
612 | vtp.vtp_vm_id = vcp->vcp_id; | |||
613 | if (ioctl(env->vmd_fd, VMM_IOC_TERM((unsigned long)0x80000000 | ((sizeof(struct vm_terminate_params ) & 0x1fff) << 16) | ((('V')) << 8) | ((4))), &vtp) == -1) { | |||
614 | log_warnx("%s: term IOC error: %d, %d", __func__, | |||
615 | errno(*__errno()), ENOENT2); | |||
616 | } | |||
617 | err: | |||
618 | close(fd); | |||
619 | if (ret) | |||
620 | unpause_vm(vcp); | |||
621 | return ret; | |||
622 | } | |||
623 | ||||
624 | int | |||
625 | dump_send_header(int fd) { | |||
626 | struct vm_dump_header vmh; | |||
627 | int i; | |||
628 | ||||
629 | memcpy(&vmh.vmh_signature, VM_DUMP_SIGNATURE"OpenBSDVMM58", | |||
630 | sizeof(vmh.vmh_signature)); | |||
631 | ||||
632 | vmh.vmh_cpuids[0].code = 0x00; | |||
633 | vmh.vmh_cpuids[0].leaf = 0x00; | |||
634 | ||||
635 | vmh.vmh_cpuids[1].code = 0x01; | |||
636 | vmh.vmh_cpuids[1].leaf = 0x00; | |||
637 | ||||
638 | vmh.vmh_cpuids[2].code = 0x07; | |||
639 | vmh.vmh_cpuids[2].leaf = 0x00; | |||
640 | ||||
641 | vmh.vmh_cpuids[3].code = 0x0d; | |||
642 | vmh.vmh_cpuids[3].leaf = 0x00; | |||
643 | ||||
644 | vmh.vmh_cpuids[4].code = 0x80000001; | |||
645 | vmh.vmh_cpuids[4].leaf = 0x00; | |||
646 | ||||
647 | vmh.vmh_version = VM_DUMP_VERSION7; | |||
648 | ||||
649 | for (i=0; i < VM_DUMP_HEADER_CPUID_COUNT5; i++) { | |||
650 | CPUID_LEAF(vmh.vmh_cpuids[i].code,__asm volatile("cpuid" : "=a" (vmh.vmh_cpuids[i].a), "=b" (vmh .vmh_cpuids[i].b), "=c" (vmh.vmh_cpuids[i].c), "=d" (vmh.vmh_cpuids [i].d) : "a" (vmh.vmh_cpuids[i].code), "c" (vmh.vmh_cpuids[i] .leaf)) | |||
651 | vmh.vmh_cpuids[i].leaf,__asm volatile("cpuid" : "=a" (vmh.vmh_cpuids[i].a), "=b" (vmh .vmh_cpuids[i].b), "=c" (vmh.vmh_cpuids[i].c), "=d" (vmh.vmh_cpuids [i].d) : "a" (vmh.vmh_cpuids[i].code), "c" (vmh.vmh_cpuids[i] .leaf)) | |||
652 | vmh.vmh_cpuids[i].a,__asm volatile("cpuid" : "=a" (vmh.vmh_cpuids[i].a), "=b" (vmh .vmh_cpuids[i].b), "=c" (vmh.vmh_cpuids[i].c), "=d" (vmh.vmh_cpuids [i].d) : "a" (vmh.vmh_cpuids[i].code), "c" (vmh.vmh_cpuids[i] .leaf)) | |||
653 | vmh.vmh_cpuids[i].b,__asm volatile("cpuid" : "=a" (vmh.vmh_cpuids[i].a), "=b" (vmh .vmh_cpuids[i].b), "=c" (vmh.vmh_cpuids[i].c), "=d" (vmh.vmh_cpuids [i].d) : "a" (vmh.vmh_cpuids[i].code), "c" (vmh.vmh_cpuids[i] .leaf)) | |||
654 | vmh.vmh_cpuids[i].c,__asm volatile("cpuid" : "=a" (vmh.vmh_cpuids[i].a), "=b" (vmh .vmh_cpuids[i].b), "=c" (vmh.vmh_cpuids[i].c), "=d" (vmh.vmh_cpuids [i].d) : "a" (vmh.vmh_cpuids[i].code), "c" (vmh.vmh_cpuids[i] .leaf)) | |||
655 | vmh.vmh_cpuids[i].d)__asm volatile("cpuid" : "=a" (vmh.vmh_cpuids[i].a), "=b" (vmh .vmh_cpuids[i].b), "=c" (vmh.vmh_cpuids[i].c), "=d" (vmh.vmh_cpuids [i].d) : "a" (vmh.vmh_cpuids[i].code), "c" (vmh.vmh_cpuids[i] .leaf)); | |||
656 | } | |||
657 | ||||
658 | if (atomicio(vwrite(ssize_t (*)(int, void *, size_t))write, fd, &vmh, sizeof(vmh)) != sizeof(vmh)) | |||
659 | return (-1); | |||
660 | ||||
661 | return (0); | |||
662 | } | |||
663 | ||||
664 | int | |||
665 | dump_mem(int fd, struct vm_create_params *vcp) | |||
666 | { | |||
667 | unsigned int i; | |||
668 | int ret; | |||
669 | struct vm_mem_range *vmr; | |||
670 | ||||
671 | for (i = 0; i < vcp->vcp_nmemranges; i++) { | |||
672 | vmr = &vcp->vcp_memranges[i]; | |||
673 | ret = dump_vmr(fd, vmr); | |||
674 | if (ret) | |||
675 | return ret; | |||
676 | } | |||
677 | return (0); | |||
678 | } | |||
679 | ||||
680 | int | |||
681 | restore_vm_params(int fd, struct vm_create_params *vcp) { | |||
682 | unsigned int i; | |||
683 | struct vm_rwvmparams_params vpp; | |||
684 | ||||
685 | for (i = 0; i < vcp->vcp_ncpus; i++) { | |||
686 | if (atomicio(read, fd, &vpp, sizeof(vpp)) != sizeof(vpp)) { | |||
687 | log_warn("%s: error restoring vm params", __func__); | |||
688 | return (-1); | |||
689 | } | |||
690 | vpp.vpp_vm_id = vcp->vcp_id; | |||
691 | vpp.vpp_vcpu_id = i; | |||
692 | if (ioctl(env->vmd_fd, VMM_IOC_WRITEVMPARAMS((unsigned long)0x80000000 | ((sizeof(struct vm_rwvmparams_params ) & 0x1fff) << 16) | ((('V')) << 8) | ((10))), &vpp) < 0) { | |||
693 | log_debug("%s: writing vm params failed", __func__); | |||
694 | return (-1); | |||
695 | } | |||
696 | } | |||
697 | return (0); | |||
698 | } | |||
699 | ||||
700 | void | |||
701 | restore_mem(int fd, struct vm_create_params *vcp) | |||
702 | { | |||
703 | unsigned int i; | |||
704 | struct vm_mem_range *vmr; | |||
705 | ||||
706 | for (i = 0; i < vcp->vcp_nmemranges; i++) { | |||
707 | vmr = &vcp->vcp_memranges[i]; | |||
708 | restore_vmr(fd, vmr); | |||
709 | } | |||
710 | } | |||
711 | ||||
712 | int | |||
713 | dump_vmr(int fd, struct vm_mem_range *vmr) | |||
714 | { | |||
715 | size_t rem = vmr->vmr_size, read=0; | |||
716 | char buf[PAGE_SIZE(1 << 12)]; | |||
717 | ||||
718 | while (rem > 0) { | |||
719 | if (read_mem(vmr->vmr_gpa + read, buf, PAGE_SIZE(1 << 12))) { | |||
720 | log_warn("failed to read vmr"); | |||
721 | return (-1); | |||
722 | } | |||
723 | if (atomicio(vwrite(ssize_t (*)(int, void *, size_t))write, fd, buf, sizeof(buf)) != sizeof(buf)) { | |||
724 | log_warn("failed to dump vmr"); | |||
725 | return (-1); | |||
726 | } | |||
727 | rem = rem - PAGE_SIZE(1 << 12); | |||
728 | read = read + PAGE_SIZE(1 << 12); | |||
729 | } | |||
730 | return (0); | |||
731 | } | |||
732 | ||||
733 | void | |||
734 | restore_vmr(int fd, struct vm_mem_range *vmr) | |||
735 | { | |||
736 | size_t rem = vmr->vmr_size, wrote=0; | |||
737 | char buf[PAGE_SIZE(1 << 12)]; | |||
738 | ||||
739 | while (rem > 0) { | |||
740 | if (atomicio(read, fd, buf, sizeof(buf)) != sizeof(buf)) | |||
741 | fatal("failed to restore vmr"); | |||
742 | if (write_mem(vmr->vmr_gpa + wrote, buf, PAGE_SIZE(1 << 12))) | |||
743 | fatal("failed to write vmr"); | |||
744 | rem = rem - PAGE_SIZE(1 << 12); | |||
745 | wrote = wrote + PAGE_SIZE(1 << 12); | |||
746 | } | |||
747 | } | |||
748 | ||||
749 | void | |||
750 | pause_vm(struct vm_create_params *vcp) | |||
751 | { | |||
752 | unsigned int n; | |||
753 | int ret; | |||
754 | if (current_vm->vm_state & VM_STATE_PAUSED0x10) | |||
755 | return; | |||
756 | ||||
757 | current_vm->vm_state |= VM_STATE_PAUSED0x10; | |||
758 | ||||
759 | ret = pthread_barrier_init(&vm_pause_barrier, NULL((void*)0), vcp->vcp_ncpus + 1); | |||
760 | if (ret) { | |||
761 | log_warnx("%s: cannot initialize pause barrier (%d)", | |||
762 | __progname, ret); | |||
763 | return; | |||
764 | } | |||
765 | ||||
766 | for (n = 0; n < vcp->vcp_ncpus; n++) { | |||
767 | ret = pthread_cond_broadcast(&vcpu_run_cond[n]); | |||
768 | if (ret) { | |||
769 | log_warnx("%s: can't broadcast vcpu run cond (%d)", | |||
770 | __func__, (int)ret); | |||
771 | return; | |||
772 | } | |||
773 | } | |||
774 | ret = pthread_barrier_wait(&vm_pause_barrier); | |||
775 | if (ret != 0 && ret != PTHREAD_BARRIER_SERIAL_THREAD-1) { | |||
776 | log_warnx("%s: could not wait on pause barrier (%d)", | |||
777 | __func__, (int)ret); | |||
778 | return; | |||
779 | } | |||
780 | ||||
781 | ret = pthread_barrier_destroy(&vm_pause_barrier); | |||
782 | if (ret) { | |||
783 | log_warnx("%s: could not destroy pause barrier (%d)", | |||
784 | __progname, ret); | |||
785 | return; | |||
786 | } | |||
787 | ||||
788 | i8253_stop(); | |||
789 | mc146818_stop(); | |||
790 | ns8250_stop(); | |||
791 | virtio_stop(vcp); | |||
792 | } | |||
793 | ||||
794 | void | |||
795 | unpause_vm(struct vm_create_params *vcp) | |||
796 | { | |||
797 | unsigned int n; | |||
798 | int ret; | |||
799 | if (!(current_vm->vm_state & VM_STATE_PAUSED0x10)) | |||
800 | return; | |||
801 | ||||
802 | current_vm->vm_state &= ~VM_STATE_PAUSED0x10; | |||
803 | for (n = 0; n < vcp->vcp_ncpus; n++) { | |||
804 | ret = pthread_cond_broadcast(&vcpu_unpause_cond[n]); | |||
805 | if (ret) { | |||
806 | log_warnx("%s: can't broadcast vcpu unpause cond (%d)", | |||
807 | __func__, (int)ret); | |||
808 | return; | |||
809 | } | |||
810 | } | |||
811 | ||||
812 | i8253_start(); | |||
813 | mc146818_start(); | |||
814 | ns8250_start(); | |||
815 | virtio_start(vcp); | |||
816 | } | |||
817 | ||||
818 | /* | |||
819 | * vcpu_reset | |||
820 | * | |||
821 | * Requests vmm(4) to reset the VCPUs in the indicated VM to | |||
822 | * the register state provided | |||
823 | * | |||
824 | * Parameters | |||
825 | * vmid: VM ID to reset | |||
826 | * vcpu_id: VCPU ID to reset | |||
827 | * vrs: the register state to initialize | |||
828 | * | |||
829 | * Return values: | |||
830 | * 0: success | |||
831 | * !0 : ioctl to vmm(4) failed (eg, ENOENT if the supplied VM ID is not | |||
832 | * valid) | |||
833 | */ | |||
834 | int | |||
835 | vcpu_reset(uint32_t vmid, uint32_t vcpu_id, struct vcpu_reg_state *vrs) | |||
836 | { | |||
837 | struct vm_resetcpu_params vrp; | |||
838 | ||||
839 | memset(&vrp, 0, sizeof(vrp)); | |||
840 | vrp.vrp_vm_id = vmid; | |||
841 | vrp.vrp_vcpu_id = vcpu_id; | |||
842 | memcpy(&vrp.vrp_init_state, vrs, sizeof(struct vcpu_reg_state)); | |||
843 | ||||
844 | log_debug("%s: resetting vcpu %d for vm %d", __func__, vcpu_id, vmid); | |||
845 | ||||
846 | if (ioctl(env->vmd_fd, VMM_IOC_RESETCPU((unsigned long)0x80000000 | ((sizeof(struct vm_resetcpu_params ) & 0x1fff) << 16) | ((('V')) << 8) | ((5))), &vrp) == -1) | |||
847 | return (errno(*__errno())); | |||
848 | ||||
849 | return (0); | |||
850 | } | |||
851 | ||||
852 | /* | |||
853 | * create_memory_map | |||
854 | * | |||
855 | * Sets up the guest physical memory ranges that the VM can access. | |||
856 | * | |||
857 | * Parameters: | |||
858 | * vcp: VM create parameters describing the VM whose memory map | |||
859 | * is being created | |||
860 | * | |||
861 | * Return values: | |||
862 | * nothing | |||
863 | */ | |||
864 | void | |||
865 | create_memory_map(struct vm_create_params *vcp) | |||
866 | { | |||
867 | size_t len, mem_bytes, mem_mb; | |||
868 | ||||
869 | mem_mb = vcp->vcp_memranges[0].vmr_size; | |||
870 | vcp->vcp_nmemranges = 0; | |||
871 | if (mem_mb < 1 || mem_mb > VMM_MAX_VM_MEM_SIZE32768) | |||
872 | return; | |||
873 | ||||
874 | mem_bytes = mem_mb * 1024 * 1024; | |||
875 | ||||
876 | /* First memory region: 0 - LOWMEM_KB (DOS low mem) */ | |||
877 | len = LOWMEM_KB640 * 1024; | |||
878 | vcp->vcp_memranges[0].vmr_gpa = 0x0; | |||
879 | vcp->vcp_memranges[0].vmr_size = len; | |||
880 | mem_bytes -= len; | |||
881 | ||||
882 | /* | |||
883 | * Second memory region: LOWMEM_KB - 1MB. | |||
884 | * | |||
885 | * N.B. - Normally ROMs or parts of video RAM are mapped here. | |||
886 | * We have to add this region, because some systems | |||
887 | * unconditionally write to 0xb8000 (VGA RAM), and | |||
888 | * we need to make sure that vmm(4) permits accesses | |||
889 | * to it. So allocate guest memory for it. | |||
890 | */ | |||
891 | len = 0x100000 - LOWMEM_KB640 * 1024; | |||
892 | vcp->vcp_memranges[1].vmr_gpa = LOWMEM_KB640 * 1024; | |||
893 | vcp->vcp_memranges[1].vmr_size = len; | |||
894 | mem_bytes -= len; | |||
895 | ||||
896 | /* Make sure that we do not place physical memory into MMIO ranges. */ | |||
897 | if (mem_bytes > VMM_PCI_MMIO_BAR_BASE0xF0000000ULL - 0x100000) | |||
898 | len = VMM_PCI_MMIO_BAR_BASE0xF0000000ULL - 0x100000; | |||
899 | else | |||
900 | len = mem_bytes; | |||
901 | ||||
902 | /* Third memory region: 1MB - (1MB + len) */ | |||
903 | vcp->vcp_memranges[2].vmr_gpa = 0x100000; | |||
904 | vcp->vcp_memranges[2].vmr_size = len; | |||
905 | mem_bytes -= len; | |||
906 | ||||
907 | if (mem_bytes > 0) { | |||
908 | /* Fourth memory region for the remaining memory (if any) */ | |||
909 | vcp->vcp_memranges[3].vmr_gpa = VMM_PCI_MMIO_BAR_END0xFFFFFFFFULL + 1; | |||
910 | vcp->vcp_memranges[3].vmr_size = mem_bytes; | |||
911 | vcp->vcp_nmemranges = 4; | |||
912 | } else | |||
913 | vcp->vcp_nmemranges = 3; | |||
914 | } | |||
915 | ||||
916 | /* | |||
917 | * alloc_guest_mem | |||
918 | * | |||
919 | * Allocates memory for the guest. | |||
920 | * Instead of doing a single allocation with one mmap(), we allocate memory | |||
921 | * separately for every range for the following reasons: | |||
922 | * - ASLR for the individual ranges | |||
923 | * - to reduce memory consumption in the UVM subsystem: if vmm(4) had to | |||
924 | * map the single mmap'd userspace memory to the individual guest physical | |||
925 | * memory ranges, the underlying amap of the single mmap'd range would have | |||
926 | * to allocate per-page reference counters. The reason is that the | |||
927 | * individual guest physical ranges would reference the single mmap'd region | |||
928 | * only partially. However, if every guest physical range has its own | |||
929 | * corresponding mmap'd userspace allocation, there are no partial | |||
930 | * references: every guest physical range fully references an mmap'd | |||
931 | * range => no per-page reference counters have to be allocated. | |||
932 | * | |||
933 | * Return values: | |||
934 | * 0: success | |||
935 | * !0: failure - errno indicating the source of the failure | |||
936 | */ | |||
937 | int | |||
938 | alloc_guest_mem(struct vm_create_params *vcp) | |||
939 | { | |||
940 | void *p; | |||
941 | int ret; | |||
942 | size_t i, j; | |||
943 | struct vm_mem_range *vmr; | |||
944 | ||||
945 | for (i = 0; i < vcp->vcp_nmemranges; i++) { | |||
946 | vmr = &vcp->vcp_memranges[i]; | |||
947 | p = mmap(NULL((void*)0), vmr->vmr_size, PROT_READ0x01 | PROT_WRITE0x02, | |||
948 | MAP_PRIVATE0x0002 | MAP_ANON0x1000, -1, 0); | |||
949 | if (p == MAP_FAILED((void *)-1)) { | |||
950 | ret = errno(*__errno()); | |||
951 | for (j = 0; j < i; j++) { | |||
952 | vmr = &vcp->vcp_memranges[j]; | |||
953 | munmap((void *)vmr->vmr_va, vmr->vmr_size); | |||
954 | } | |||
955 | ||||
956 | return (ret); | |||
957 | } | |||
958 | ||||
959 | vmr->vmr_va = (vaddr_t)p; | |||
960 | } | |||
961 | ||||
962 | return (0); | |||
963 | } | |||
964 | ||||
965 | /* | |||
966 | * vmm_create_vm | |||
967 | * | |||
968 | * Requests vmm(4) to create a new VM using the supplied creation | |||
969 | * parameters. This operation results in the creation of the in-kernel | |||
970 | * structures for the VM, but does not start the VM's vcpu(s). | |||
971 | * | |||
972 | * Parameters: | |||
973 | * vcp: vm_create_params struct containing the VM's desired creation | |||
974 | * configuration | |||
975 | * | |||
976 | * Return values: | |||
977 | * 0: success | |||
978 | * !0 : ioctl to vmm(4) failed | |||
979 | */ | |||
980 | int | |||
981 | vmm_create_vm(struct vm_create_params *vcp) | |||
982 | { | |||
983 | /* Sanity check arguments */ | |||
984 | if (vcp->vcp_ncpus > VMM_MAX_VCPUS_PER_VM64) | |||
985 | return (EINVAL22); | |||
986 | ||||
987 | if (vcp->vcp_nmemranges == 0 || | |||
988 | vcp->vcp_nmemranges > VMM_MAX_MEM_RANGES16) | |||
989 | return (EINVAL22); | |||
990 | ||||
991 | if (vcp->vcp_ndisks > VMM_MAX_DISKS_PER_VM4) | |||
992 | return (EINVAL22); | |||
993 | ||||
994 | if (vcp->vcp_nnics > VMM_MAX_NICS_PER_VM4) | |||
995 | return (EINVAL22); | |||
996 | ||||
997 | if (ioctl(env->vmd_fd, VMM_IOC_CREATE(((unsigned long)0x80000000|(unsigned long)0x40000000) | ((sizeof (struct vm_create_params) & 0x1fff) << 16) | ((('V' )) << 8) | ((1))), vcp) == -1) | |||
998 | return (errno(*__errno())); | |||
999 | ||||
1000 | return (0); | |||
1001 | } | |||
1002 | ||||
1003 | /* | |||
1004 | * init_emulated_hw | |||
1005 | * | |||
1006 | * Initializes the userspace hardware emulation | |||
1007 | */ | |||
1008 | void | |||
1009 | init_emulated_hw(struct vmop_create_params *vmc, int child_cdrom, | |||
1010 | int child_disks[][VM_MAX_BASE_PER_DISK4], int *child_taps) | |||
1011 | { | |||
1012 | struct vm_create_params *vcp = &vmc->vmc_params; | |||
1013 | int i; | |||
1014 | uint64_t memlo, memhi; | |||
1015 | ||||
1016 | /* Calculate memory size for NVRAM registers */ | |||
1017 | memlo = memhi = 0; | |||
1018 | if (vcp->vcp_nmemranges > 2) | |||
1019 | memlo = vcp->vcp_memranges[2].vmr_size - 15 * 0x100000; | |||
1020 | ||||
1021 | if (vcp->vcp_nmemranges > 3) | |||
1022 | memhi = vcp->vcp_memranges[3].vmr_size; | |||
1023 | ||||
1024 | /* Reset the IO port map */ | |||
1025 | memset(&ioports_map, 0, sizeof(io_fn_t) * MAX_PORTS65536); | |||
1026 | ||||
1027 | /* Init i8253 PIT */ | |||
1028 | i8253_init(vcp->vcp_id); | |||
1029 | ioports_map[TIMER_CTRL0x43] = vcpu_exit_i8253; | |||
1030 | ioports_map[TIMER_BASE0x40 + TIMER_CNTR00] = vcpu_exit_i8253; | |||
1031 | ioports_map[TIMER_BASE0x40 + TIMER_CNTR11] = vcpu_exit_i8253; | |||
1032 | ioports_map[TIMER_BASE0x40 + TIMER_CNTR22] = vcpu_exit_i8253; | |||
1033 | ioports_map[PCKBC_AUX0x61] = vcpu_exit_i8253_misc; | |||
1034 | ||||
1035 | /* Init mc146818 RTC */ | |||
1036 | mc146818_init(vcp->vcp_id, memlo, memhi); | |||
1037 | ioports_map[IO_RTC0x070] = vcpu_exit_mc146818; | |||
1038 | ioports_map[IO_RTC0x070 + 1] = vcpu_exit_mc146818; | |||
1039 | ||||
1040 | /* Init master and slave PICs */ | |||
1041 | i8259_init(); | |||
1042 | ioports_map[IO_ICU10x020] = vcpu_exit_i8259; | |||
1043 | ioports_map[IO_ICU10x020 + 1] = vcpu_exit_i8259; | |||
1044 | ioports_map[IO_ICU20x0A0] = vcpu_exit_i8259; | |||
1045 | ioports_map[IO_ICU20x0A0 + 1] = vcpu_exit_i8259; | |||
1046 | ioports_map[ELCR00x4D0] = vcpu_exit_elcr; | |||
1047 | ioports_map[ELCR10x4D1] = vcpu_exit_elcr; | |||
1048 | ||||
1049 | /* Init ns8250 UART */ | |||
1050 | ns8250_init(con_fd, vcp->vcp_id); | |||
1051 | for (i = COM1_DATA0x3f8 +0; i <= COM1_SCR0x3f8 +7; i++) | |||
1052 | ioports_map[i] = vcpu_exit_com; | |||
1053 | ||||
1054 | /* Init QEMU fw_cfg interface */ | |||
1055 | fw_cfg_init(vmc); | |||
1056 | ioports_map[FW_CFG_IO_SELECT0x510] = vcpu_exit_fw_cfg; | |||
1057 | ioports_map[FW_CFG_IO_DATA0x511] = vcpu_exit_fw_cfg; | |||
1058 | ioports_map[FW_CFG_IO_DMA_ADDR_HIGH0x514] = vcpu_exit_fw_cfg_dma; | |||
1059 | ioports_map[FW_CFG_IO_DMA_ADDR_LOW0x518] = vcpu_exit_fw_cfg_dma; | |||
1060 | ||||
1061 | /* Initialize PCI */ | |||
1062 | for (i = VMM_PCI_IO_BAR_BASE0x1000; i <= VMM_PCI_IO_BAR_END0xFFFF; i++) | |||
1063 | ioports_map[i] = vcpu_exit_pci; | |||
1064 | ||||
1065 | ioports_map[PCI_MODE1_ADDRESS_REG0x0cf8] = vcpu_exit_pci; | |||
1066 | ioports_map[PCI_MODE1_DATA_REG0x0cfc] = vcpu_exit_pci; | |||
1067 | ioports_map[PCI_MODE1_DATA_REG0x0cfc + 1] = vcpu_exit_pci; | |||
1068 | ioports_map[PCI_MODE1_DATA_REG0x0cfc + 2] = vcpu_exit_pci; | |||
1069 | ioports_map[PCI_MODE1_DATA_REG0x0cfc + 3] = vcpu_exit_pci; | |||
1070 | pci_init(); | |||
1071 | ||||
1072 | /* Initialize virtio devices */ | |||
1073 | virtio_init(current_vm, child_cdrom, child_disks, child_taps); | |||
1074 | } | |||
1075 | /* | |||
1076 | * restore_emulated_hw | |||
1077 | * | |||
1078 | * Restores the userspace hardware emulation from fd | |||
1079 | */ | |||
1080 | void | |||
1081 | restore_emulated_hw(struct vm_create_params *vcp, int fd, | |||
1082 | int *child_taps, int child_disks[][VM_MAX_BASE_PER_DISK4], int child_cdrom) | |||
1083 | { | |||
1084 | /* struct vm_create_params *vcp = &vmc->vmc_params; */ | |||
1085 | int i; | |||
1086 | memset(&ioports_map, 0, sizeof(io_fn_t) * MAX_PORTS65536); | |||
1087 | ||||
1088 | /* Init i8253 PIT */ | |||
1089 | i8253_restore(fd, vcp->vcp_id); | |||
1090 | ioports_map[TIMER_CTRL0x43] = vcpu_exit_i8253; | |||
1091 | ioports_map[TIMER_BASE0x40 + TIMER_CNTR00] = vcpu_exit_i8253; | |||
1092 | ioports_map[TIMER_BASE0x40 + TIMER_CNTR11] = vcpu_exit_i8253; | |||
1093 | ioports_map[TIMER_BASE0x40 + TIMER_CNTR22] = vcpu_exit_i8253; | |||
1094 | ||||
1095 | /* Init master and slave PICs */ | |||
1096 | i8259_restore(fd); | |||
1097 | ioports_map[IO_ICU10x020] = vcpu_exit_i8259; | |||
1098 | ioports_map[IO_ICU10x020 + 1] = vcpu_exit_i8259; | |||
1099 | ioports_map[IO_ICU20x0A0] = vcpu_exit_i8259; | |||
1100 | ioports_map[IO_ICU20x0A0 + 1] = vcpu_exit_i8259; | |||
1101 | ||||
1102 | /* Init ns8250 UART */ | |||
1103 | ns8250_restore(fd, con_fd, vcp->vcp_id); | |||
1104 | for (i = COM1_DATA0x3f8 +0; i <= COM1_SCR0x3f8 +7; i++) | |||
1105 | ioports_map[i] = vcpu_exit_com; | |||
1106 | ||||
1107 | /* Init mc146818 RTC */ | |||
1108 | mc146818_restore(fd, vcp->vcp_id); | |||
1109 | ioports_map[IO_RTC0x070] = vcpu_exit_mc146818; | |||
1110 | ioports_map[IO_RTC0x070 + 1] = vcpu_exit_mc146818; | |||
1111 | ||||
1112 | /* Init QEMU fw_cfg interface */ | |||
1113 | fw_cfg_restore(fd); | |||
1114 | ioports_map[FW_CFG_IO_SELECT0x510] = vcpu_exit_fw_cfg; | |||
1115 | ioports_map[FW_CFG_IO_DATA0x511] = vcpu_exit_fw_cfg; | |||
1116 | ioports_map[FW_CFG_IO_DMA_ADDR_HIGH0x514] = vcpu_exit_fw_cfg_dma; | |||
1117 | ioports_map[FW_CFG_IO_DMA_ADDR_LOW0x518] = vcpu_exit_fw_cfg_dma; | |||
1118 | ||||
1119 | /* Initialize PCI */ | |||
1120 | for (i = VMM_PCI_IO_BAR_BASE0x1000; i <= VMM_PCI_IO_BAR_END0xFFFF; i++) | |||
1121 | ioports_map[i] = vcpu_exit_pci; | |||
1122 | ||||
1123 | ioports_map[PCI_MODE1_ADDRESS_REG0x0cf8] = vcpu_exit_pci; | |||
1124 | ioports_map[PCI_MODE1_DATA_REG0x0cfc] = vcpu_exit_pci; | |||
1125 | ioports_map[PCI_MODE1_DATA_REG0x0cfc + 1] = vcpu_exit_pci; | |||
1126 | ioports_map[PCI_MODE1_DATA_REG0x0cfc + 2] = vcpu_exit_pci; | |||
1127 | ioports_map[PCI_MODE1_DATA_REG0x0cfc + 3] = vcpu_exit_pci; | |||
1128 | pci_restore(fd); | |||
1129 | virtio_restore(fd, current_vm, child_cdrom, child_disks, child_taps); | |||
1130 | } | |||
1131 | ||||
1132 | /* | |||
1133 | * run_vm | |||
1134 | * | |||
1135 | * Runs the VM whose creation parameters are specified in vcp | |||
1136 | * | |||
1137 | * Parameters: | |||
1138 | * child_cdrom: previously-opened child ISO disk file descriptor | |||
1139 | * child_disks: previously-opened child VM disk file file descriptors | |||
1140 | * child_taps: previously-opened child tap file descriptors | |||
1141 | * vmc: vmop_create_params struct containing the VM's desired creation | |||
1142 | * configuration | |||
1143 | * vrs: VCPU register state to initialize | |||
1144 | * | |||
1145 | * Return values: | |||
1146 | * 0: the VM exited normally | |||
1147 | * !0 : the VM exited abnormally or failed to start | |||
1148 | */ | |||
1149 | int | |||
1150 | run_vm(int child_cdrom, int child_disks[][VM_MAX_BASE_PER_DISK4], | |||
1151 | int *child_taps, struct vmop_create_params *vmc, | |||
1152 | struct vcpu_reg_state *vrs) | |||
1153 | { | |||
1154 | struct vm_create_params *vcp = &vmc->vmc_params; | |||
1155 | struct vm_rwregs_params vregsp; | |||
1156 | uint8_t evdone = 0; | |||
1157 | size_t i; | |||
1158 | int ret; | |||
1159 | pthread_t *tid, evtid; | |||
1160 | struct vm_run_params **vrp; | |||
1161 | void *exit_status; | |||
1162 | ||||
1163 | if (vcp == NULL((void*)0)) | |||
| ||||
1164 | return (EINVAL22); | |||
1165 | ||||
1166 | if (child_cdrom == -1 && strlen(vcp->vcp_cdrom)) | |||
1167 | return (EINVAL22); | |||
1168 | ||||
1169 | if (child_disks == NULL((void*)0) && vcp->vcp_ndisks != 0) | |||
1170 | return (EINVAL22); | |||
1171 | ||||
1172 | if (child_taps == NULL((void*)0) && vcp->vcp_nnics != 0) | |||
1173 | return (EINVAL22); | |||
1174 | ||||
1175 | if (vcp->vcp_ncpus > VMM_MAX_VCPUS_PER_VM64) | |||
1176 | return (EINVAL22); | |||
1177 | ||||
1178 | if (vcp->vcp_ndisks > VMM_MAX_DISKS_PER_VM4) | |||
1179 | return (EINVAL22); | |||
1180 | ||||
1181 | if (vcp->vcp_nnics > VMM_MAX_NICS_PER_VM4) | |||
1182 | return (EINVAL22); | |||
1183 | ||||
1184 | if (vcp->vcp_nmemranges == 0 || | |||
1185 | vcp->vcp_nmemranges > VMM_MAX_MEM_RANGES16) | |||
1186 | return (EINVAL22); | |||
1187 | ||||
1188 | tid = calloc(vcp->vcp_ncpus, sizeof(pthread_t)); | |||
1189 | vrp = calloc(vcp->vcp_ncpus, sizeof(struct vm_run_params *)); | |||
1190 | if (tid == NULL((void*)0) || vrp == NULL((void*)0)) { | |||
1191 | log_warn("%s: memory allocation error - exiting.", | |||
| ||||
1192 | __progname); | |||
1193 | return (ENOMEM12); | |||
1194 | } | |||
1195 | ||||
1196 | log_debug("%s: initializing hardware for vm %s", __func__, | |||
1197 | vcp->vcp_name); | |||
1198 | ||||
1199 | if (!(current_vm->vm_state & VM_STATE_RECEIVED0x08)) | |||
1200 | init_emulated_hw(vmc, child_cdrom, child_disks, child_taps); | |||
1201 | ||||
1202 | ret = pthread_mutex_init(&threadmutex, NULL((void*)0)); | |||
1203 | if (ret) { | |||
1204 | log_warn("%s: could not initialize thread state mutex", | |||
1205 | __func__); | |||
1206 | return (ret); | |||
1207 | } | |||
1208 | ret = pthread_cond_init(&threadcond, NULL((void*)0)); | |||
1209 | if (ret) { | |||
1210 | log_warn("%s: could not initialize thread state " | |||
1211 | "condition variable", __func__); | |||
1212 | return (ret); | |||
1213 | } | |||
1214 | ||||
1215 | mutex_lock(&threadmutex); | |||
1216 | ||||
1217 | log_debug("%s: starting vcpu threads for vm %s", __func__, | |||
1218 | vcp->vcp_name); | |||
1219 | ||||
1220 | /* | |||
1221 | * Create and launch one thread for each VCPU. These threads may | |||
1222 | * migrate between PCPUs over time; the need to reload CPU state | |||
1223 | * in such situations is detected and performed by vmm(4) in the | |||
1224 | * kernel. | |||
1225 | */ | |||
1226 | for (i = 0 ; i < vcp->vcp_ncpus; i++) { | |||
1227 | vrp[i] = malloc(sizeof(struct vm_run_params)); | |||
1228 | if (vrp[i] == NULL((void*)0)) { | |||
1229 | log_warn("%s: memory allocation error - " | |||
1230 | "exiting.", __progname); | |||
1231 | /* caller will exit, so skip freeing */ | |||
1232 | return (ENOMEM12); | |||
1233 | } | |||
1234 | vrp[i]->vrp_exit = malloc(sizeof(struct vm_exit)); | |||
1235 | if (vrp[i]->vrp_exit == NULL((void*)0)) { | |||
1236 | log_warn("%s: memory allocation error - " | |||
1237 | "exiting.", __progname); | |||
1238 | /* caller will exit, so skip freeing */ | |||
1239 | return (ENOMEM12); | |||
1240 | } | |||
1241 | vrp[i]->vrp_vm_id = vcp->vcp_id; | |||
1242 | vrp[i]->vrp_vcpu_id = i; | |||
1243 | ||||
1244 | if (vcpu_reset(vcp->vcp_id, i, vrs)) { | |||
1245 | log_warnx("%s: cannot reset VCPU %zu - exiting.", | |||
1246 | __progname, i); | |||
1247 | return (EIO5); | |||
1248 | } | |||
1249 | ||||
1250 | /* once more because reset_cpu changes regs */ | |||
1251 | if (current_vm->vm_state & VM_STATE_RECEIVED0x08) { | |||
1252 | vregsp.vrwp_vm_id = vcp->vcp_id; | |||
1253 | vregsp.vrwp_vcpu_id = i; | |||
1254 | vregsp.vrwp_regs = *vrs; | |||
1255 | vregsp.vrwp_mask = VM_RWREGS_ALL(0x1 | 0x2 | 0x4 | 0x8 | 0x10); | |||
1256 | if ((ret = ioctl(env->vmd_fd, VMM_IOC_WRITEREGS((unsigned long)0x80000000 | ((sizeof(struct vm_rwregs_params ) & 0x1fff) << 16) | ((('V')) << 8) | ((8))), | |||
1257 | &vregsp)) == -1) { | |||
1258 | log_warn("%s: writeregs failed", __func__); | |||
1259 | return (ret); | |||
1260 | } | |||
1261 | } | |||
1262 | ||||
1263 | ret = pthread_cond_init(&vcpu_run_cond[i], NULL((void*)0)); | |||
1264 | if (ret) { | |||
1265 | log_warnx("%s: cannot initialize cond var (%d)", | |||
1266 | __progname, ret); | |||
1267 | return (ret); | |||
1268 | } | |||
1269 | ||||
1270 | ret = pthread_mutex_init(&vcpu_run_mtx[i], NULL((void*)0)); | |||
1271 | if (ret) { | |||
1272 | log_warnx("%s: cannot initialize mtx (%d)", | |||
1273 | __progname, ret); | |||
1274 | return (ret); | |||
1275 | } | |||
1276 | ||||
1277 | ret = pthread_cond_init(&vcpu_unpause_cond[i], NULL((void*)0)); | |||
1278 | if (ret) { | |||
1279 | log_warnx("%s: cannot initialize unpause var (%d)", | |||
1280 | __progname, ret); | |||
1281 | return (ret); | |||
1282 | } | |||
1283 | ||||
1284 | ret = pthread_mutex_init(&vcpu_unpause_mtx[i], NULL((void*)0)); | |||
1285 | if (ret) { | |||
1286 | log_warnx("%s: cannot initialize unpause mtx (%d)", | |||
1287 | __progname, ret); | |||
1288 | return (ret); | |||
1289 | } | |||
1290 | ||||
1291 | vcpu_hlt[i] = 0; | |||
1292 | ||||
1293 | /* Start each VCPU run thread at vcpu_run_loop */ | |||
1294 | ret = pthread_create(&tid[i], NULL((void*)0), vcpu_run_loop, vrp[i]); | |||
1295 | if (ret) { | |||
1296 | /* caller will _exit after this return */ | |||
1297 | ret = errno(*__errno()); | |||
1298 | log_warn("%s: could not create vcpu thread %zu", | |||
1299 | __func__, i); | |||
1300 | return (ret); | |||
1301 | } | |||
1302 | } | |||
1303 | ||||
1304 | log_debug("%s: waiting on events for VM %s", __func__, vcp->vcp_name); | |||
1305 | ret = pthread_create(&evtid, NULL((void*)0), event_thread, &evdone); | |||
1306 | if (ret) { | |||
1307 | errno(*__errno()) = ret; | |||
1308 | log_warn("%s: could not create event thread", __func__); | |||
1309 | return (ret); | |||
1310 | } | |||
1311 | ||||
1312 | for (;;) { | |||
1313 | ret = pthread_cond_wait(&threadcond, &threadmutex); | |||
1314 | if (ret) { | |||
1315 | log_warn("%s: waiting on thread state condition " | |||
1316 | "variable failed", __func__); | |||
1317 | return (ret); | |||
1318 | } | |||
1319 | ||||
1320 | /* | |||
1321 | * Did a VCPU thread exit with an error? => return the first one | |||
1322 | */ | |||
1323 | for (i = 0; i < vcp->vcp_ncpus; i++) { | |||
1324 | if (vcpu_done[i] == 0) | |||
1325 | continue; | |||
1326 | ||||
1327 | if (pthread_join(tid[i], &exit_status)) { | |||
1328 | log_warn("%s: failed to join thread %zd - " | |||
1329 | "exiting", __progname, i); | |||
1330 | return (EIO5); | |||
1331 | } | |||
1332 | ||||
1333 | ret = (intptr_t)exit_status; | |||
1334 | } | |||
1335 | ||||
1336 | /* Did the event thread exit? => return with an error */ | |||
1337 | if (evdone) { | |||
1338 | if (pthread_join(evtid, &exit_status)) { | |||
1339 | log_warn("%s: failed to join event thread - " | |||
1340 | "exiting", __progname); | |||
1341 | return (EIO5); | |||
1342 | } | |||
1343 | ||||
1344 | log_warnx("%s: vm %d event thread exited " | |||
1345 | "unexpectedly", __progname, vcp->vcp_id); | |||
1346 | return (EIO5); | |||
1347 | } | |||
1348 | ||||
1349 | /* Did all VCPU threads exit successfully? => return */ | |||
1350 | for (i = 0; i < vcp->vcp_ncpus; i++) { | |||
1351 | if (vcpu_done[i] == 0) | |||
1352 | break; | |||
1353 | } | |||
1354 | if (i == vcp->vcp_ncpus) | |||
1355 | return (ret); | |||
1356 | ||||
1357 | /* Some more threads to wait for, start over */ | |||
1358 | } | |||
1359 | ||||
1360 | return (ret); | |||
1361 | } | |||
1362 | ||||
1363 | void * | |||
1364 | event_thread(void *arg) | |||
1365 | { | |||
1366 | uint8_t *donep = arg; | |||
1367 | intptr_t ret; | |||
1368 | ||||
1369 | ret = event_dispatch(); | |||
1370 | ||||
1371 | mutex_lock(&threadmutex); | |||
1372 | *donep = 1; | |||
1373 | pthread_cond_signal(&threadcond); | |||
1374 | mutex_unlock(&threadmutex); | |||
1375 | ||||
1376 | return (void *)ret; | |||
1377 | } | |||
1378 | ||||
1379 | /* | |||
1380 | * vcpu_run_loop | |||
1381 | * | |||
1382 | * Runs a single VCPU until vmm(4) requires help handling an exit, | |||
1383 | * or the VM terminates. | |||
1384 | * | |||
1385 | * Parameters: | |||
1386 | * arg: vcpu_run_params for the VCPU being run by this thread | |||
1387 | * | |||
1388 | * Return values: | |||
1389 | * NULL: the VCPU shutdown properly | |||
1390 | * !NULL: error processing VCPU run, or the VCPU shutdown abnormally | |||
1391 | */ | |||
1392 | void * | |||
1393 | vcpu_run_loop(void *arg) | |||
1394 | { | |||
1395 | struct vm_run_params *vrp = (struct vm_run_params *)arg; | |||
1396 | intptr_t ret = 0; | |||
1397 | int irq; | |||
1398 | uint32_t n; | |||
1399 | ||||
1400 | vrp->vrp_continue = 0; | |||
1401 | n = vrp->vrp_vcpu_id; | |||
1402 | ||||
1403 | for (;;) { | |||
1404 | ret = pthread_mutex_lock(&vcpu_run_mtx[n]); | |||
1405 | ||||
1406 | if (ret) { | |||
1407 | log_warnx("%s: can't lock vcpu run mtx (%d)", | |||
1408 | __func__, (int)ret); | |||
1409 | return ((void *)ret); | |||
1410 | } | |||
1411 | ||||
1412 | /* If we are halted and need to pause, pause */ | |||
1413 | if (vcpu_hlt[n] && (current_vm->vm_state & VM_STATE_PAUSED0x10)) { | |||
1414 | ret = pthread_barrier_wait(&vm_pause_barrier); | |||
1415 | if (ret != 0 && ret != PTHREAD_BARRIER_SERIAL_THREAD-1) { | |||
1416 | log_warnx("%s: could not wait on pause barrier (%d)", | |||
1417 | __func__, (int)ret); | |||
1418 | return ((void *)ret); | |||
1419 | } | |||
1420 | ||||
1421 | ret = pthread_mutex_lock(&vcpu_unpause_mtx[n]); | |||
1422 | if (ret) { | |||
1423 | log_warnx("%s: can't lock vcpu unpause mtx (%d)", | |||
1424 | __func__, (int)ret); | |||
1425 | return ((void *)ret); | |||
1426 | } | |||
1427 | ||||
1428 | ret = pthread_cond_wait(&vcpu_unpause_cond[n], | |||
1429 | &vcpu_unpause_mtx[n]); | |||
1430 | if (ret) { | |||
1431 | log_warnx( | |||
1432 | "%s: can't wait on unpause cond (%d)", | |||
1433 | __func__, (int)ret); | |||
1434 | break; | |||
1435 | } | |||
1436 | ret = pthread_mutex_unlock(&vcpu_unpause_mtx[n]); | |||
1437 | if (ret) { | |||
1438 | log_warnx("%s: can't unlock unpause mtx (%d)", | |||
1439 | __func__, (int)ret); | |||
1440 | break; | |||
1441 | } | |||
1442 | } | |||
1443 | ||||
1444 | /* If we are halted and not paused, wait */ | |||
1445 | if (vcpu_hlt[n]) { | |||
1446 | ret = pthread_cond_wait(&vcpu_run_cond[n], | |||
1447 | &vcpu_run_mtx[n]); | |||
1448 | ||||
1449 | if (ret) { | |||
1450 | log_warnx( | |||
1451 | "%s: can't wait on cond (%d)", | |||
1452 | __func__, (int)ret); | |||
1453 | (void)pthread_mutex_unlock( | |||
1454 | &vcpu_run_mtx[n]); | |||
1455 | break; | |||
1456 | } | |||
1457 | } | |||
1458 | ||||
1459 | ret = pthread_mutex_unlock(&vcpu_run_mtx[n]); | |||
1460 | ||||
1461 | if (ret) { | |||
1462 | log_warnx("%s: can't unlock mutex on cond (%d)", | |||
1463 | __func__, (int)ret); | |||
1464 | break; | |||
1465 | } | |||
1466 | ||||
1467 | if (vrp->vrp_irqready && i8259_is_pending()) { | |||
1468 | irq = i8259_ack(); | |||
1469 | vrp->vrp_irq = irq; | |||
1470 | } else | |||
1471 | vrp->vrp_irq = 0xFFFF; | |||
1472 | ||||
1473 | /* Still more pending? */ | |||
1474 | if (i8259_is_pending()) { | |||
1475 | /* | |||
1476 | * XXX can probably avoid ioctls here by providing intr | |||
1477 | * in vrp | |||
1478 | */ | |||
1479 | if (vcpu_pic_intr(vrp->vrp_vm_id, | |||
1480 | vrp->vrp_vcpu_id, 1)) { | |||
1481 | fatal("can't set INTR"); | |||
1482 | } | |||
1483 | } else { | |||
1484 | if (vcpu_pic_intr(vrp->vrp_vm_id, | |||
1485 | vrp->vrp_vcpu_id, 0)) { | |||
1486 | fatal("can't clear INTR"); | |||
1487 | } | |||
1488 | } | |||
1489 | ||||
1490 | if (ioctl(env->vmd_fd, VMM_IOC_RUN(((unsigned long)0x80000000|(unsigned long)0x40000000) | ((sizeof (struct vm_run_params) & 0x1fff) << 16) | ((('V')) << 8) | ((2))), vrp) == -1) { | |||
1491 | /* If run ioctl failed, exit */ | |||
1492 | ret = errno(*__errno()); | |||
1493 | log_warn("%s: vm %d / vcpu %d run ioctl failed", | |||
1494 | __func__, vrp->vrp_vm_id, n); | |||
1495 | break; | |||
1496 | } | |||
1497 | ||||
1498 | /* If the VM is terminating, exit normally */ | |||
1499 | if (vrp->vrp_exit_reason == VM_EXIT_TERMINATED0xFFFE) { | |||
1500 | ret = (intptr_t)NULL((void*)0); | |||
1501 | break; | |||
1502 | } | |||
1503 | ||||
1504 | if (vrp->vrp_exit_reason != VM_EXIT_NONE0xFFFF) { | |||
1505 | /* | |||
1506 | * vmm(4) needs help handling an exit, handle in | |||
1507 | * vcpu_exit. | |||
1508 | */ | |||
1509 | ret = vcpu_exit(vrp); | |||
1510 | if (ret) | |||
1511 | break; | |||
1512 | } | |||
1513 | } | |||
1514 | ||||
1515 | mutex_lock(&threadmutex); | |||
1516 | vcpu_done[n] = 1; | |||
1517 | pthread_cond_signal(&threadcond); | |||
1518 | mutex_unlock(&threadmutex); | |||
1519 | ||||
1520 | return ((void *)ret); | |||
1521 | } | |||
1522 | ||||
1523 | int | |||
1524 | vcpu_pic_intr(uint32_t vm_id, uint32_t vcpu_id, uint8_t intr) | |||
1525 | { | |||
1526 | struct vm_intr_params vip; | |||
1527 | ||||
1528 | memset(&vip, 0, sizeof(vip)); | |||
1529 | ||||
1530 | vip.vip_vm_id = vm_id; | |||
1531 | vip.vip_vcpu_id = vcpu_id; /* XXX always 0? */ | |||
1532 | vip.vip_intr = intr; | |||
1533 | ||||
1534 | if (ioctl(env->vmd_fd, VMM_IOC_INTR((unsigned long)0x80000000 | ((sizeof(struct vm_intr_params) & 0x1fff) << 16) | ((('V')) << 8) | ((6))), &vip) == -1) | |||
1535 | return (errno(*__errno())); | |||
1536 | ||||
1537 | return (0); | |||
1538 | } | |||
1539 | ||||
1540 | /* | |||
1541 | * vcpu_exit_pci | |||
1542 | * | |||
1543 | * Handle all I/O to the emulated PCI subsystem. | |||
1544 | * | |||
1545 | * Parameters: | |||
1546 | * vrp: vcpu run paramters containing guest state for this exit | |||
1547 | * | |||
1548 | * Return value: | |||
1549 | * Interrupt to inject to the guest VM, or 0xFF if no interrupt should | |||
1550 | * be injected. | |||
1551 | */ | |||
1552 | uint8_t | |||
1553 | vcpu_exit_pci(struct vm_run_params *vrp) | |||
1554 | { | |||
1555 | struct vm_exit *vei = vrp->vrp_exit; | |||
1556 | uint8_t intr; | |||
1557 | ||||
1558 | intr = 0xFF; | |||
1559 | ||||
1560 | switch (vei->vei.vei_port) { | |||
1561 | case PCI_MODE1_ADDRESS_REG0x0cf8: | |||
1562 | pci_handle_address_reg(vrp); | |||
1563 | break; | |||
1564 | case PCI_MODE1_DATA_REG0x0cfc: | |||
1565 | case PCI_MODE1_DATA_REG0x0cfc + 1: | |||
1566 | case PCI_MODE1_DATA_REG0x0cfc + 2: | |||
1567 | case PCI_MODE1_DATA_REG0x0cfc + 3: | |||
1568 | pci_handle_data_reg(vrp); | |||
1569 | break; | |||
1570 | case VMM_PCI_IO_BAR_BASE0x1000 ... VMM_PCI_IO_BAR_END0xFFFF: | |||
1571 | intr = pci_handle_io(vrp); | |||
1572 | break; | |||
1573 | default: | |||
1574 | log_warnx("%s: unknown PCI register 0x%llx", | |||
1575 | __progname, (uint64_t)vei->vei.vei_port); | |||
1576 | break; | |||
1577 | } | |||
1578 | ||||
1579 | return (intr); | |||
1580 | } | |||
1581 | ||||
1582 | /* | |||
1583 | * vcpu_exit_inout | |||
1584 | * | |||
1585 | * Handle all I/O exits that need to be emulated in vmd. This includes the | |||
1586 | * i8253 PIT, the com1 ns8250 UART, and the MC146818 RTC/NVRAM device. | |||
1587 | * | |||
1588 | * Parameters: | |||
1589 | * vrp: vcpu run parameters containing guest state for this exit | |||
1590 | */ | |||
1591 | void | |||
1592 | vcpu_exit_inout(struct vm_run_params *vrp) | |||
1593 | { | |||
1594 | struct vm_exit *vei = vrp->vrp_exit; | |||
1595 | uint8_t intr = 0xFF; | |||
1596 | ||||
1597 | if (ioports_map[vei->vei.vei_port] != NULL((void*)0)) | |||
1598 | intr = ioports_map[vei->vei.vei_port](vrp); | |||
1599 | else if (vei->vei.vei_dir == VEI_DIR_IN) | |||
1600 | set_return_data(vei, 0xFFFFFFFF); | |||
1601 | ||||
1602 | if (intr != 0xFF) | |||
1603 | vcpu_assert_pic_irq(vrp->vrp_vm_id, vrp->vrp_vcpu_id, intr); | |||
1604 | } | |||
1605 | ||||
1606 | /* | |||
1607 | * vcpu_exit_eptviolation | |||
1608 | * | |||
1609 | * handle an EPT Violation | |||
1610 | * | |||
1611 | * Parameters: | |||
1612 | * vrp: vcpu run parameters containing guest state for this exit | |||
1613 | * | |||
1614 | * Return values: | |||
1615 | * 0: no action required | |||
1616 | * EAGAIN: a protection fault occured, kill the vm. | |||
1617 | */ | |||
1618 | int | |||
1619 | vcpu_exit_eptviolation(struct vm_run_params *vrp) | |||
1620 | { | |||
1621 | struct vm_exit *ve = vrp->vrp_exit; | |||
1622 | ||||
1623 | /* | |||
1624 | * vmd may be exiting to vmd to handle a pending interrupt | |||
1625 | * but last exit type may have been VMX_EXIT_EPT_VIOLATION, | |||
1626 | * check the fault_type to ensure we really are processing | |||
1627 | * a VMX_EXIT_EPT_VIOLATION. | |||
1628 | */ | |||
1629 | if (ve->vee.vee_fault_type == VEE_FAULT_PROTECT) { | |||
1630 | log_debug("%s: EPT Violation: rip=0x%llx", | |||
1631 | __progname, vrp->vrp_exit->vrs.vrs_gprs[VCPU_REGS_RIP16]); | |||
1632 | return (EAGAIN35); | |||
1633 | } | |||
1634 | ||||
1635 | return (0); | |||
1636 | } | |||
1637 | ||||
1638 | /* | |||
1639 | * vcpu_exit | |||
1640 | * | |||
1641 | * Handle a vcpu exit. This function is called when it is determined that | |||
1642 | * vmm(4) requires the assistance of vmd to support a particular guest | |||
1643 | * exit type (eg, accessing an I/O port or device). Guest state is contained | |||
1644 | * in 'vrp', and will be resent to vmm(4) on exit completion. | |||
1645 | * | |||
1646 | * Upon conclusion of handling the exit, the function determines if any | |||
1647 | * interrupts should be injected into the guest, and asserts the proper | |||
1648 | * IRQ line whose interrupt should be vectored. | |||
1649 | * | |||
1650 | * Parameters: | |||
1651 | * vrp: vcpu run parameters containing guest state for this exit | |||
1652 | * | |||
1653 | * Return values: | |||
1654 | * 0: the exit was handled successfully | |||
1655 | * 1: an error occurred (eg, unknown exit reason passed in 'vrp') | |||
1656 | */ | |||
1657 | int | |||
1658 | vcpu_exit(struct vm_run_params *vrp) | |||
1659 | { | |||
1660 | int ret; | |||
1661 | ||||
1662 | switch (vrp->vrp_exit_reason) { | |||
1663 | case VMX_EXIT_INT_WINDOW7: | |||
1664 | case SVM_VMEXIT_VINTR0x64: | |||
1665 | case VMX_EXIT_CPUID10: | |||
1666 | case VMX_EXIT_EXTINT1: | |||
1667 | case SVM_VMEXIT_INTR0x60: | |||
1668 | case SVM_VMEXIT_NPF0x400: | |||
1669 | case SVM_VMEXIT_MSR0x7C: | |||
1670 | case SVM_VMEXIT_CPUID0x72: | |||
1671 | /* | |||
1672 | * We may be exiting to vmd to handle a pending interrupt but | |||
1673 | * at the same time the last exit type may have been one of | |||
1674 | * these. In this case, there's nothing extra to be done | |||
1675 | * here (and falling through to the default case below results | |||
1676 | * in more vmd log spam). | |||
1677 | */ | |||
1678 | break; | |||
1679 | case VMX_EXIT_EPT_VIOLATION48: | |||
1680 | ret = vcpu_exit_eptviolation(vrp); | |||
1681 | if (ret) | |||
1682 | return (ret); | |||
1683 | ||||
1684 | break; | |||
1685 | case VMX_EXIT_IO30: | |||
1686 | case SVM_VMEXIT_IOIO0x7B: | |||
1687 | vcpu_exit_inout(vrp); | |||
1688 | break; | |||
1689 | case VMX_EXIT_HLT12: | |||
1690 | case SVM_VMEXIT_HLT0x78: | |||
1691 | ret = pthread_mutex_lock(&vcpu_run_mtx[vrp->vrp_vcpu_id]); | |||
1692 | if (ret) { | |||
1693 | log_warnx("%s: can't lock vcpu mutex (%d)", | |||
1694 | __func__, ret); | |||
1695 | return (ret); | |||
1696 | } | |||
1697 | vcpu_hlt[vrp->vrp_vcpu_id] = 1; | |||
1698 | ret = pthread_mutex_unlock(&vcpu_run_mtx[vrp->vrp_vcpu_id]); | |||
1699 | if (ret) { | |||
1700 | log_warnx("%s: can't unlock vcpu mutex (%d)", | |||
1701 | __func__, ret); | |||
1702 | return (ret); | |||
1703 | } | |||
1704 | break; | |||
1705 | case VMX_EXIT_TRIPLE_FAULT2: | |||
1706 | case SVM_VMEXIT_SHUTDOWN0x7F: | |||
1707 | /* reset VM */ | |||
1708 | return (EAGAIN35); | |||
1709 | default: | |||
1710 | log_debug("%s: unknown exit reason 0x%x", | |||
1711 | __progname, vrp->vrp_exit_reason); | |||
1712 | } | |||
1713 | ||||
1714 | vrp->vrp_continue = 1; | |||
1715 | ||||
1716 | return (0); | |||
1717 | } | |||
1718 | ||||
1719 | /* | |||
1720 | * find_gpa_range | |||
1721 | * | |||
1722 | * Search for a contiguous guest physical mem range. | |||
1723 | * | |||
1724 | * Parameters: | |||
1725 | * vcp: VM create parameters that contain the memory map to search in | |||
1726 | * gpa: the starting guest physical address | |||
1727 | * len: the length of the memory range | |||
1728 | * | |||
1729 | * Return values: | |||
1730 | * NULL: on failure if there is no memory range as described by the parameters | |||
1731 | * Pointer to vm_mem_range that contains the start of the range otherwise. | |||
1732 | */ | |||
1733 | static struct vm_mem_range * | |||
1734 | find_gpa_range(struct vm_create_params *vcp, paddr_t gpa, size_t len) | |||
1735 | { | |||
1736 | size_t i, n; | |||
1737 | struct vm_mem_range *vmr; | |||
1738 | ||||
1739 | /* Find the first vm_mem_range that contains gpa */ | |||
1740 | for (i = 0; i < vcp->vcp_nmemranges; i++) { | |||
1741 | vmr = &vcp->vcp_memranges[i]; | |||
1742 | if (vmr->vmr_gpa + vmr->vmr_size >= gpa) | |||
1743 | break; | |||
1744 | } | |||
1745 | ||||
1746 | /* No range found. */ | |||
1747 | if (i == vcp->vcp_nmemranges) | |||
1748 | return (NULL((void*)0)); | |||
1749 | ||||
1750 | /* | |||
1751 | * vmr may cover the range [gpa, gpa + len) only partly. Make | |||
1752 | * sure that the following vm_mem_ranges are contiguous and | |||
1753 | * cover the rest. | |||
1754 | */ | |||
1755 | n = vmr->vmr_size - (gpa - vmr->vmr_gpa); | |||
1756 | if (len < n) | |||
1757 | len = 0; | |||
1758 | else | |||
1759 | len -= n; | |||
1760 | gpa = vmr->vmr_gpa + vmr->vmr_size; | |||
1761 | for (i = i + 1; len != 0 && i < vcp->vcp_nmemranges; i++) { | |||
1762 | vmr = &vcp->vcp_memranges[i]; | |||
1763 | if (gpa != vmr->vmr_gpa) | |||
1764 | return (NULL((void*)0)); | |||
1765 | if (len <= vmr->vmr_size) | |||
1766 | len = 0; | |||
1767 | else | |||
1768 | len -= vmr->vmr_size; | |||
1769 | ||||
1770 | gpa = vmr->vmr_gpa + vmr->vmr_size; | |||
1771 | } | |||
1772 | ||||
1773 | if (len != 0) | |||
1774 | return (NULL((void*)0)); | |||
1775 | ||||
1776 | return (vmr); | |||
1777 | } | |||
1778 | ||||
1779 | /* | |||
1780 | * write_mem | |||
1781 | * | |||
1782 | * Copies data from 'buf' into the guest VM's memory at paddr 'dst'. | |||
1783 | * | |||
1784 | * Parameters: | |||
1785 | * dst: the destination paddr_t in the guest VM | |||
1786 | * buf: data to copy (or NULL to zero the data) | |||
1787 | * len: number of bytes to copy | |||
1788 | * | |||
1789 | * Return values: | |||
1790 | * 0: success | |||
1791 | * EINVAL: if the guest physical memory range [dst, dst + len) does not | |||
1792 | * exist in the guest. | |||
1793 | */ | |||
1794 | int | |||
1795 | write_mem(paddr_t dst, const void *buf, size_t len) | |||
1796 | { | |||
1797 | const char *from = buf; | |||
1798 | char *to; | |||
1799 | size_t n, off; | |||
1800 | struct vm_mem_range *vmr; | |||
1801 | ||||
1802 | vmr = find_gpa_range(¤t_vm->vm_params.vmc_params, dst, len); | |||
1803 | if (vmr == NULL((void*)0)) { | |||
1804 | errno(*__errno()) = EINVAL22; | |||
1805 | log_warn("%s: failed - invalid memory range dst = 0x%lx, " | |||
1806 | "len = 0x%zx", __func__, dst, len); | |||
1807 | return (EINVAL22); | |||
1808 | } | |||
1809 | ||||
1810 | off = dst - vmr->vmr_gpa; | |||
1811 | while (len != 0) { | |||
1812 | n = vmr->vmr_size - off; | |||
1813 | if (len < n) | |||
1814 | n = len; | |||
1815 | ||||
1816 | to = (char *)vmr->vmr_va + off; | |||
1817 | if (buf == NULL((void*)0)) | |||
1818 | memset(to, 0, n); | |||
1819 | else { | |||
1820 | memcpy(to, from, n); | |||
1821 | from += n; | |||
1822 | } | |||
1823 | len -= n; | |||
1824 | off = 0; | |||
1825 | vmr++; | |||
1826 | } | |||
1827 | ||||
1828 | return (0); | |||
1829 | } | |||
1830 | ||||
1831 | /* | |||
1832 | * read_mem | |||
1833 | * | |||
1834 | * Reads memory at guest paddr 'src' into 'buf'. | |||
1835 | * | |||
1836 | * Parameters: | |||
1837 | * src: the source paddr_t in the guest VM to read from. | |||
1838 | * buf: destination (local) buffer | |||
1839 | * len: number of bytes to read | |||
1840 | * | |||
1841 | * Return values: | |||
1842 | * 0: success | |||
1843 | * EINVAL: if the guest physical memory range [dst, dst + len) does not | |||
1844 | * exist in the guest. | |||
1845 | */ | |||
1846 | int | |||
1847 | read_mem(paddr_t src, void *buf, size_t len) | |||
1848 | { | |||
1849 | char *from, *to = buf; | |||
1850 | size_t n, off; | |||
1851 | struct vm_mem_range *vmr; | |||
1852 | ||||
1853 | vmr = find_gpa_range(¤t_vm->vm_params.vmc_params, src, len); | |||
1854 | if (vmr == NULL((void*)0)) { | |||
1855 | errno(*__errno()) = EINVAL22; | |||
1856 | log_warn("%s: failed - invalid memory range src = 0x%lx, " | |||
1857 | "len = 0x%zx", __func__, src, len); | |||
1858 | return (EINVAL22); | |||
1859 | } | |||
1860 | ||||
1861 | off = src - vmr->vmr_gpa; | |||
1862 | while (len != 0) { | |||
1863 | n = vmr->vmr_size - off; | |||
1864 | if (len < n) | |||
1865 | n = len; | |||
1866 | ||||
1867 | from = (char *)vmr->vmr_va + off; | |||
1868 | memcpy(to, from, n); | |||
1869 | ||||
1870 | to += n; | |||
1871 | len -= n; | |||
1872 | off = 0; | |||
1873 | vmr++; | |||
1874 | } | |||
1875 | ||||
1876 | return (0); | |||
1877 | } | |||
1878 | ||||
1879 | /* | |||
1880 | * vcpu_assert_pic_irq | |||
1881 | * | |||
1882 | * Injects the specified IRQ on the supplied vcpu/vm | |||
1883 | * | |||
1884 | * Parameters: | |||
1885 | * vm_id: VM ID to inject to | |||
1886 | * vcpu_id: VCPU ID to inject to | |||
1887 | * irq: IRQ to inject | |||
1888 | */ | |||
1889 | void | |||
1890 | vcpu_assert_pic_irq(uint32_t vm_id, uint32_t vcpu_id, int irq) | |||
1891 | { | |||
1892 | int ret; | |||
1893 | ||||
1894 | i8259_assert_irq(irq); | |||
1895 | ||||
1896 | if (i8259_is_pending()) { | |||
1897 | if (vcpu_pic_intr(vm_id, vcpu_id, 1)) | |||
1898 | fatalx("%s: can't assert INTR", __func__); | |||
1899 | ||||
1900 | ret = pthread_mutex_lock(&vcpu_run_mtx[vcpu_id]); | |||
1901 | if (ret) | |||
1902 | fatalx("%s: can't lock vcpu mtx (%d)", __func__, ret); | |||
1903 | ||||
1904 | vcpu_hlt[vcpu_id] = 0; | |||
1905 | ret = pthread_cond_signal(&vcpu_run_cond[vcpu_id]); | |||
1906 | if (ret) | |||
1907 | fatalx("%s: can't signal (%d)", __func__, ret); | |||
1908 | ret = pthread_mutex_unlock(&vcpu_run_mtx[vcpu_id]); | |||
1909 | if (ret) | |||
1910 | fatalx("%s: can't unlock vcpu mtx (%d)", __func__, ret); | |||
1911 | } | |||
1912 | } | |||
1913 | ||||
/*
 * vcpu_deassert_pic_irq
 *
 * Clears the specified IRQ on the emulated i8259 and, once the PIC has
 * nothing pending any more, drops INTR for the vcpu. Failure to drop INTR
 * is fatal.
 *
 * Parameters:
 *  vm_id: VM ID to clear in
 *  vcpu_id: VCPU ID to clear in
 *  irq: IRQ to clear
 */
void
vcpu_deassert_pic_irq(uint32_t vm_id, uint32_t vcpu_id, int irq)
{
	i8259_deassert_irq(irq);

	/* Another interrupt is still pending, so INTR has to stay up. */
	if (i8259_is_pending())
		return;

	if (vcpu_pic_intr(vm_id, vcpu_id, 0))
		fatalx("%s: can't deassert INTR for vm_id %d, "
		    "vcpu_id %d", __func__, vm_id, vcpu_id);
}
1935 | ||||
/*
 * fd_hasdata
 *
 * Determines if data can be read from a file descriptor, using a
 * zero-timeout poll(2) so the call never blocks.
 *
 * Parameters:
 *  fd: the fd to check
 *
 * Return values:
 *  1 if data can be read from an fd, or 0 otherwise (including when
 *  poll(2) itself fails, which is logged).
 */
int
fd_hasdata(int fd)
{
	struct pollfd pfd;
	int nready;

	pfd.fd = fd;
	pfd.events = POLLIN;

	nready = poll(&pfd, 1, 0);
	if (nready == -1) {
		log_warn("checking file descriptor for data failed");
		return (0);
	}

	if (nready == 1 && (pfd.revents & POLLIN) != 0)
		return (1);

	return (0);
}
1962 | ||||
/*
 * mutex_lock
 *
 * pthread_mutex_lock wrapper that terminates the process via fatal() when
 * the lock cannot be acquired. The pthread error code is stored in errno
 * before fatal() is called.
 */
void
mutex_lock(pthread_mutex_t *m)
{
	int error = pthread_mutex_lock(m);

	if (error == 0)
		return;

	errno = error;
	fatal("could not acquire mutex");
}
1980 | ||||
/*
 * mutex_unlock
 *
 * pthread_mutex_unlock wrapper that terminates the process via fatal()
 * when the lock cannot be released. The pthread error code is stored in
 * errno before fatal() is called.
 */
void
mutex_unlock(pthread_mutex_t *m)
{
	int error = pthread_mutex_unlock(m);

	if (error == 0)
		return;

	errno = error;
	fatal("could not release mutex");
}
1998 | ||||
1999 | /* | |||
2000 | * set_return_data | |||
2001 | * | |||
2002 | * Utility function for manipulating register data in vm exit info structs. This | |||
2003 | * function ensures that the data is copied to the vei->vei.vei_data field with | |||
2004 | * the proper size for the operation being performed. | |||
2005 | * | |||
2006 | * Parameters: | |||
2007 | * vei: exit information | |||
2008 | * data: return data | |||
2009 | */ | |||
2010 | void | |||
2011 | set_return_data(struct vm_exit *vei, uint32_t data) | |||
2012 | { | |||
2013 | switch (vei->vei.vei_size) { | |||
2014 | case 1: | |||
2015 | vei->vei.vei_data &= ~0xFF; | |||
2016 | vei->vei.vei_data |= (uint8_t)data; | |||
2017 | break; | |||
2018 | case 2: | |||
2019 | vei->vei.vei_data &= ~0xFFFF; | |||
2020 | vei->vei.vei_data |= (uint16_t)data; | |||
2021 | break; | |||
2022 | case 4: | |||
2023 | vei->vei.vei_data = data; | |||
2024 | break; | |||
2025 | } | |||
2026 | } | |||
2027 | ||||
2028 | /* | |||
2029 | * get_input_data | |||
2030 | * | |||
2031 | * Utility function for manipulating register data in vm exit info | |||
2032 | * structs. This function ensures that the data is copied from the | |||
2033 | * vei->vei.vei_data field with the proper size for the operation being | |||
2034 | * performed. | |||
2035 | * | |||
2036 | * Parameters: | |||
2037 | * vei: exit information | |||
2038 | * data: location to store the result | |||
2039 | */ | |||
2040 | void | |||
2041 | get_input_data(struct vm_exit *vei, uint32_t *data) | |||
2042 | { | |||
2043 | switch (vei->vei.vei_size) { | |||
2044 | case 1: | |||
2045 | *data &= 0xFFFFFF00; | |||
2046 | *data |= (uint8_t)vei->vei.vei_data; | |||
2047 | break; | |||
2048 | case 2: | |||
2049 | *data &= 0xFFFF0000; | |||
2050 | *data |= (uint16_t)vei->vei.vei_data; | |||
2051 | break; | |||
2052 | case 4: | |||
2053 | *data = vei->vei.vei_data; | |||
2054 | break; | |||
2055 | default: | |||
2056 | log_warnx("%s: invalid i/o size %d", __func__, | |||
2057 | vei->vei.vei_size); | |||
2058 | } | |||
2059 | ||||
2060 | } | |||
2061 | ||||
2062 | /* | |||
2063 | * translate_gva | |||
2064 | * | |||
2065 | * Translates a guest virtual address to a guest physical address by walking | |||
2066 | * the currently active page table (if needed). | |||
2067 | * | |||
2068 | * Note - this function can possibly alter the supplied VCPU state. | |||
2069 | * Specifically, it may inject exceptions depending on the current VCPU | |||
2070 | * configuration, and may alter %cr2 on #PF. Consequently, this function | |||
2071 | * should only be used as part of instruction emulation. | |||
2072 | * | |||
2073 | * Parameters: | |||
2074 | * exit: The VCPU this translation should be performed for (guest MMU settings | |||
2075 | * are gathered from this VCPU) | |||
2076 | * va: virtual address to translate | |||
2077 | * pa: pointer to paddr_t variable that will receive the translated physical | |||
2078 | * address. 'pa' is unchanged on error. | |||
2079 | * mode: one of PROT_READ, PROT_WRITE, PROT_EXEC indicating the mode in which | |||
2080 | * the address should be translated | |||
2081 | * | |||
2082 | * Return values: | |||
2083 | * 0: the address was successfully translated - 'pa' contains the physical | |||
2084 | * address currently mapped by 'va'. | |||
2085 | * EFAULT: the PTE for 'VA' is unmapped. A #PF will be injected in this case | |||
2086 | * and %cr2 set in the vcpu structure. | |||
2087 | * EINVAL: an error occurred reading paging table structures | |||
2088 | */ | |||
2089 | int | |||
2090 | translate_gva(struct vm_exit* exit, uint64_t va, uint64_t* pa, int mode) | |||
2091 | { | |||
2092 | int level, shift, pdidx; | |||
2093 | uint64_t pte, pt_paddr, pte_paddr, mask, low_mask, high_mask; | |||
2094 | uint64_t shift_width, pte_size; | |||
2095 | struct vcpu_reg_state *vrs; | |||
2096 | ||||
2097 | vrs = &exit->vrs; | |||
2098 | ||||
2099 | if (!pa) | |||
2100 | return (EINVAL22); | |||
2101 | ||||
2102 | if (!(vrs->vrs_crs[VCPU_REGS_CR00] & CR0_PG0x80000000)) { | |||
2103 | log_debug("%s: unpaged, va=pa=0x%llx", __func__, va); | |||
2104 | *pa = va; | |||
2105 | return (0); | |||
2106 | } | |||
2107 | ||||
2108 | pt_paddr = vrs->vrs_crs[VCPU_REGS_CR32]; | |||
2109 | ||||
2110 | log_debug("%s: guest %%cr0=0x%llx, %%cr3=0x%llx", __func__, | |||
2111 | vrs->vrs_crs[VCPU_REGS_CR00], vrs->vrs_crs[VCPU_REGS_CR32]); | |||
2112 | ||||
2113 | if (vrs->vrs_crs[VCPU_REGS_CR00] & CR0_PE0x00000001) { | |||
2114 | if (vrs->vrs_crs[VCPU_REGS_CR43] & CR4_PAE0x00000020) { | |||
2115 | pte_size = sizeof(uint64_t); | |||
2116 | shift_width = 9; | |||
2117 | ||||
2118 | if (vrs->vrs_msrs[VCPU_REGS_EFER0] & EFER_LMA0x00000400) { | |||
2119 | /* 4 level paging */ | |||
2120 | level = 4; | |||
2121 | mask = L4_MASK0x0000ff8000000000UL; | |||
2122 | shift = L4_SHIFT39; | |||
2123 | } else { | |||
2124 | /* 32 bit with PAE paging */ | |||
2125 | level = 3; | |||
2126 | mask = L3_MASK0x0000007fc0000000UL; | |||
2127 | shift = L3_SHIFT30; | |||
2128 | } | |||
2129 | } else { | |||
2130 | /* 32 bit paging */ | |||
2131 | level = 2; | |||
2132 | shift_width = 10; | |||
2133 | mask = 0xFFC00000; | |||
2134 | shift = 22; | |||
2135 | pte_size = sizeof(uint32_t); | |||
2136 | } | |||
2137 | } else | |||
2138 | return (EINVAL22); | |||
2139 | ||||
2140 | /* XXX: Check for R bit in segment selector and set A bit */ | |||
2141 | ||||
2142 | for (;level > 0; level--) { | |||
2143 | pdidx = (va & mask) >> shift; | |||
2144 | pte_paddr = (pt_paddr) + (pdidx * pte_size); | |||
2145 | ||||
2146 | log_debug("%s: read pte level %d @ GPA 0x%llx", __func__, | |||
2147 | level, pte_paddr); | |||
2148 | if (read_mem(pte_paddr, &pte, pte_size)) { | |||
2149 | log_warn("%s: failed to read pte", __func__); | |||
2150 | return (EFAULT14); | |||
2151 | } | |||
2152 | ||||
2153 | log_debug("%s: PTE @ 0x%llx = 0x%llx", __func__, pte_paddr, | |||
2154 | pte); | |||
2155 | ||||
2156 | /* XXX: Set CR2 */ | |||
2157 | if (!(pte & PG_V0x0000000000000001UL)) | |||
2158 | return (EFAULT14); | |||
2159 | ||||
2160 | /* XXX: Check for SMAP */ | |||
2161 | if ((mode == PROT_WRITE0x02) && !(pte & PG_RW0x0000000000000002UL)) | |||
2162 | return (EPERM1); | |||
2163 | ||||
2164 | if ((exit->cpl > 0) && !(pte & PG_u0x0000000000000004UL)) | |||
2165 | return (EPERM1); | |||
2166 | ||||
2167 | pte = pte | PG_U0x0000000000000020UL; | |||
2168 | if (mode == PROT_WRITE0x02) | |||
2169 | pte = pte | PG_M0x0000000000000040UL; | |||
2170 | if (write_mem(pte_paddr, &pte, pte_size)) { | |||
2171 | log_warn("%s: failed to write back flags to pte", | |||
2172 | __func__); | |||
2173 | return (EIO5); | |||
2174 | } | |||
2175 | ||||
2176 | /* XXX: EINVAL if in 32bit and PG_PS is 1 but CR4.PSE is 0 */ | |||
2177 | if (pte & PG_PS0x0000000000000080UL) | |||
2178 | break; | |||
2179 | ||||
2180 | if (level > 1) { | |||
2181 | pt_paddr = pte & PG_FRAME0x000ffffffffff000UL; | |||
2182 | shift -= shift_width; | |||
2183 | mask = mask >> shift_width; | |||
2184 | } | |||
2185 | } | |||
2186 | ||||
2187 | low_mask = (1 << shift) - 1; | |||
2188 | high_mask = (((uint64_t)1ULL << ((pte_size * 8) - 1)) - 1) ^ low_mask; | |||
2189 | *pa = (pte & high_mask) | (va & low_mask); | |||
2190 | ||||
2191 | log_debug("%s: final GPA for GVA 0x%llx = 0x%llx\n", __func__, va, *pa); | |||
2192 | ||||
2193 | return (0); | |||
2194 | } | |||
2195 | ||||
2196 | /* | |||
2197 | * vm_pipe_init | |||
2198 | * | |||
2199 | * Initialize a vm_dev_pipe, setting up its file descriptors and its | |||
2200 | * event structure with the given callback. | |||
2201 | * | |||
2202 | * Parameters: | |||
2203 | * p: pointer to vm_dev_pipe struct to initizlize | |||
2204 | * cb: callback to use for READ events on the read end of the pipe | |||
2205 | */ | |||
2206 | void | |||
2207 | vm_pipe_init(struct vm_dev_pipe *p, void (*cb)(int, short, void *)) | |||
2208 | { | |||
2209 | int ret; | |||
2210 | int fds[2]; | |||
2211 | ||||
2212 | memset(p, 0, sizeof(struct vm_dev_pipe)); | |||
2213 | ||||
2214 | ret = pipe(fds); | |||
2215 | if (ret) | |||
2216 | fatal("failed to create vm_dev_pipe pipe"); | |||
2217 | ||||
2218 | p->read = fds[0]; | |||
2219 | p->write = fds[1]; | |||
2220 | ||||
2221 | event_set(&p->read_ev, p->read, EV_READ0x02 | EV_PERSIST0x10, cb, NULL((void*)0)); | |||
2222 | } | |||
2223 | ||||
2224 | /* | |||
2225 | * vm_pipe_send | |||
2226 | * | |||
2227 | * Send a message to an emulated device vie the provided vm_dev_pipe. | |||
2228 | * | |||
2229 | * Parameters: | |||
2230 | * p: pointer to initialized vm_dev_pipe | |||
2231 | * msg: message to send in the channel | |||
2232 | */ | |||
2233 | void | |||
2234 | vm_pipe_send(struct vm_dev_pipe *p, enum pipe_msg_type msg) | |||
2235 | { | |||
2236 | size_t n; | |||
2237 | n = write(p->write, &msg, sizeof(msg)); | |||
2238 | if (n != sizeof(msg)) | |||
2239 | fatal("failed to write to device pipe"); | |||
2240 | } | |||
2241 | ||||
2242 | /* | |||
2243 | * vm_pipe_recv | |||
2244 | * | |||
2245 | * Receive a message for an emulated device via the provided vm_dev_pipe. | |||
2246 | * Returns the message value, otherwise will exit on failure. | |||
2247 | * | |||
2248 | * Parameters: | |||
2249 | * p: pointer to initialized vm_dev_pipe | |||
2250 | * | |||
2251 | * Return values: | |||
2252 | * a value of enum pipe_msg_type or fatal exit on read(2) error | |||
2253 | */ | |||
2254 | enum pipe_msg_type | |||
2255 | vm_pipe_recv(struct vm_dev_pipe *p) | |||
2256 | { | |||
2257 | size_t n; | |||
2258 | enum pipe_msg_type msg; | |||
2259 | n = read(p->read, &msg, sizeof(msg)); | |||
2260 | if (n != sizeof(msg)) | |||
2261 | fatal("failed to read from device pipe"); | |||
2262 | ||||
2263 | return msg; | |||
2264 | } |