Bug Summary

File: arch/amd64/amd64/pmap.c
Warning: line 2087, column 3
Value stored to 'level' is never read

Annotated Source Code


clang -cc1 -cc1 -triple amd64-unknown-openbsd7.4 -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name pmap.c -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -mrelocation-model static -mframe-pointer=all -relaxed-aliasing -ffp-contract=on -fno-rounding-math -mconstructor-aliases -ffreestanding -mcmodel=kernel -target-cpu x86-64 -target-feature +retpoline-indirect-calls -target-feature +retpoline-indirect-branches -target-feature -sse2 -target-feature -sse -target-feature -3dnow -target-feature -mmx -target-feature +save-args -target-feature +retpoline-external-thunk -disable-red-zone -no-implicit-float -tune-cpu generic -debugger-tuning=gdb -fcoverage-compilation-dir=/usr/src/sys/arch/amd64/compile/GENERIC.MP/obj -nostdsysteminc -nobuiltininc -resource-dir /usr/local/llvm16/lib/clang/16 -I /usr/src/sys -I /usr/src/sys/arch/amd64/compile/GENERIC.MP/obj -I /usr/src/sys/arch -I /usr/src/sys/dev/pci/drm/include -I /usr/src/sys/dev/pci/drm/include/uapi -I /usr/src/sys/dev/pci/drm/amd/include/asic_reg -I /usr/src/sys/dev/pci/drm/amd/include -I /usr/src/sys/dev/pci/drm/amd/amdgpu -I /usr/src/sys/dev/pci/drm/amd/display -I /usr/src/sys/dev/pci/drm/amd/display/include -I /usr/src/sys/dev/pci/drm/amd/display/dc -I /usr/src/sys/dev/pci/drm/amd/display/amdgpu_dm -I /usr/src/sys/dev/pci/drm/amd/pm/inc -I /usr/src/sys/dev/pci/drm/amd/pm/legacy-dpm -I /usr/src/sys/dev/pci/drm/amd/pm/swsmu -I /usr/src/sys/dev/pci/drm/amd/pm/swsmu/inc -I /usr/src/sys/dev/pci/drm/amd/pm/swsmu/smu11 -I /usr/src/sys/dev/pci/drm/amd/pm/swsmu/smu12 -I /usr/src/sys/dev/pci/drm/amd/pm/swsmu/smu13 -I /usr/src/sys/dev/pci/drm/amd/pm/powerplay/inc -I /usr/src/sys/dev/pci/drm/amd/pm/powerplay/hwmgr -I /usr/src/sys/dev/pci/drm/amd/pm/powerplay/smumgr -I /usr/src/sys/dev/pci/drm/amd/pm/swsmu/inc -I /usr/src/sys/dev/pci/drm/amd/pm/swsmu/inc/pmfw_if -I /usr/src/sys/dev/pci/drm/amd/display/dc/inc -I /usr/src/sys/dev/pci/drm/amd/display/dc/inc/hw -I /usr/src/sys/dev/pci/drm/amd/display/dc/clk_mgr -I /usr/src/sys/dev/pci/drm/amd/display/modules/inc -I /usr/src/sys/dev/pci/drm/amd/display/modules/hdcp -I /usr/src/sys/dev/pci/drm/amd/display/dmub/inc -I /usr/src/sys/dev/pci/drm/i915 -D DDB -D DIAGNOSTIC -D KTRACE -D ACCOUNTING -D KMEMSTATS -D PTRACE -D POOL_DEBUG -D CRYPTO -D SYSVMSG -D SYSVSEM -D SYSVSHM -D UVM_SWAP_ENCRYPT -D FFS -D FFS2 -D FFS_SOFTUPDATES -D UFS_DIRHASH -D QUOTA -D EXT2FS -D MFS -D NFSCLIENT -D NFSSERVER -D CD9660 -D UDF -D MSDOSFS -D FIFO -D FUSE -D SOCKET_SPLICE -D TCP_ECN -D TCP_SIGNATURE -D INET6 -D IPSEC -D PPP_BSDCOMP -D PPP_DEFLATE -D PIPEX -D MROUTING -D MPLS -D BOOT_CONFIG -D USER_PCICONF -D APERTURE -D MTRR -D NTFS -D SUSPEND -D HIBERNATE -D PCIVERBOSE -D USBVERBOSE -D WSDISPLAY_COMPAT_USL -D WSDISPLAY_COMPAT_RAWKBD -D WSDISPLAY_DEFAULTSCREENS=6 -D X86EMU -D ONEWIREVERBOSE -D MULTIPROCESSOR -D MAXUSERS=80 -D _KERNEL -O2 -Wno-pointer-sign -Wno-address-of-packed-member -Wno-constant-conversion -Wno-unused-but-set-variable -Wno-gnu-folding-constant 
-fdebug-compilation-dir=/usr/src/sys/arch/amd64/compile/GENERIC.MP/obj -ferror-limit 19 -fwrapv -D_RET_PROTECTOR -ret-protector -fcf-protection=branch -fgnuc-version=4.2.1 -vectorize-loops -vectorize-slp -fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc -fno-builtin-valloc -fno-builtin-free -fno-builtin-strdup -fno-builtin-strndup -analyzer-output=html -faddrsig -o /home/ben/Projects/scan/2024-01-11-110808-61670-1 -x c /usr/src/sys/arch/amd64/amd64/pmap.c
1/* $OpenBSD: pmap.c,v 1.165 2023/12/29 13:23:27 jca Exp $ */
2/* $NetBSD: pmap.c,v 1.3 2003/05/08 18:13:13 thorpej Exp $ */
3
4/*
5 * Copyright (c) 1997 Charles D. Cranor and Washington University.
6 * All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28
29/*
30 * Copyright 2001 (c) Wasabi Systems, Inc.
31 * All rights reserved.
32 *
33 * Written by Frank van der Linden for Wasabi Systems, Inc.
34 *
35 * Redistribution and use in source and binary forms, with or without
36 * modification, are permitted provided that the following conditions
37 * are met:
38 * 1. Redistributions of source code must retain the above copyright
39 * notice, this list of conditions and the following disclaimer.
40 * 2. Redistributions in binary form must reproduce the above copyright
41 * notice, this list of conditions and the following disclaimer in the
42 * documentation and/or other materials provided with the distribution.
43 * 3. All advertising materials mentioning features or use of this software
44 * must display the following acknowledgement:
45 * This product includes software developed for the NetBSD Project by
46 * Wasabi Systems, Inc.
47 * 4. The name of Wasabi Systems, Inc. may not be used to endorse
48 * or promote products derived from this software without specific prior
49 * written permission.
50 *
51 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
52 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
53 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
54 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC
55 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
56 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
57 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
58 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
59 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
60 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
61 * POSSIBILITY OF SUCH DAMAGE.
62 */
63
64/*
65 * This is the i386 pmap modified and generalized to support x86-64
66 * as well. The idea is to hide the upper N levels of the page tables
67 * inside pmap_get_ptp, pmap_free_ptp and pmap_growkernel. The rest
68 * is mostly untouched, except that it uses some more generalized
69 * macros and interfaces.
70 *
71 * This pmap has been tested on the i386 as well, and it can be easily
72 * adapted to PAE.
73 *
74 * fvdl@wasabisystems.com 18-Jun-2001
75 */
76
77/*
78 * pmap.c: i386 pmap module rewrite
79 * Chuck Cranor <chuck@ccrc.wustl.edu>
80 * 11-Aug-97
81 *
82 * history of this pmap module: in addition to my own input, i used
83 * the following references for this rewrite of the i386 pmap:
84 *
85 * [1] the NetBSD i386 pmap. this pmap appears to be based on the
86 * BSD hp300 pmap done by Mike Hibler at University of Utah.
87 * it was then ported to the i386 by William Jolitz of UUNET
88 * Technologies, Inc. Then Charles M. Hannum of the NetBSD
89 * project fixed some bugs and provided some speed ups.
90 *
91 * [2] the FreeBSD i386 pmap. this pmap seems to be the
92 * Hibler/Jolitz pmap, as modified for FreeBSD by John S. Dyson
93 * and David Greenman.
94 *
95 * [3] the Mach pmap. this pmap, from CMU, seems to have migrated
96 * between several processors. the VAX version was done by
97 * Avadis Tevanian, Jr., and Michael Wayne Young. the i386
98 * version was done by Lance Berc, Mike Kupfer, Bob Baron,
99 * David Golub, and Richard Draves. the alpha version was
100 * done by Alessandro Forin (CMU/Mach) and Chris Demetriou
101 * (NetBSD/alpha).
102 */
103
104#include <sys/param.h>
105#include <sys/systm.h>
106#include <sys/atomic.h>
107#include <sys/proc.h>
108#include <sys/pool.h>
109#include <sys/user.h>
110#include <sys/mutex.h>
111
112#include <uvm/uvm.h>
113
114#include <machine/cpu.h>
115#ifdef MULTIPROCESSOR
116#include <machine/i82489reg.h>
117#include <machine/i82489var.h>
118#endif
119
120#include "vmm.h"
121
122#if NVMM > 0
123#include <machine/vmmvar.h>
124#endif /* NVMM > 0 */
125
126#include "acpi.h"
127
128/* #define PMAP_DEBUG */
129
130#ifdef PMAP_DEBUG
131#define DPRINTF(x...) do { printf(x); } while(0)
132#else
133#define DPRINTF(x...)
134#endif /* PMAP_DEBUG */
135
136
137/*
138 * general info:
139 *
140 * - for an explanation of how the i386 MMU hardware works see
141 * the comments in <machine/pte.h>.
142 *
143 * - for an explanation of the general memory structure used by
144 * this pmap (including the recursive mapping), see the comments
145 * in <machine/pmap.h>.
146 *
147 * this file contains the code for the "pmap module." the module's
148 * job is to manage the hardware's virtual to physical address mappings.
149 * note that there are two levels of mapping in the VM system:
150 *
151 * [1] the upper layer of the VM system uses vm_map's and vm_map_entry's
152 * to map ranges of virtual address space to objects/files. for
153 * example, the vm_map may say: "map VA 0x1000 to 0x22000 read-only
154 * to the file /bin/ls starting at offset zero." note that
155 * the upper layer mapping is not concerned with how individual
156 * vm_pages are mapped.
157 *
158 * [2] the lower layer of the VM system (the pmap) maintains the mappings
159 * from virtual addresses. it is concerned with which vm_page is
160 * mapped where. for example, when you run /bin/ls and start
161 * at page 0x1000 the fault routine may lookup the correct page
162 * of the /bin/ls file and then ask the pmap layer to establish
163 * a mapping for it.
164 *
165 * note that information in the lower layer of the VM system can be
166 * thrown away since it can easily be reconstructed from the info
167 * in the upper layer.
168 *
169 * data structures we use include:
170 * - struct pmap: describes the address space of one process
171 * - struct pv_entry: describes one <PMAP,VA> mapping of a PA
172 * - struct pg_to_free: a list of virtual addresses whose mappings
173 * have been changed. used for TLB flushing.
174 */
175
176/*
177 * memory allocation
178 *
179 * - there are three data structures that we must dynamically allocate:
180 *
181 * [A] new process' page directory page (PDP)
182 * - plan 1: done at pmap_create() we use
183 * pool_get(&pmap_pmap_pool, PR_WAITOK) to do this allocation.
184 *
185 * if we are low in free physical memory then we sleep in
186 * pool_get() -- in this case this is ok since we are creating
187 * a new pmap and should not be holding any locks.
188 *
189 * XXX: the fork code currently has no way to return an "out of
190 * memory, try again" error code since uvm_fork [fka vm_fork]
191 * is a void function.
192 *
193 * [B] new page tables pages (PTP)
194 * call uvm_pagealloc()
195 * => success: zero page, add to pm_pdir
196 * => failure: we are out of free vm_pages, let pmap_enter()
197 * tell UVM about it.
198 *
199 * note: for kernel PTPs, we start with NKPTP of them. as we map
200 * kernel memory (at uvm_map time) we check to see if we've grown
201 * the kernel pmap. if so, we call the optional function
202 * pmap_growkernel() to grow the kernel PTPs in advance.
203 *
204 * [C] pv_entry structures
205 * - try to allocate one from the pool.
206 * If we fail, we simply let pmap_enter() tell UVM about it.
207 */
208
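
The failure handling in case [C] above is the usual "allocate if possible, otherwise report back and let the caller retry" pattern. A minimal standalone sketch of it, not part of pmap.c, with hypothetical fake_* names and malloc(3) standing in for pool_get(9):

#include <errno.h>
#include <stdlib.h>

struct fake_pv_entry { struct fake_pv_entry *pv_next; };

/* mirrors case [C]: on failure the caller decides whether to wait and retry */
static int
fake_enter(struct fake_pv_entry **out)
{
	struct fake_pv_entry *pve = malloc(sizeof(*pve));

	if (pve == NULL)
		return ENOMEM;
	pve->pv_next = NULL;
	*out = pve;
	return 0;
}

int
main(void)
{
	struct fake_pv_entry *pve;

	return fake_enter(&pve);	/* 0 on success, ENOMEM when out of memory */
}
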
209long nkptp[] = NKPTP_INITIALIZER;
210
211const vaddr_t ptp_masks[] = PTP_MASK_INITIALIZER;
212const int ptp_shifts[] = PTP_SHIFT_INITIALIZER;
213const long nkptpmax[] = NKPTPMAX_INITIALIZER;
214const long nbpd[] = NBPD_INITIALIZER;
215pd_entry_t *const normal_pdes[] = PDES_INITIALIZER;
216
217#define pmap_pte_set(p, n)		atomic_swap_64(p, n)
218#define pmap_pte_clearbits(p, b)	x86_atomic_clearbits_u64(p, b)
219#define pmap_pte_setbits(p, b)		x86_atomic_setbits_u64(p, b)
220
221/*
222 * global data structures
223 */
224
225struct pmap kernel_pmap_store; /* the kernel's pmap (proc0) */
226
227/*
228 * pg_nx: NX PTE bit (if CPU supports)
229 * pg_g_kern: PG_G if global pages should be used in kernel mappings,
230 * 0 otherwise (for insecure CPUs)
231 */
232pt_entry_t pg_nx = 0;
233pt_entry_t pg_g_kern = 0;
234
235/* pg_xo: XO PTE bits, set to PKU key1 (if cpu supports PKU) */
236pt_entry_t pg_xo;
237
238/*
239 * pmap_pg_wc: if our processor supports PAT then we set this
240 * to be the pte bits for Write Combining. Else we fall back to
241 * UC- so mtrrs can override the cacheability;
242 */
243int pmap_pg_wc = PG_UCMINUS;
244
245/*
246 * pmap_use_pcid: nonzero if PCID use is enabled (currently we require INVPCID)
247 *
248 * The next three are zero unless and until PCID support is enabled so code
249 * can just 'or' them in as needed without tests.
250 * cr3_pcid: CR3_REUSE_PCID
251 * cr3_pcid_proc and cr3_pcid_temp: PCID_PROC and PCID_TEMP
252 */
253#if PCID_KERN != 0
254# error "pmap.c assumes PCID_KERN is zero"
255#endif
256int pmap_use_pcid;
257static u_int cr3_pcid_proc;
258static u_int cr3_pcid_temp;
259/* these two are accessed from locore.o */
260paddr_t cr3_reuse_pcid;
261paddr_t cr3_pcid_proc_intel;
262
263/*
264 * other data structures
265 */
266
267pt_entry_t protection_codes[8]; /* maps MI prot to i386 prot code */
268int pmap_initialized = 0; /* pmap_init done yet? */
269
270/*
271 * pv management structures.
272 */
273struct pool pmap_pv_pool;
274
275/*
276 * linked list of all non-kernel pmaps
277 */
278
279struct pmap_head pmaps;
280struct mutex pmaps_lock = MUTEX_INITIALIZER(IPL_VM);
281
282/*
283 * pool that pmap structures are allocated from
284 */
285
286struct pool pmap_pmap_pool;
287
288/*
289 * When we're freeing a ptp, we need to delay the freeing until all
290 * tlb shootdown has been done. This is the list of the to-be-freed pages.
291 */
292TAILQ_HEAD(pg_to_free, vm_page);
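
A standalone sketch of that "queue now, free only after the TLB shootdown has completed" pattern, not part of pmap.c; the sk_* names are hypothetical, a fixed array stands in for the TAILQ and free(3) for uvm_pagefree(9):

#include <stdlib.h>

#define SK_MAXPEND	8

struct sk_pending {
	void	*pages[SK_MAXPEND];
	int	 n;
};

static void
sk_defer_free(struct sk_pending *q, void *pg)
{
	/* the page is unmapped, but stale TLB entries may still reference it */
	if (q->n < SK_MAXPEND)
		q->pages[q->n++] = pg;
}

static void
sk_shootdown_done(struct sk_pending *q)
{
	int i;

	/* only now is it safe to recycle the pages */
	for (i = 0; i < q->n; i++)
		free(q->pages[i]);
	q->n = 0;
}

int
main(void)
{
	struct sk_pending q = { .n = 0 };

	sk_defer_free(&q, malloc(4096));
	sk_shootdown_done(&q);
	return 0;
}
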
293
294/*
295 * pool that PDPs are allocated from
296 */
297
298struct pool pmap_pdp_pool;
299void pmap_pdp_ctor(pd_entry_t *);
300void pmap_pdp_ctor_intel(pd_entry_t *);
301
302extern vaddr_t msgbuf_vaddr;
303extern paddr_t msgbuf_paddr;
304
305extern vaddr_t idt_vaddr; /* we allocate IDT early */
306extern paddr_t idt_paddr;
307
308extern vaddr_t lo32_vaddr;
309extern vaddr_t lo32_paddr;
310
311vaddr_t virtual_avail;
312extern int end;
313
314/*
315 * local prototypes
316 */
317
318void pmap_enter_pv(struct vm_page *, struct pv_entry *, struct pmap *,
319 vaddr_t, struct vm_page *);
320struct vm_page *pmap_get_ptp(struct pmap *, vaddr_t);
321struct vm_page *pmap_find_ptp(struct pmap *, vaddr_t, paddr_t, int);
322int pmap_find_pte_direct(struct pmap *pm, vaddr_t va, pt_entry_t **pd, int *offs);
323void pmap_free_ptp(struct pmap *, struct vm_page *,
324 vaddr_t, struct pg_to_free *);
325void pmap_freepage(struct pmap *, struct vm_page *, int, struct pg_to_free *);
326#ifdef MULTIPROCESSOR
327static int pmap_is_active(struct pmap *, struct cpu_info *);
328#endif
329paddr_t pmap_map_ptes(struct pmap *);
330struct pv_entry *pmap_remove_pv(struct vm_page *, struct pmap *, vaddr_t);
331void pmap_do_remove(struct pmap *, vaddr_t, vaddr_t, int);
332void pmap_remove_ept(struct pmap *, vaddr_t, vaddr_t);
333void pmap_do_remove_ept(struct pmap *, vaddr_t);
334int pmap_enter_ept(struct pmap *, vaddr_t, paddr_t, vm_prot_t);
335int pmap_remove_pte(struct pmap *, struct vm_page *, pt_entry_t *,
336 vaddr_t, int, struct pv_entry **);
337void pmap_remove_ptes(struct pmap *, struct vm_page *, vaddr_t,
338 vaddr_t, vaddr_t, int, struct pv_entry **);
339#define PMAP_REMOVE_ALL		0	/* remove all mappings */
340#define PMAP_REMOVE_SKIPWIRED	1	/* skip wired mappings */
341
342void pmap_unmap_ptes(struct pmap *, paddr_t);
343int pmap_get_physpage(vaddr_t, int, paddr_t *);
344int pmap_pdes_valid(vaddr_t, pd_entry_t *);
345void pmap_alloc_level(vaddr_t, int, long *);
346
347static inline
348void pmap_sync_flags_pte(struct vm_page *, u_long);
349
350void pmap_tlb_shootpage(struct pmap *, vaddr_t, int);
351void pmap_tlb_shootrange(struct pmap *, vaddr_t, vaddr_t, int);
352void pmap_tlb_shoottlb(struct pmap *, int);
353#ifdef MULTIPROCESSOR
354void pmap_tlb_shootwait(void);
355#else
356#define pmap_tlb_shootwait() do { } while (0)
357#endif
358
359/*
360 * p m a p i n l i n e h e l p e r f u n c t i o n s
361 */
362
363/*
364 * pmap_is_curpmap: is this pmap the one currently loaded [in %cr3]?
365 * of course the kernel is always loaded
366 */
367
368static inline int
369pmap_is_curpmap(struct pmap *pmap)
370{
371 return((pmap == pmap_kernel()(&kernel_pmap_store)) ||
372 (pmap->pm_pdirpa == (rcr3() & CR3_PADDR0x7ffffffffffff000ULL)));
373}
374
375/*
376 * pmap_is_active: is this pmap loaded into the specified processor's %cr3?
377 */
378
379#ifdef MULTIPROCESSOR
380static inline int
381pmap_is_active(struct pmap *pmap, struct cpu_info *ci)
382{
383 return pmap == pmap_kernel()(&kernel_pmap_store) || pmap == ci->ci_proc_pmap;
384}
385#endif
386
387static inline u_int
388pmap_pte2flags(u_long pte)
389{
390 return (((pte & PG_U0x0000000000000020UL) ? PG_PMAP_REF0x02000000 : 0) |
391 ((pte & PG_M0x0000000000000040UL) ? PG_PMAP_MOD0x01000000 : 0));
392}
393
394static inline void
395pmap_sync_flags_pte(struct vm_page *pg, u_long pte)
396{
397 if (pte & (PG_U0x0000000000000020UL|PG_M0x0000000000000040UL)) {
398 atomic_setbits_intx86_atomic_setbits_u32(&pg->pg_flags, pmap_pte2flags(pte));
399 }
400}
401
402/*
403 * pmap_map_ptes: map a pmap's PTEs into KVM
404 *
405 * This should not be done for EPT pmaps
406 */
407paddr_t
408pmap_map_ptes(struct pmap *pmap)
409{
410 paddr_t cr3;
411
412 KASSERT(pmap->pm_type != PMAP_TYPE_EPT);
413
414 /* the kernel's pmap is always accessible */
415 if (pmap == pmap_kernel()(&kernel_pmap_store))
416 return 0;
417
418 /*
419 * Lock the target map before switching to its page tables to
420 * guarantee other CPUs have finished changing the tables before
421 * we potentially start caching table and TLB entries.
422 */
423 mtx_enter(&pmap->pm_mtx);
424
425 cr3 = rcr3();
426 KASSERT((cr3 & CR3_PCID) == PCID_KERN ||
427     (cr3 & CR3_PCID) == PCID_PROC);
428 if (pmap->pm_pdirpa == (cr3 & CR3_PADDR0x7ffffffffffff000ULL))
429 cr3 = 0;
430 else {
431 cr3 |= cr3_reuse_pcid;
432 lcr3(pmap->pm_pdirpa | cr3_pcid_temp);
433 }
434
435 return cr3;
436}
437
438void
439pmap_unmap_ptes(struct pmap *pmap, paddr_t save_cr3)
440{
441 if (pmap != pmap_kernel()(&kernel_pmap_store))
442 mtx_leave(&pmap->pm_mtx);
443
444 if (save_cr3 != 0)
445 lcr3(save_cr3);
446}
447
448int
449pmap_find_pte_direct(struct pmap *pm, vaddr_t va, pt_entry_t **pd, int *offs)
450{
451 u_long mask, shift;
452 pd_entry_t pde;
453 paddr_t pdpa;
454 int lev;
455
456 pdpa = pm->pm_pdirpa;
457 shift = L4_SHIFT39;
458 mask = L4_MASK0x0000ff8000000000UL;
459 for (lev = PTP_LEVELS4; lev > 0; lev--) {
460 *pd = (pd_entry_t *)PMAP_DIRECT_MAP(pdpa);
461 *offs = (VA_SIGN_POS(va)((va) & ~0xffff000000000000) & mask) >> shift;
462 pde = (*pd)[*offs];
463
464 /* Large pages are different, break early if we run into one. */
465 if ((pde & (PG_PS0x0000000000000080UL|PG_V0x0000000000000001UL)) != PG_V0x0000000000000001UL)
466 return (lev - 1);
467
468 pdpa = ((*pd)[*offs] & PG_FRAME0x000ffffffffff000UL);
469 /* 4096/8 == 512 == 2^9 entries per level */
470 shift -= 9;
471 mask >>= 9;
472 }
473
474 return (0);
475}
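
A self-contained worked example of the walk above: it prints the 9-bit index extracted at each of the four paging levels for an arbitrary, hypothetical canonical VA, starting from the same L4_SHIFT/L4_MASK values and narrowing by 9 bits per level exactly as the loop does:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t va = 0x00007f7fdeadb000ULL;	/* hypothetical user VA */
	uint64_t mask = 0x0000ff8000000000ULL;	/* L4_MASK */
	unsigned int shift = 39;		/* L4_SHIFT */
	int lev;

	for (lev = 4; lev > 0; lev--) {
		printf("level %d index = %llu\n", lev,
		    (unsigned long long)((va & mask) >> shift));
		shift -= 9;	/* 4096 / 8 == 512 == 2^9 entries per level */
		mask >>= 9;
	}
	return 0;
}
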
476
477/*
478 * p m a p k e n t e r f u n c t i o n s
479 *
480 * functions to quickly enter/remove pages from the kernel address
481 * space. pmap_kremove is exported to MI kernel. we make use of
482 * the recursive PTE mappings.
483 */
484
485/*
486 * pmap_kenter_pa: enter a kernel mapping without R/M (pv_entry) tracking
487 *
488 * => no need to lock anything, assume va is already allocated
489 * => should be faster than normal pmap enter function
490 */
491
492void
493pmap_kenter_pa(vaddr_t va, paddr_t pa, vm_prot_t prot)
494{
495 pt_entry_t *pte, opte, npte;
496
497 pte = kvtopte(va);
498
499 npte = (pa & PMAP_PA_MASK~((paddr_t)((1 << 12) - 1))) | ((prot & PROT_WRITE0x02) ? PG_RW0x0000000000000002UL : PG_RO0x0000000000000000UL) |
500 ((pa & PMAP_NOCACHE0x1) ? PG_N0x0000000000000010UL : 0) |
501 ((pa & PMAP_WC0x2) ? pmap_pg_wc : 0) | PG_V0x0000000000000001UL;
502
503 /* special 1:1 mappings in the first 2MB must not be global */
504 if (va >= (vaddr_t)NBPD_L2(1ULL << 21))
505 npte |= pg_g_kern;
506
507 if (!(prot & PROT_EXEC0x04))
508 npte |= pg_nx;
509 opte = pmap_pte_set(pte, npte)_atomic_swap_64((pte), (npte));
510#ifdef LARGEPAGES
511 /* XXX For now... */
512 if (opte & PG_PS0x0000000000000080UL)
513 panic("%s: PG_PS", __func__);
514#endif
515 if (pmap_valid_entry(opte)((opte) & 0x0000000000000001UL)) {
516 if (pa & PMAP_NOCACHE0x1 && (opte & PG_N0x0000000000000010UL) == 0)
517 wbinvd_on_all_cpus();
518 /* This shouldn't happen */
519 pmap_tlb_shootpage(pmap_kernel()(&kernel_pmap_store), va, 1);
520 pmap_tlb_shootwait();
521 }
522}
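
A standalone sketch of the PTE composition done above, with the relevant constants written out as literals; the sketch_/SK_ names are hypothetical and the pg_g_kern/pg_nx handling is left out:

#include <stdint.h>

#define SK_PG_V		0x0000000000000001ULL	/* valid */
#define SK_PG_RW	0x0000000000000002ULL	/* writable (PG_RO is 0) */
#define SK_PG_N		0x0000000000000010ULL	/* non-cacheable */
#define SK_PROT_WRITE	0x02
#define SK_PMAP_NOCACHE	0x1	/* flag carried in the low bits of pa */

static uint64_t
sketch_kenter_pte(uint64_t pa, int prot)
{
	uint64_t npte;

	npte = (pa & ~0xfffULL) | SK_PG_V;	/* page frame + valid */
	if (prot & SK_PROT_WRITE)
		npte |= SK_PG_RW;
	if (pa & SK_PMAP_NOCACHE)
		npte |= SK_PG_N;
	return npte;
}

int
main(void)
{
	return sketch_kenter_pte(0x1000, SK_PROT_WRITE) == 0x1003 ? 0 : 1;
}
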
523
524/*
525 * pmap_kremove: remove a kernel mapping(s) without R/M (pv_entry) tracking
526 *
527 * => no need to lock anything
528 * => caller must dispose of any vm_page mapped in the va range
529 * => note: not an inline function
530 * => we assume the va is page aligned and the len is a multiple of PAGE_SIZE
531 * => we assume kernel only unmaps valid addresses and thus don't bother
532 * checking the valid bit before doing TLB flushing
533 */
534
535void
536pmap_kremove(vaddr_t sva, vsize_t len)
537{
538 pt_entry_t *pte, opte;
539 vaddr_t va, eva;
540
541 eva = sva + len;
542
543 for (va = sva; va != eva; va += PAGE_SIZE(1 << 12)) {
544 pte = kvtopte(va);
545
546 opte = pmap_pte_set(pte, 0)_atomic_swap_64((pte), (0));
547#ifdef LARGEPAGES
548 KASSERT((opte & PG_PS) == 0);
549#endif
550 KASSERT((opte & PG_PVLIST) == 0);
551 }
552
553 pmap_tlb_shootrange(pmap_kernel()(&kernel_pmap_store), sva, eva, 1);
554 pmap_tlb_shootwait();
555}
556
557/*
558 * pmap_set_pml4_early
559 *
560 * Utility function to map 2GB of 2MB pages to 'pa'. The VA that is assigned
561 * is the pml4 entry for 'early mappings' (see pmap.h). This function is used
562 * by display drivers that need to map their framebuffers early, before the
563 * pmap is fully initialized (eg, to show panic messages).
564 *
565 * Users of this function must call pmap_clear_pml4_early to remove the
566 * mapping when finished.
567 *
568 * Parameters:
569 * pa: phys addr to map
570 *
571 * Return value:
572 * VA mapping to 'pa'. This mapping is 2GB in size and starts at the base
573 * of the 2MB region containing 'va'.
574 */
575vaddr_t
576pmap_set_pml4_early(paddr_t pa)
577{
578 extern paddr_t early_pte_pages;
579 pt_entry_t *pml4e, *pte;
580 int i, j, off;
581 paddr_t curpa;
582 vaddr_t va;
583
584 pml4e = (pt_entry_t *)(proc0.p_addr->u_pcb.pcb_cr3 + KERNBASE0xffffffff80000000);
585 pml4e[PDIR_SLOT_EARLY((511 - 4) - 1)] = (pd_entry_t)early_pte_pages | PG_V0x0000000000000001UL | PG_RW0x0000000000000002UL;
586
587 off = pa & PAGE_MASK_L2((1ULL << 21) - 1);
588 curpa = pa & L2_FRAME;
589
590 pte = (pt_entry_t *)PMAP_DIRECT_MAP(early_pte_pages);
591 memset(pte, 0, 3 * NBPG)__builtin_memset((pte), (0), (3 * (1 << 12)));
592
593 pte[0] = (early_pte_pages + NBPG(1 << 12)) | PG_V0x0000000000000001UL | PG_RW0x0000000000000002UL;
594 pte[1] = (early_pte_pages + 2 * NBPG(1 << 12)) | PG_V0x0000000000000001UL | PG_RW0x0000000000000002UL;
595
596 pte = (pt_entry_t *)PMAP_DIRECT_MAP(early_pte_pages + NBPG);
597 for (i = 0; i < 2; i++) {
598 /* 2 early pages of mappings */
599 for (j = 0; j < 512; j++) {
600 /* j[0..511] : 2MB mappings per page */
601 pte[(i * 512) + j] = curpa | PG_V0x0000000000000001UL | PG_RW0x0000000000000002UL | PG_PS0x0000000000000080UL;
602 curpa += (2 * 1024 * 1024);
603 }
604 }
605
606 va = (vaddr_t)((PDIR_SLOT_EARLY((511 - 4) - 1) * 512ULL) << L3_SHIFT30) + off;
607 return VA_SIGN_NEG(va)((va) | 0xffff000000000000);
608}
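
A standalone check of the sizing arithmetic behind the function above: two pages of 512 PDEs, each a 2MB (PG_PS) mapping, cover exactly the advertised 2GB:

#include <assert.h>
#include <stdint.h>

int
main(void)
{
	uint64_t pde_pages = 2;			/* early_pte_pages + NBPG, + 2*NBPG */
	uint64_t pdes_per_page = 512;		/* 4096 / sizeof(pd_entry_t) */
	uint64_t bytes_per_pde = 2ULL << 20;	/* one 2MB large page each */

	assert(pde_pages * pdes_per_page * bytes_per_pde == 2ULL << 30);
	return 0;
}
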
609
610/*
611 * pmap_clear_pml4_early
612 *
613 * Clears the mapping previously established with pmap_set_pml4_early.
614 */
615void
616pmap_clear_pml4_early(void)
617{
618 extern paddr_t early_pte_pages;
619 pt_entry_t *pml4e, *pte;
620
621 pte = (pt_entry_t *)PMAP_DIRECT_MAP(early_pte_pages);
622 memset(pte, 0, 3 * NBPG)__builtin_memset((pte), (0), (3 * (1 << 12)));
623
624 pml4e = (pd_entry_t *)pmap_kernel()(&kernel_pmap_store)->pm_pdir;
625 pml4e[PDIR_SLOT_EARLY((511 - 4) - 1)] = 0;
626 tlbflush();
627}
628
629/*
630 * p m a p i n i t f u n c t i o n s
631 *
632 * pmap_bootstrap and pmap_init are called during system startup
633 * to init the pmap module. pmap_bootstrap() does a low level
634 * init just to get things rolling. pmap_init() finishes the job.
635 */
636
637/*
638 * pmap_bootstrap: get the system in a state where it can run with VM
639 * properly enabled (called before main()). the VM system is
640 * fully init'd later...
641 */
642
643paddr_t
644pmap_bootstrap(paddr_t first_avail, paddr_t max_pa)
645{
646 vaddr_t kva_start = VM_MIN_KERNEL_ADDRESS0xffff800000000000;
647 struct pmap *kpm;
648 int curslot, i, j, p;
649 long ndmpdp;
650 paddr_t dmpd, dmpdp, start_cur, cur_pa;
651 vaddr_t kva, kva_end;
652 pt_entry_t *pml3, *pml2;
653
654 /*
655 * define the boundaries of the managed kernel virtual address
656 * space.
657 */
658
659 virtual_avail = kva_start; /* first free KVA */
660
661 /*
662 * If PKU is available, initialize PROT_EXEC entry correctly,
663 * and enable the feature before it gets used
664 * XXX Some Hypervisors forget to save/restore PKU
665 */
666 if (cpuid_level >= 0x7) {
667 uint32_t ecx, dummy;
668
669 CPUID_LEAF(0x7, 0, dummy, dummy, ecx, dummy);
670 if (ecx & SEFF0ECX_PKU0x00000008) {
671 lcr4(rcr4() | CR4_PKE0x00400000);
672 pg_xo = PG_XO0x0800000000000000UL;
673 }
674 }
675
676 /*
677 * set up protection_codes: we need to be able to convert from
678 * a MI protection code (some combo of VM_PROT...) to something
679 * we can jam into a i386 PTE.
680 */
681
682 protection_codes[PROT_NONE0x00] = pg_nx; /* --- */
683 protection_codes[PROT_EXEC0x04] = pg_xo; ; /* --x */
684 protection_codes[PROT_READ0x01] = PG_RO0x0000000000000000UL | pg_nx; /* -r- */
685 protection_codes[PROT_READ0x01 | PROT_EXEC0x04] = PG_RO0x0000000000000000UL; /* -rx */
686 protection_codes[PROT_WRITE0x02] = PG_RW0x0000000000000002UL | pg_nx; /* w-- */
687 protection_codes[PROT_WRITE0x02 | PROT_EXEC0x04] = PG_RW0x0000000000000002UL; /* w-x */
688 protection_codes[PROT_WRITE0x02 | PROT_READ0x01] = PG_RW0x0000000000000002UL | pg_nx; /* wr- */
689 protection_codes[PROT_READ0x01 | PROT_WRITE0x02 | PROT_EXEC0x04] = PG_RW0x0000000000000002UL; /* wrx */
690
691 /*
692 * now we init the kernel's pmap
693 *
694 * the kernel pmap's pm_obj is not used for much. however, in
695 * user pmaps the pm_obj contains the list of active PTPs.
696 * the pm_obj currently does not have a pager.
697 */
698
699 kpm = pmap_kernel()(&kernel_pmap_store);
700 for (i = 0; i < PTP_LEVELS4 - 1; i++) {
701 uvm_obj_init(&kpm->pm_obj[i], &pmap_pager, 1);
702 kpm->pm_ptphint[i] = NULL((void *)0);
703 }
704 memset(&kpm->pm_list, 0, sizeof(kpm->pm_list)); /* pm_list not used */
705 kpm->pm_pdir = (pd_entry_t *)(proc0.p_addr->u_pcb.pcb_cr3 + KERNBASE0xffffffff80000000);
706 kpm->pm_pdirpa = proc0.p_addr->u_pcb.pcb_cr3;
707 kpm->pm_stats.wired_count = kpm->pm_stats.resident_count =
708 atop(kva_start - VM_MIN_KERNEL_ADDRESS)((kva_start - 0xffff800000000000) >> 12);
709 /*
710 * the above is just a rough estimate and not critical to the proper
711 * operation of the system.
712 */
713
714 kpm->pm_type = PMAP_TYPE_NORMAL1;
715
716 curpcb->pcb_pmap = kpm; /* proc0's pcb */
717
718 /*
719 * Configure and enable PCID use if supported.
720 * Currently we require INVPCID support.
721 */
722 if ((cpu_ecxfeature & CPUIDECX_PCID0x00020000) && cpuid_level >= 0x07) {
723 uint32_t ebx, dummy;
724 CPUID_LEAF(0x7, 0, dummy, ebx, dummy, dummy);
725 if (ebx & SEFF0EBX_INVPCID0x00000400) {
726 pmap_use_pcid = 1;
727 /*
728 * We cannot use global mappings because
729 * invpcid function 0 does not invalidate global
730 * mappings. The hardware can cache kernel
731 * mappings based on PCID_KERN, i.e. there is no
732 * need for global mappings.
733 */
734 pg_g_kern = 0;
735 lcr4( rcr4() | CR4_PCIDE0x00020000 );
736 cr3_pcid_proc = PCID_PROC1;
737 cr3_pcid_temp = PCID_TEMP3;
738 cr3_reuse_pcid = CR3_REUSE_PCID(1ULL << 63);
739 cr3_pcid_proc_intel = PCID_PROC_INTEL2;
740 }
741 }
742
743 /*
744 * Add PG_G attribute to already mapped kernel pages. pg_g_kern
745 * is calculated in locore0.S and may be set to:
746 *
747 * 0 if this CPU does not safely support global pages in the kernel
748 * (Intel/Meltdown)
749 * PG_G if this CPU does safely support global pages in the kernel
750 * (AMD)
751 */
752#if KERNBASE == VM_MIN_KERNEL_ADDRESS
753 for (kva = VM_MIN_KERNEL_ADDRESS ; kva < virtual_avail ;
754#else
755 kva_end = roundup((vaddr_t)&end, PAGE_SIZE);
756 for (kva = KERNBASE; kva < kva_end ;
757#endif
758     kva += PAGE_SIZE) {
759 unsigned long p1i = pl1_i(kva);
760 if (pmap_valid_entry(PTE_BASE[p1i]))
761 PTE_BASE[p1i] |= pg_g_kern;
762 }
763
764 /*
765 * Map the direct map. The first 4GB were mapped in locore, here
766 * we map the rest if it exists. We actually use the direct map
767 * here to set up the page tables, we're assuming that we're still
768 * operating in the lower 4GB of memory.
769 *
770 * Map (up to) the first 512GB of physical memory first. This part
771 * is handled differently than physical memory > 512GB since we have
772 * already mapped part of this range in locore0.
773 */
774 ndmpdp = (max_pa + NBPD_L3(1ULL << 30) - 1) >> L3_SHIFT30;
775 if (ndmpdp < NDML2_ENTRIES4)
776 ndmpdp = NDML2_ENTRIES4; /* At least 4GB */
777 if (ndmpdp > 512)
778 ndmpdp = 512; /* At most 512GB */
779
780 dmpdp = kpm->pm_pdir[PDIR_SLOT_DIRECT(511 - 4)] & PG_FRAME0x000ffffffffff000UL;
781
782 dmpd = first_avail; first_avail += ndmpdp * PAGE_SIZE(1 << 12);
783
784 for (i = NDML2_ENTRIES4; i < NPDPG((1 << 12) / sizeof (pd_entry_t)) * ndmpdp; i++) {
785 paddr_t pdp;
786 vaddr_t va;
787
788 pdp = (paddr_t)&(((pd_entry_t *)dmpd)[i]);
789 va = PMAP_DIRECT_MAP(pdp);
790
791 *((pd_entry_t *)va) = ((paddr_t)i << L2_SHIFT21);
792 *((pd_entry_t *)va) |= PG_RW0x0000000000000002UL | PG_V0x0000000000000001UL | PG_PS0x0000000000000080UL | pg_g_kern | PG_U0x0000000000000020UL |
793 PG_M0x0000000000000040UL | pg_nx;
794 }
795
796 for (i = NDML2_ENTRIES4; i < ndmpdp; i++) {
797 paddr_t pdp;
798 vaddr_t va;
799
800 pdp = (paddr_t)&(((pd_entry_t *)dmpdp)[i]);
801 va = PMAP_DIRECT_MAP(pdp);
802
803 *((pd_entry_t *)va) = dmpd + (i << PAGE_SHIFT12);
804 *((pd_entry_t *)va) |= PG_RW0x0000000000000002UL | PG_V0x0000000000000001UL | PG_U0x0000000000000020UL | PG_M0x0000000000000040UL | pg_nx;
805 }
806
807 kpm->pm_pdir[PDIR_SLOT_DIRECT(511 - 4)] = dmpdp | PG_V0x0000000000000001UL | PG_KW0x0000000000000002UL | PG_U0x0000000000000020UL |
808 PG_M0x0000000000000040UL | pg_nx;
809
810 /* Map any remaining physical memory > 512GB */
811 for (curslot = 1 ; curslot < NUM_L4_SLOT_DIRECT4 ; curslot++) {
812 /*
813 * Start of current range starts at PA (curslot) * 512GB
814 */
815 start_cur = (paddr_t)(curslot * NBPD_L4(1ULL << 39));
816 if (max_pa > start_cur) {
817 /* Next 512GB, new PML4e and L3(512GB) page */
818 dmpd = first_avail; first_avail += PAGE_SIZE(1 << 12);
819 pml3 = (pt_entry_t *)PMAP_DIRECT_MAP(dmpd);
820 kpm->pm_pdir[PDIR_SLOT_DIRECT(511 - 4) + curslot] = dmpd |
821 PG_KW0x0000000000000002UL | PG_V0x0000000000000001UL | PG_U0x0000000000000020UL | PG_M0x0000000000000040UL | pg_nx;
822
823 /* Calculate full 1GB pages in this 512GB region */
824 p = ((max_pa - start_cur) >> L3_SHIFT30);
825
826 /* Check if a partial (<1GB) page remains */
827 if (max_pa & L2_MASK0x000000003fe00000UL)
828 p++;
829
830 /*
831 * Handle the case where this range is full and there
832 * is still more memory after (p would be > 512).
833 */
834 if (p > NPDPG((1 << 12) / sizeof (pd_entry_t)))
835 p = NPDPG((1 << 12) / sizeof (pd_entry_t));
836
837 /* Allocate 'p' L2(1GB) pages and populate */
838 for (i = 0; i < p; i++) {
839 dmpd = first_avail; first_avail += PAGE_SIZE(1 << 12);
840 pml2 = (pt_entry_t *)PMAP_DIRECT_MAP(dmpd);
841 pml3[i] = dmpd |
842 PG_RW0x0000000000000002UL | PG_V0x0000000000000001UL | PG_U0x0000000000000020UL | PG_M0x0000000000000040UL | pg_nx;
843
844 cur_pa = start_cur + (i << L3_SHIFT30);
845 j = 0;
846
847 while (cur_pa < max_pa && j < NPDPG((1 << 12) / sizeof (pd_entry_t))) {
848 pml2[j] = curslot * NBPD_L4(1ULL << 39) +
849 (uint64_t)i * NBPD_L3(1ULL << 30) +
850 (uint64_t)j * NBPD_L2(1ULL << 21);
851 pml2[j] |= PG_RW0x0000000000000002UL | PG_V0x0000000000000001UL | pg_g_kern |
852 PG_U0x0000000000000020UL | PG_M0x0000000000000040UL | pg_nx | PG_PS0x0000000000000080UL;
853 cur_pa += NBPD_L2(1ULL << 21);
854 j++;
855 }
856 }
857 }
858 }
859
860 tlbflush();
861
862 msgbuf_vaddr = virtual_avail;
863 virtual_avail += round_page(MSGBUFSIZE);
864
865 idt_vaddr = virtual_avail;
866 virtual_avail += 2 * PAGE_SIZE(1 << 12);
867 idt_paddr = first_avail; /* steal a page */
868 first_avail += 2 * PAGE_SIZE(1 << 12);
869
870#if defined(MULTIPROCESSOR) || \
871     (NACPI > 0 && !defined(SMALL_KERNEL))
872 /*
873 * Grab a page below 4G for things that need it (i.e.
874 * having an initial %cr3 for the MP trampoline).
875 */
876 lo32_vaddr = virtual_avail;
877 virtual_avail += PAGE_SIZE(1 << 12);
878 lo32_paddr = first_avail;
879 first_avail += PAGE_SIZE(1 << 12);
880#endif
881
882 /*
883 * init the global lists.
884 */
885 LIST_INIT(&pmaps);
886
887 /*
888 * initialize the pmap pools.
889 */
890
891 pool_init(&pmap_pmap_pool, sizeof(struct pmap), 0, IPL_VM0xa, 0,
892 "pmappl", NULL((void *)0));
893 pool_init(&pmap_pv_pool, sizeof(struct pv_entry), 0, IPL_VM0xa, 0,
894 "pvpl", &pool_allocator_single);
895 pool_sethiwat(&pmap_pv_pool, 32 * 1024);
896
897 /*
898 * initialize the PDE pool.
899 */
900
901 pool_init(&pmap_pdp_pool, PAGE_SIZE(1 << 12), 0, IPL_VM0xa, 0,
902 "pdppl", &pool_allocator_single);
903
904 kpm->pm_pdir_intel = NULL((void *)0);
905 kpm->pm_pdirpa_intel = 0;
906
907 /*
908 * ensure the TLB is sync'd with reality by flushing it...
909 */
910
911 tlbflush();
912
913 return first_avail;
914}
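
A standalone sketch of the direct-map sizing computed near the top of pmap_bootstrap(): ndmpdp is max_pa rounded up to 1GB units (NBPD_L3 == 1ULL << 30), then clamped to at least 4 and at most 512; sketch_ndmpdp is a hypothetical name:

#include <stdint.h>
#include <stdio.h>

static long
sketch_ndmpdp(uint64_t max_pa)
{
	long n = (max_pa + (1ULL << 30) - 1) >> 30;	/* round up to 1GB slots */

	if (n < 4)
		n = 4;		/* at least 4GB, already mapped in locore */
	if (n > 512)
		n = 512;	/* at most 512GB in the first direct-map slot */
	return n;
}

int
main(void)
{
	printf("%ld %ld %ld\n",
	    sketch_ndmpdp(16ULL << 30),	/* 16GB -> 16  */
	    sketch_ndmpdp(2ULL << 30),	/*  2GB -> 4   */
	    sketch_ndmpdp(1ULL << 40));	/*  1TB -> 512 */
	return 0;
}
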
915
916void
917pmap_init_percpu(void)
918{
919 pool_cache_init(&pmap_pv_pool);
920}
921
922/*
923 * pmap_randomize
924 *
925 * Randomizes the location of the kernel pmap
926 */
927void
928pmap_randomize(void)
929{
930 pd_entry_t *pml4va, *oldpml4va;
931 paddr_t pml4pa;
932 int i;
933
934 pml4va = km_alloc(PAGE_SIZE(1 << 12), &kv_page, &kp_zero, &kd_nowait);
935 if (pml4va == NULL((void *)0))
936 panic("%s: km_alloc failed", __func__);
937
938 /* Copy old PML4 page to new one */
939 oldpml4va = pmap_kernel()(&kernel_pmap_store)->pm_pdir;
940 memcpy(pml4va, oldpml4va, PAGE_SIZE)__builtin_memcpy((pml4va), (oldpml4va), ((1 << 12)));
941
942 /* Switch to new PML4 */
943 pmap_extract(pmap_kernel()(&kernel_pmap_store), (vaddr_t)pml4va, &pml4pa);
944 lcr3(pml4pa);
945
946 /* Fixup pmap_kernel and proc0's %cr3 */
947 pmap_kernel()(&kernel_pmap_store)->pm_pdirpa = pml4pa;
948 pmap_kernel()(&kernel_pmap_store)->pm_pdir = pml4va;
949 proc0.p_addr->u_pcb.pcb_cr3 = pml4pa;
950
951 /* Fixup recursive PTE PML4E slot. We are only changing the PA */
952 pml4va[PDIR_SLOT_PTE255] = pml4pa | (pml4va[PDIR_SLOT_PTE255] & ~PG_FRAME0x000ffffffffff000UL);
953
954 for (i = 0; i < NPDPG((1 << 12) / sizeof (pd_entry_t)); i++) {
955 /* PTE slot already handled earlier */
956 if (i == PDIR_SLOT_PTE255)
957 continue;
958
959 if (pml4va[i] & PG_FRAME0x000ffffffffff000UL)
960 pmap_randomize_level(&pml4va[i], 3);
961 }
962
963 /* Wipe out bootstrap PML4 */
964 memset(oldpml4va, 0, PAGE_SIZE)__builtin_memset((oldpml4va), (0), ((1 << 12)));
965 tlbflush();
966}
967
968void
969pmap_randomize_level(pd_entry_t *pde, int level)
970{
971 pd_entry_t *new_pd_va;
972 paddr_t old_pd_pa, new_pd_pa;
973 vaddr_t old_pd_va;
974 struct vm_page *pg;
975 int i;
976
977 if (level == 0)
978 return;
979
980 if (level < PTP_LEVELS4 - 1 && (*pde & PG_PS0x0000000000000080UL))
981 return;
982
983 new_pd_va = km_alloc(PAGE_SIZE(1 << 12), &kv_page, &kp_zero, &kd_nowait);
984 if (new_pd_va == NULL((void *)0))
985 panic("%s: cannot allocate page for L%d page directory",
986 __func__, level);
987
988 old_pd_pa = *pde & PG_FRAME0x000ffffffffff000UL;
989 old_pd_va = PMAP_DIRECT_MAP(old_pd_pa);
990 pmap_extract(pmap_kernel()(&kernel_pmap_store), (vaddr_t)new_pd_va, &new_pd_pa);
991 memcpy(new_pd_va, (void *)old_pd_va, PAGE_SIZE);
992 *pde = new_pd_pa | (*pde & ~PG_FRAME0x000ffffffffff000UL);
993
994 tlbflush();
995 memset((void *)old_pd_va, 0, PAGE_SIZE)__builtin_memset(((void *)old_pd_va), (0), ((1 << 12)));
996
997 pg = PHYS_TO_VM_PAGE(old_pd_pa);
998 if (pg != NULL((void *)0)) {
999 pg->wire_count--;
1000 pmap_kernel()(&kernel_pmap_store)->pm_stats.resident_count--;
1001 if (pg->wire_count <= 1)
1002 uvm_pagefree(pg);
1003 }
1004
1005 for (i = 0; i < NPDPG((1 << 12) / sizeof (pd_entry_t)); i++)
1006 if (new_pd_va[i] & PG_FRAME0x000ffffffffff000UL)
1007 pmap_randomize_level(&new_pd_va[i], level - 1);
1008}
1009
1010/*
1011 * Pre-allocate PTPs for low memory, so that 1:1 mappings for various
1012 * trampoline code can be entered.
1013 */
1014paddr_t
1015pmap_prealloc_lowmem_ptps(paddr_t first_avail)
1016{
1017 pd_entry_t *pdes;
1018 int level;
1019 paddr_t newp;
1020
1021 pdes = pmap_kernel()(&kernel_pmap_store)->pm_pdir;
1022 level = PTP_LEVELS4;
1023 for (;;) {
1024 newp = first_avail; first_avail += PAGE_SIZE(1 << 12);
1025 memset((void *)PMAP_DIRECT_MAP(newp), 0, PAGE_SIZE);
1026 pdes[pl_i(0, level)] = (newp & PG_FRAME) | PG_V | PG_RW;
1027 level--;
1028 if (level <= 1)
1029 break;
1030 pdes = normal_pdes[level - 2];
1031 }
1032
1033 return first_avail;
1034}
1035
1036/*
1037 * pmap_init: no further initialization required on this platform
1038 */
1039void
1040pmap_init(void)
1041{
1042 pmap_initialized = 1;
1043}
1044
1045/*
1046 * p v _ e n t r y f u n c t i o n s
1047 */
1048
1049/*
1050 * main pv_entry manipulation functions:
1051 * pmap_enter_pv: enter a mapping onto a pv list
1052 * pmap_remove_pv: remove a mapping from a pv list
1053 */
1054
1055/*
1056 * pmap_enter_pv: enter a mapping onto a pv list
1057 *
1058 * => caller should adjust ptp's wire_count before calling
1059 *
1060 * pve: preallocated pve for us to use
1061 * ptp: PTP in pmap that maps this VA
1062 */
1063
1064void
1065pmap_enter_pv(struct vm_page *pg, struct pv_entry *pve, struct pmap *pmap,
1066 vaddr_t va, struct vm_page *ptp)
1067{
1068 pve->pv_pmap = pmap;
1069 pve->pv_va = va;
1070 pve->pv_ptp = ptp; /* NULL for kernel pmap */
1071 mtx_enter(&pg->mdpage.pv_mtx);
1072 pve->pv_next = pg->mdpage.pv_list; /* add to ... */
1073 pg->mdpage.pv_list = pve; /* ... list */
1074 mtx_leave(&pg->mdpage.pv_mtx);
1075}
1076
1077/*
1078 * pmap_remove_pv: try to remove a mapping from a pv_list
1079 *
1080 * => caller should adjust ptp's wire_count and free PTP if needed
1081 * => we return the removed pve
1082 */
1083
1084struct pv_entry *
1085pmap_remove_pv(struct vm_page *pg, struct pmap *pmap, vaddr_t va)
1086{
1087 struct pv_entry *pve, **prevptr;
1088
1089 mtx_enter(&pg->mdpage.pv_mtx);
1090 prevptr = &pg->mdpage.pv_list;
1091 while ((pve = *prevptr) != NULL((void *)0)) {
1092 if (pve->pv_pmap == pmap && pve->pv_va == va) { /* match? */
1093 *prevptr = pve->pv_next; /* remove it! */
1094 break;
1095 }
1096 prevptr = &pve->pv_next; /* previous pointer */
1097 }
1098 mtx_leave(&pg->mdpage.pv_mtx);
1099 return(pve); /* return removed pve */
1100}
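
A standalone sketch of the pointer-to-pointer unlink idiom used above, applied to a plain singly linked list; the sk_* names are hypothetical:

#include <stddef.h>

struct sk_node { int key; struct sk_node *next; };

static struct sk_node *
sk_remove(struct sk_node **head, int key)
{
	struct sk_node *n, **prevptr;

	for (prevptr = head; (n = *prevptr) != NULL; prevptr = &n->next) {
		if (n->key == key) {
			*prevptr = n->next;	/* unlink without tracking a separate "prev" node */
			return n;
		}
	}
	return NULL;				/* not found, nothing removed */
}

int
main(void)
{
	struct sk_node b = { 2, NULL }, a = { 1, &b }, *head = &a;

	return sk_remove(&head, 2) == &b && a.next == NULL ? 0 : 1;
}
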
1101
1102/*
1103 * p t p f u n c t i o n s
1104 */
1105
1106struct vm_page *
1107pmap_find_ptp(struct pmap *pmap, vaddr_t va, paddr_t pa, int level)
1108{
1109 int lidx = level - 1;
1110 struct vm_page *pg;
1111
1112 if (pa != (paddr_t)-1 && pmap->pm_ptphint[lidx] &&
1113 pa == VM_PAGE_TO_PHYS(pmap->pm_ptphint[lidx])((pmap->pm_ptphint[lidx])->phys_addr))
1114 return (pmap->pm_ptphint[lidx]);
1115
1116 pg = uvm_pagelookup(&pmap->pm_obj[lidx], ptp_va2o(va, level));
1117
1118 return pg;
1119}
1120
1121void
1122pmap_freepage(struct pmap *pmap, struct vm_page *ptp, int level,
1123 struct pg_to_free *pagelist)
1124{
1125 int lidx;
1126 struct uvm_object *obj;
1127
1128 lidx = level - 1;
1129
1130 obj = &pmap->pm_obj[lidx];
1131 pmap->pm_stats.resident_count--;
1132 if (pmap->pm_ptphint[lidx] == ptp)
1133 pmap->pm_ptphint[lidx] = RBT_ROOT(uvm_objtree, &obj->memt)uvm_objtree_RBT_ROOT(&obj->memt);
1134 ptp->wire_count = 0;
1135 uvm_pagerealloc(ptp, NULL((void *)0), 0);
1136 TAILQ_INSERT_TAIL(pagelist, ptp, pageq);
1137}
1138
1139void
1140pmap_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va,
1141 struct pg_to_free *pagelist)
1142{
1143 unsigned long index;
1144 int level;
1145 vaddr_t invaladdr;
1146
1147 level = 1;
1148 do {
1149 pmap_freepage(pmap, ptp, level, pagelist);
1150 index = pl_i(va, level + 1);
1151 pmap_pte_set(&normal_pdes[level - 1][index], 0)_atomic_swap_64((&normal_pdes[level - 1][index]), (0));
1152 if (level == PTP_LEVELS4 - 1 && pmap->pm_pdir_intel != NULL((void *)0)) {
1153 /* Zap special meltdown PML4e */
1154 pmap_pte_set(&pmap->pm_pdir_intel[index], 0)_atomic_swap_64((&pmap->pm_pdir_intel[index]), (0));
1155 DPRINTF("%s: cleared meltdown PML4e @ index %lu "
1156 "(va range start 0x%llx)\n", __func__, index,
1157 (uint64_t)(index << L4_SHIFT));
1158 }
1159 invaladdr = level == 1 ? (vaddr_t)PTE_BASE((pt_entry_t *) (255 * (1ULL << 39))) :
1160 (vaddr_t)normal_pdes[level - 2];
1161 pmap_tlb_shootpage(pmap, invaladdr + index * PAGE_SIZE(1 << 12),
1162     pmap_is_curpmap(curpcb->pcb_pmap));
1163 if (level < PTP_LEVELS4 - 1) {
1164 ptp = pmap_find_ptp(pmap, va, (paddr_t)-1, level + 1);
1165 ptp->wire_count--;
1166 if (ptp->wire_count > 1)
1167 break;
1168 }
1169 } while (++level < PTP_LEVELS4);
1170}
1171
1172/*
1173 * pmap_get_ptp: get a PTP (if there isn't one, allocate a new one)
1174 *
1175 * => pmap should NOT be pmap_kernel()
1176 */
1177
1178struct vm_page *
1179pmap_get_ptp(struct pmap *pmap, vaddr_t va)
1180{
1181 struct vm_page *ptp, *pptp;
1182 int i;
1183 unsigned long index;
1184 pd_entry_t *pva, *pva_intel;
1185 paddr_t ppa, pa;
1186 struct uvm_object *obj;
1187
1188 ptp = NULL((void *)0);
1189 pa = (paddr_t)-1;
1190
1191 /*
1192 * Loop through all page table levels seeing if we need to
1193 * add a new page to that level.
1194 */
1195 for (i = PTP_LEVELS4; i > 1; i--) {
1196 /*
1197 * Save values from previous round.
1198 */
1199 pptp = ptp;
1200 ppa = pa;
1201
1202 index = pl_i(va, i);
1203 pva = normal_pdes[i - 2];
1204
1205 if (pmap_valid_entry(pva[index])((pva[index]) & 0x0000000000000001UL)) {
1206 ppa = pva[index] & PG_FRAME0x000ffffffffff000UL;
1207 ptp = NULL((void *)0);
1208 continue;
1209 }
1210
1211 obj = &pmap->pm_obj[i-2];
1212 ptp = uvm_pagealloc(obj, ptp_va2o(va, i - 1), NULL,
1213     UVM_PGA_USERESERVE|UVM_PGA_ZERO);
1214
1215 if (ptp == NULL((void *)0))
1216 return NULL((void *)0);
1217
1218 atomic_clearbits_intx86_atomic_clearbits_u32(&ptp->pg_flags, PG_BUSY0x00000001);
1219 ptp->wire_count = 1;
1220 pmap->pm_ptphint[i - 2] = ptp;
1221 pa = VM_PAGE_TO_PHYS(ptp)((ptp)->phys_addr);
1222 pva[index] = (pd_entry_t) (pa | PG_u0x0000000000000004UL | PG_RW0x0000000000000002UL | PG_V0x0000000000000001UL);
1223
1224 /*
1225 * Meltdown Special case - if we are adding a new PML4e for
1226 * usermode addresses, just copy the PML4e to the U-K page
1227 * table.
1228 */
1229 if (pmap->pm_pdir_intel != NULL((void *)0) && i == PTP_LEVELS4 &&
1230 va < VM_MAXUSER_ADDRESS0x00007f7fffffc000) {
1231 pva_intel = pmap->pm_pdir_intel;
1232 pva_intel[index] = pva[index];
1233 DPRINTF("%s: copying usermode PML4e (content=0x%llx) "
1234 "from 0x%llx -> 0x%llx\n", __func__, pva[index],
1235 (uint64_t)&pva[index], (uint64_t)&pva_intel[index]);
1236 }
1237
1238 pmap->pm_stats.resident_count++;
1239 /*
1240 * If we're not in the top level, increase the
1241 * wire count of the parent page.
1242 */
1243 if (i < PTP_LEVELS4) {
1244 if (pptp == NULL((void *)0))
1245 pptp = pmap_find_ptp(pmap, va, ppa, i);
1246#ifdef DIAGNOSTIC
1247 if (pptp == NULL((void *)0))
1248 panic("%s: pde page disappeared", __func__);
1249#endif
1250 pptp->wire_count++;
1251 }
1252 }
1253
1254 /*
1255 * ptp is not NULL if we just allocated a new ptp. If it's
1256 * still NULL, we must look up the existing one.
1257 */
1258 if (ptp == NULL((void *)0)) {
1259 ptp = pmap_find_ptp(pmap, va, ppa, 1);
1260#ifdef DIAGNOSTIC
1261 if (ptp == NULL((void *)0)) {
1262 printf("va %lx ppa %lx\n", (unsigned long)va,
1263 (unsigned long)ppa);
1264 panic("%s: unmanaged user PTP", __func__);
1265 }
1266#endif
1267 }
1268
1269 pmap->pm_ptphint[0] = ptp;
1270 return(ptp);
1271}
1272
1273/*
1274 * p m a p l i f e c y c l e f u n c t i o n s
1275 */
1276
1277/*
1278 * pmap_pdp_ctor: constructor for the PDP cache.
1279 */
1280
1281void
1282pmap_pdp_ctor(pd_entry_t *pdir)
1283{
1284 paddr_t pdirpa;
1285 int npde, i;
1286 struct pmap *kpm = pmap_kernel()(&kernel_pmap_store);
1287
1288 /* fetch the physical address of the page directory. */
1289 (void) pmap_extract(kpm, (vaddr_t) pdir, &pdirpa);
1290
1291 /* zero init area */
1292 memset(pdir, 0, PDIR_SLOT_PTE * sizeof(pd_entry_t))__builtin_memset((pdir), (0), (255 * sizeof(pd_entry_t)));
1293
1294 /* put in recursive PDE to map the PTEs */
1295 pdir[PDIR_SLOT_PTE255] = pdirpa | PG_V0x0000000000000001UL | PG_KW0x0000000000000002UL | pg_nx;
1296
1297 npde = nkptp[PTP_LEVELS4 - 1];
1298
1299 /* put in kernel VM PDEs */
1300 memcpy(&pdir[PDIR_SLOT_KERN], &PDP_BASE[PDIR_SLOT_KERN],
1301     npde * sizeof(pd_entry_t));
1302
1303 /* zero the rest */
1304 memset(&pdir[PDIR_SLOT_KERN + npde], 0,
1305     (NTOPLEVEL_PDES - (PDIR_SLOT_KERN + npde)) * sizeof(pd_entry_t));
1306
1307 for (i = 0; i < NUM_L4_SLOT_DIRECT4; i++)
1308 pdir[PDIR_SLOT_DIRECT(511 - 4) + i] = kpm->pm_pdir[PDIR_SLOT_DIRECT(511 - 4) + i];
1309
1310#if VM_MIN_KERNEL_ADDRESS != KERNBASE
1311 pdir[pl4_pi(KERNBASE)] = PDP_BASE[pl4_pi(KERNBASE)];
1312#endif
1313}
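
A standalone summary, as compile-time constants, of the PML4 layout the constructor above assembles; the slot numbers come from the expanded constants visible in this listing (PDIR_SLOT_PTE == 255, PDIR_SLOT_KERN == 256, PDIR_SLOT_DIRECT == 511 - 4, NUM_L4_SLOT_DIRECT == 4), and the SK_* names are hypothetical:

#include <assert.h>

enum {
	SK_SLOT_USER_FIRST	= 0,	/* user VA slots, zeroed by the ctor       */
	SK_SLOT_PTE		= 255,	/* recursive mapping of the page tables    */
	SK_SLOT_KERN		= 256,	/* kernel VM, copied from PDP_BASE         */
	SK_SLOT_DIRECT		= 507,	/* first of 4 physical-memory direct slots */
	SK_NSLOTS		= 512,
};

int
main(void)
{
	/* the 4 direct-map slots end just below the top slot (pl4_pi(KERNBASE) == 511) */
	assert(SK_SLOT_DIRECT + 4 == SK_NSLOTS - 1);
	return 0;
}
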
1314
1315void
1316pmap_pdp_ctor_intel(pd_entry_t *pdir)
1317{
1318 struct pmap *kpm = pmap_kernel()(&kernel_pmap_store);
1319
1320 /* Copy PML4es from pmap_kernel's U-K view */
1321 memcpy(pdir, kpm->pm_pdir_intel, PAGE_SIZE);
1322}
1323
1324/*
1325 * pmap_create: create a pmap
1326 *
1327 * => note: old pmap interface took a "size" args which allowed for
1328 * the creation of "software only" pmaps (not in bsd).
1329 */
1330
1331struct pmap *
1332pmap_create(void)
1333{
1334 struct pmap *pmap;
1335 int i;
1336
1337 pmap = pool_get(&pmap_pmap_pool, PR_WAITOK0x0001);
1338
1339 mtx_init(&pmap->pm_mtx, IPL_VM);
1340
1341 /* init uvm_object */
1342 for (i = 0; i < PTP_LEVELS4 - 1; i++) {
1343 uvm_obj_init(&pmap->pm_obj[i], &pmap_pager, 1);
1344 pmap->pm_ptphint[i] = NULL((void *)0);
1345 }
1346 pmap->pm_stats.wired_count = 0;
1347 pmap->pm_stats.resident_count = 1; /* count the PDP allocd below */
1348 pmap->pm_type = PMAP_TYPE_NORMAL1;
1349 pmap->eptp = 0;
1350
1351 /* allocate PDP */
1352
1353 /*
1354 * note that there is no need to splvm to protect us from
1355 * malloc since malloc allocates out of a submap and we should
1356 * have already allocated kernel PTPs to cover the range...
1357 */
1358
1359 pmap->pm_pdir = pool_get(&pmap_pdp_pool, PR_WAITOK0x0001);
1360 pmap_pdp_ctor(pmap->pm_pdir);
1361
1362 pmap->pm_pdirpa = pmap->pm_pdir[PDIR_SLOT_PTE255] & PG_FRAME0x000ffffffffff000UL;
1363
1364 /*
1365 * Intel CPUs need a special page table to be used during usermode
1366 * execution, one that lacks all kernel mappings.
1367 */
1368 if (cpu_meltdown) {
1369 pmap->pm_pdir_intel = pool_get(&pmap_pdp_pool, PR_WAITOK0x0001);
1370 pmap_pdp_ctor_intel(pmap->pm_pdir_intel);
1371 pmap->pm_stats.resident_count++;
1372 if (!pmap_extract(pmap_kernel()(&kernel_pmap_store), (vaddr_t)pmap->pm_pdir_intel,
1373 &pmap->pm_pdirpa_intel))
1374 panic("%s: unknown PA mapping for meltdown PML4",
1375 __func__);
1376 } else {
1377 pmap->pm_pdir_intel = NULL((void *)0);
1378 pmap->pm_pdirpa_intel = 0;
1379 }
1380
1381 mtx_enter(&pmaps_lock);
1382 LIST_INSERT_HEAD(&pmaps, pmap, pm_list);
1383 mtx_leave(&pmaps_lock);
1384 return (pmap);
1385}
1386
1387/*
1388 * pmap_destroy: drop reference count on pmap. free pmap if
1389 * reference count goes to zero.
1390 */
1391
1392void
1393pmap_destroy(struct pmap *pmap)
1394{
1395 struct vm_page *pg;
1396 int refs;
1397 int i;
1398
1399 /*
1400 * drop reference count
1401 */
1402
1403 refs = atomic_dec_int_nv(&pmap->pm_obj[0].uo_refs)_atomic_sub_int_nv((&pmap->pm_obj[0].uo_refs), 1);
1404 if (refs > 0) {
1405 return;
1406 }
1407
1408 /*
1409 * remove it from global list of pmaps
1410 */
1411 mtx_enter(&pmaps_lock);
1412 LIST_REMOVE(pmap, pm_list);
1413 mtx_leave(&pmaps_lock);
1414
1415 /*
1416 * free any remaining PTPs
1417 */
1418
1419 for (i = 0; i < PTP_LEVELS4 - 1; i++) {
1420 while ((pg = RBT_ROOT(uvm_objtree,
1421     &pmap->pm_obj[i].memt)) != NULL) {
1422 KASSERT((pg->pg_flags & PG_BUSY) == 0);
1423
1424 pg->wire_count = 0;
1425 pmap->pm_stats.resident_count--;
1426
1427 uvm_pagefree(pg);
1428 }
1429 }
1430
1431 pool_put(&pmap_pdp_pool, pmap->pm_pdir);
1432
1433 if (pmap->pm_pdir_intel != NULL((void *)0)) {
1434 pmap->pm_stats.resident_count--;
1435 pool_put(&pmap_pdp_pool, pmap->pm_pdir_intel);
1436 }
1437
1438 pool_put(&pmap_pmap_pool, pmap);
1439}
1440
1441/*
1442 * Add a reference to the specified pmap.
1443 */
1444
1445void
1446pmap_reference(struct pmap *pmap)
1447{
1448 atomic_inc_int(&pmap->pm_obj[0].uo_refs)_atomic_inc_int(&pmap->pm_obj[0].uo_refs);
1449}
1450
1451/*
1452 * pmap_activate: activate a process' pmap (fill in %cr3)
1453 *
1454 * => called from cpu_fork() and when switching pmaps during exec
1455 * => if p is the curproc, then load it into the MMU
1456 */
1457
1458void
1459pmap_activate(struct proc *p)
1460{
1461 struct pcb *pcb = &p->p_addr->u_pcb;
1462 struct pmap *pmap = p->p_vmspace->vm_map.pmap;
1463
1464 pcb->pcb_pmap = pmap;
1465 pcb->pcb_cr3 = pmap->pm_pdirpa;
1466 pcb->pcb_cr3 |= (pmap != pmap_kernel()(&kernel_pmap_store)) ? cr3_pcid_proc :
1467 (PCID_KERN0 | cr3_reuse_pcid);
1468
1469 if (p != curproc({struct cpu_info *__ci; asm volatile("movq %%gs:%P1,%0" : "=r"
(__ci) :"n" (__builtin_offsetof(struct cpu_info, ci_self)));
__ci;})->ci_curproc
)
1470 return;
1471
1472 if ((p->p_flag & P_SYSTEM0x00000200) == 0) {
1473 struct cpu_info *self = curcpu()({struct cpu_info *__ci; asm volatile("movq %%gs:%P1,%0" : "=r"
(__ci) :"n" (__builtin_offsetof(struct cpu_info, ci_self)));
__ci;})
;
1474
1475 /* mark the pmap in use by this processor */
1476 self->ci_proc_pmap = pmap;
1477
1478 /* in case we return to userspace without context switching */
1479 if (cpu_meltdown) {
1480 self->ci_kern_cr3 = pcb->pcb_cr3 | cr3_reuse_pcid;
1481 self->ci_user_cr3 = pmap->pm_pdirpa_intel |
1482 cr3_pcid_proc_intel;
1483 }
1484 }
1485
1486 lcr3(pcb->pcb_cr3);
1487}
1488
1489/*
1490 * pmap_deactivate: deactivate a process' pmap
1491 */
1492
1493void
1494pmap_deactivate(struct proc *p)
1495{
1496 if ((p->p_flag & P_SYSTEM0x00000200) == 0) {
1497 struct cpu_info *self = curcpu()({struct cpu_info *__ci; asm volatile("movq %%gs:%P1,%0" : "=r"
(__ci) :"n" (__builtin_offsetof(struct cpu_info, ci_self)));
__ci;})
;
1498
1499 /*
1500 * mark the pmap no longer in use by this processor.
1501 */
1502 KASSERT(self->ci_proc_pmap == p->p_vmspace->vm_map.pmap)((self->ci_proc_pmap == p->p_vmspace->vm_map.pmap) ?
(void)0 : __assert("diagnostic ", "/usr/src/sys/arch/amd64/amd64/pmap.c"
, 1502, "self->ci_proc_pmap == p->p_vmspace->vm_map.pmap"
))
;
1503 self->ci_proc_pmap = NULL((void *)0);
1504 }
1505}
1506
1507/*
1508 * end of lifecycle functions
1509 */
1510
1511/*
1512 * some misc. functions
1513 */
1514
1515int
1516pmap_pdes_valid(vaddr_t va, pd_entry_t *lastpde)
1517{
1518 int i;
1519 unsigned long index;
1520 pd_entry_t pde;
1521
1522 for (i = PTP_LEVELS4; i > 1; i--) {
1523 index = pl_i(va, i)(((((va) & ~0xffff000000000000)) & ptp_masks[(i)-1]) >>
ptp_shifts[(i)-1])
;
1524 pde = normal_pdes[i - 2][index];
1525 if (!pmap_valid_entry(pde)((pde) & 0x0000000000000001UL))
1526 return 0;
1527 }
1528 if (lastpde != NULL((void *)0))
1529 *lastpde = pde;
1530 return 1;
1531}
1532
1533/*
1534 * pmap_extract: extract a PA for the given VA
1535 */
1536
1537int
1538pmap_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap)
1539{
1540 pt_entry_t *ptes, pte;
1541 int level, offs;
1542
1543 if (pmap == pmap_kernel()(&kernel_pmap_store) && va >= PMAP_DIRECT_BASE(((((511 - 4) * (1ULL << 39))) | 0xffff000000000000)) &&
1544 va < PMAP_DIRECT_END((((((511 - 4) + 4) * (1ULL << 39))) | 0xffff000000000000
))
) {
1545 *pap = va - PMAP_DIRECT_BASE(((((511 - 4) * (1ULL << 39))) | 0xffff000000000000));
1546 return 1;
1547 }
1548
1549 if (pmap != pmap_kernel()(&kernel_pmap_store))
1550 mtx_enter(&pmap->pm_mtx);
1551
1552 level = pmap_find_pte_direct(pmap, va, &ptes, &offs);
1553 pte = ptes[offs];
1554
1555 if (pmap != pmap_kernel()(&kernel_pmap_store))
1556 mtx_leave(&pmap->pm_mtx);
1557
1558 if (__predict_true(level == 0 && pmap_valid_entry(pte))__builtin_expect(((level == 0 && ((pte) & 0x0000000000000001UL
)) != 0), 1)
) {
1559 if (pap != NULL((void *)0))
1560 *pap = (pte & PG_FRAME0x000ffffffffff000UL) | (va & PAGE_MASK((1 << 12) - 1));
1561 return 1;
1562 }
1563 if (level == 1 && (pte & (PG_PS0x0000000000000080UL|PG_V0x0000000000000001UL)) == (PG_PS0x0000000000000080UL|PG_V0x0000000000000001UL)) {
1564 if (pap != NULL((void *)0))
1565 *pap = (pte & PG_LGFRAME0x000fffffffe00000UL) | (va & PAGE_MASK_L2((1ULL << 21) - 1));
1566 return 1;
1567 }
1568
1569 return 0;
1570}
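
For illustration, a hedged sketch (not part of pmap.c) of a typical pmap_extract() caller, mirroring the lookup pmap_create() performs for the meltdown PML4; the helper name is hypothetical and kernel context is assumed:

static paddr_t
example_kva_to_pa(vaddr_t va)
{
	paddr_t pa;

	if (!pmap_extract(pmap_kernel(), va, &pa))
		panic("example_kva_to_pa: va 0x%lx not mapped", va);
	return pa;	/* page frame plus the offset within the page */
}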
1571
1572/*
1573 * pmap_zero_page: zero a page
1574 */
1575
1576void
1577pmap_zero_page(struct vm_page *pg)
1578{
1579 pagezero(pmap_map_direct(pg)((vaddr_t)(((((511 - 4) * (1ULL << 39))) | 0xffff000000000000
)) + (((pg)->phys_addr)))
);
1580}
1581
1582/*
1583 * pmap_flush_cache: flush the cache for a virtual address.
1584 */
1585void
1586pmap_flush_cache(vaddr_t addr, vsize_t len)
1587{
1588 vaddr_t i;
1589
1590 if (curcpu()({struct cpu_info *__ci; asm volatile("movq %%gs:%P1,%0" : "=r"
(__ci) :"n" (__builtin_offsetof(struct cpu_info, ci_self)));
__ci;})
->ci_cflushsz == 0) {
1591 wbinvd_on_all_cpus();
1592 return;
1593 }
1594
1595 /* all cpus that have clflush also have mfence. */
1596 mfence();
1597 for (i = addr; i < addr + len; i += curcpu()({struct cpu_info *__ci; asm volatile("movq %%gs:%P1,%0" : "=r"
(__ci) :"n" (__builtin_offsetof(struct cpu_info, ci_self)));
__ci;})
->ci_cflushsz)
1598 clflush(i);
1599 mfence();
1600}
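
For illustration, a hedged sketch (not part of pmap.c) of a pmap_flush_cache() caller: the routine clflushes the range one cache line at a time, or falls back to wbinvd_on_all_cpus() when the cache line size is not known. The helper name is hypothetical:

static void
example_flush_written_buffer(void *buf, vsize_t len)
{
	/* make CPU caches coherent with memory for [buf, buf + len) */
	pmap_flush_cache((vaddr_t)buf, len);
}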
1601
1602/*
1603 * pmap_copy_page: copy a page
1604 */
1605
1606void
1607pmap_copy_page(struct vm_page *srcpg, struct vm_page *dstpg)
1608{
1609 vaddr_t srcva = pmap_map_direct(srcpg)((vaddr_t)(((((511 - 4) * (1ULL << 39))) | 0xffff000000000000
)) + (((srcpg)->phys_addr)))
;
1610 vaddr_t dstva = pmap_map_direct(dstpg)((vaddr_t)(((((511 - 4) * (1ULL << 39))) | 0xffff000000000000
)) + (((dstpg)->phys_addr)))
;
1611
1612 memcpy((void *)dstva, (void *)srcva, PAGE_SIZE)__builtin_memcpy(((void *)dstva), ((void *)srcva), ((1 <<
12)))
;
1613}
1614
1615/*
1616 * p m a p r e m o v e f u n c t i o n s
1617 *
1618 * functions that remove mappings
1619 */
1620
1621/*
1622 * pmap_remove_ptes: remove PTEs from a PTP
1623 *
1624 * => PTP must be mapped into KVA
1625 * => PTP should be null if pmap == pmap_kernel()
1626 */
1627
1628void
1629pmap_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva,
1630 vaddr_t startva, vaddr_t endva, int flags, struct pv_entry **free_pvs)
1631{
1632 struct pv_entry *pve;
1633 pt_entry_t *pte = (pt_entry_t *) ptpva;
1634 struct vm_page *pg;
1635 pt_entry_t opte;
1636
1637 /*
1638 * note that ptpva points to the PTE that maps startva. this may
1639 * or may not be the first PTE in the PTP.
1640 *
1641 * we loop through the PTP while there are still PTEs to look at
1642 * and the wire_count is greater than 1 (because we use the wire_count
1643 * to keep track of the number of real PTEs in the PTP).
1644 */
1645
1646 for (/*null*/; startva < endva && (ptp == NULL((void *)0) || ptp->wire_count > 1)
1647 ; pte++, startva += PAGE_SIZE(1 << 12)) {
1648 if (!pmap_valid_entry(*pte)((*pte) & 0x0000000000000001UL))
1649 continue; /* VA not mapped */
1650 if ((flags & PMAP_REMOVE_SKIPWIRED1) && (*pte & PG_W0x0000000000000200UL)) {
1651 continue;
1652 }
1653
1654 /* atomically save the old PTE and zap! it */
1655 opte = pmap_pte_set(pte, 0)_atomic_swap_64((pte), (0));
1656
1657 if (opte & PG_W0x0000000000000200UL)
1658 pmap->pm_stats.wired_count--;
1659 pmap->pm_stats.resident_count--;
1660
1661 if (ptp != NULL((void *)0))
1662 ptp->wire_count--; /* dropping a PTE */
1663
1664 pg = PHYS_TO_VM_PAGE(opte & PG_FRAME0x000ffffffffff000UL);
1665
1666 /*
1667 * if we are not on a pv list we are done.
1668 */
1669
1670 if ((opte & PG_PVLIST0x0000000000000400UL) == 0) {
1671#ifdef DIAGNOSTIC1
1672 if (pg != NULL((void *)0))
1673 panic("%s: managed page without PG_PVLIST: "
1674 "va 0x%lx, opte 0x%llx", __func__,
1675 startva, opte);
1676#endif
1677 continue;
1678 }
1679
1680#ifdef DIAGNOSTIC1
1681 if (pg == NULL((void *)0))
1682 panic("%s: unmanaged page marked PG_PVLIST: "
1683 "va 0x%lx, opte 0x%llx", __func__,
1684 startva, opte);
1685#endif
1686
1687 /* sync R/M bits */
1688 pmap_sync_flags_pte(pg, opte);
1689 pve = pmap_remove_pv(pg, pmap, startva);
1690 if (pve != NULL((void *)0)) {
1691 pve->pv_next = *free_pvs;
1692 *free_pvs = pve;
1693 }
1694
1695 /* end of "for" loop: time for next pte */
1696 }
1697}
1698
1699/*
1700 * pmap_remove_pte: remove a single PTE from a PTP
1701 *
1702 * => PTP must be mapped into KVA
1703 * => PTP should be null if pmap == pmap_kernel()
1704 * => returns true if we removed a mapping
1705 */
1706
1707int
1708pmap_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte,
1709 vaddr_t va, int flags, struct pv_entry **free_pvs)
1710{
1711 struct pv_entry *pve;
1712 struct vm_page *pg;
1713 pt_entry_t opte;
1714
1715 if (!pmap_valid_entry(*pte)((*pte) & 0x0000000000000001UL))
1716 return 0; /* VA not mapped */
1717 if ((flags & PMAP_REMOVE_SKIPWIRED1) && (*pte & PG_W0x0000000000000200UL)) {
1718 return 0;
1719 }
1720
1721 /* atomically save the old PTE and zap! it */
1722 opte = pmap_pte_set(pte, 0)_atomic_swap_64((pte), (0));
1723
1724 if (opte & PG_W0x0000000000000200UL)
1725 pmap->pm_stats.wired_count--;
1726 pmap->pm_stats.resident_count--;
1727
1728 if (ptp != NULL((void *)0))
1729 ptp->wire_count--; /* dropping a PTE */
1730
1731 pg = PHYS_TO_VM_PAGE(opte & PG_FRAME0x000ffffffffff000UL);
1732
1733 /*
1734 * if we are not on a pv list we are done.
1735 */
1736 if ((opte & PG_PVLIST0x0000000000000400UL) == 0) {
1737#ifdef DIAGNOSTIC1
1738 if (pg != NULL((void *)0))
1739 panic("%s: managed page without PG_PVLIST: "
1740 "va 0x%lx, opte 0x%llx", __func__, va, opte);
1741#endif
1742 return 1;
1743 }
1744
1745#ifdef DIAGNOSTIC1
1746 if (pg == NULL((void *)0))
1747 panic("%s: unmanaged page marked PG_PVLIST: "
1748 "va 0x%lx, opte 0x%llx", __func__, va, opte);
1749#endif
1750
1751 /* sync R/M bits */
1752 pmap_sync_flags_pte(pg, opte);
1753 pve = pmap_remove_pv(pg, pmap, va);
1754 if (pve != NULL((void *)0)) {
1755 pve->pv_next = *free_pvs;
1756 *free_pvs = pve;
1757 }
1758
1759 return 1;
1760}
1761
1762/*
1763 * pmap_remove: top level mapping removal function
1764 *
1765 * => caller should not be holding any pmap locks
1766 */
1767
1768void
1769pmap_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
1770{
1771 if (pmap->pm_type == PMAP_TYPE_EPT2)
1772 pmap_remove_ept(pmap, sva, eva);
1773 else
1774 pmap_do_remove(pmap, sva, eva, PMAP_REMOVE_ALL0);
1775}
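
For illustration, a hedged sketch (not part of pmap.c) of the one-page case, which takes the shortcut path in pmap_do_remove() below; the wrapper name is hypothetical:

static void
example_unmap_one_page(struct pmap *pm, vaddr_t va)
{
	/* exactly one page wide, so pmap_do_remove() uses its shortcut */
	pmap_remove(pm, va, va + PAGE_SIZE);
}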
1776
1777/*
1778 * pmap_do_remove: mapping removal guts
1779 *
1780 * => caller should not be holding any pmap locks
1781 */
1782
1783void
1784pmap_do_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva, int flags)
1785{
1786 pd_entry_t pde;
1787 int result;
1788 paddr_t ptppa;
1789 vaddr_t blkendva;
1790 struct vm_page *ptp;
1791 struct pv_entry *pve;
1792 struct pv_entry *free_pvs = NULL((void *)0);
1793 vaddr_t va;
1794 int shootall = 0, shootself;
1795 struct pg_to_free empty_ptps;
1796 paddr_t scr3;
1797
1798 TAILQ_INIT(&empty_ptps)do { (&empty_ptps)->tqh_first = ((void *)0); (&empty_ptps
)->tqh_last = &(&empty_ptps)->tqh_first; } while
(0)
;
1799
1800 scr3 = pmap_map_ptes(pmap);
1801 shootself = (scr3 == 0);
1802
1803 /*
1804 * removing one page? take shortcut function.
1805 */
1806
1807 if (sva + PAGE_SIZE(1 << 12) == eva) {
1808 if (pmap_pdes_valid(sva, &pde)) {
1809
1810 /* PA of the PTP */
1811 ptppa = pde & PG_FRAME0x000ffffffffff000UL;
1812
1813 /* get PTP if non-kernel mapping */
1814
1815 if (pmap == pmap_kernel()(&kernel_pmap_store)) {
1816 /* we never free kernel PTPs */
1817 ptp = NULL((void *)0);
1818 } else {
1819 ptp = pmap_find_ptp(pmap, sva, ptppa, 1);
1820#ifdef DIAGNOSTIC1
1821 if (ptp == NULL((void *)0))
1822 panic("%s: unmanaged PTP detected "
1823 "in shortcut path", __func__);
1824#endif
1825 }
1826
1827 /* do it! */
1828 result = pmap_remove_pte(pmap, ptp,
1829 &PTE_BASE((pt_entry_t *) (255 * (1ULL << 39)))[pl1_i(sva)(((((sva) & ~0xffff000000000000)) & (((0x0000ff8000000000UL
|0x0000007fc0000000UL)|0x000000003fe00000UL)|0x00000000001ff000UL
)) >> 12)
], sva, flags, &free_pvs);
1830
1831 /*
1832 * if mapping removed and the PTP is no longer
1833 * being used, free it!
1834 */
1835
1836 if (result && ptp && ptp->wire_count <= 1)
1837 pmap_free_ptp(pmap, ptp, sva, &empty_ptps);
1838 pmap_tlb_shootpage(pmap, sva, shootself);
1839 pmap_unmap_ptes(pmap, scr3);
1840 pmap_tlb_shootwait();
1841 } else {
1842 pmap_unmap_ptes(pmap, scr3);
1843 }
1844
1845 goto cleanup;
1846 }
1847
1848 if ((eva - sva > 32 * PAGE_SIZE(1 << 12)) && sva < VM_MIN_KERNEL_ADDRESS0xffff800000000000)
1849 shootall = 1;
1850
1851 for (va = sva; va < eva; va = blkendva) {
1852 /* determine range of block */
1853 blkendva = x86_round_pdr(va + 1)((((unsigned long)(va + 1)) + ((1ULL << 21) - 1)) &
~((1ULL << 21) - 1))
;
1854 if (blkendva > eva)
1855 blkendva = eva;
1856
1857 /*
1858 * XXXCDC: our PTE mappings should never be removed
1859 * with pmap_remove! if we allow this (and why would
1860 * we?) then we end up freeing the pmap's page
1861 * directory page (PDP) before we are finished using
1862 * it when we hit it in the recursive mapping. this
1863 * is BAD.
1864 *
1865 * long term solution is to move the PTEs out of user
1866 * address space and into kernel address space (up
1867 * with APTE); then we can set VM_MAXUSER_ADDRESS to
1868 * be VM_MAX_ADDRESS.
1869 */
1870
1871 if (pl_i(va, PTP_LEVELS)(((((va) & ~0xffff000000000000)) & ptp_masks[(4)-1]) >>
ptp_shifts[(4)-1])
== PDIR_SLOT_PTE255)
1872 /* XXXCDC: ugly hack to avoid freeing PDP here */
1873 continue;
1874
1875 if (!pmap_pdes_valid(va, &pde))
1876 continue;
1877
1878 /* PA of the PTP */
1879 ptppa = pde & PG_FRAME0x000ffffffffff000UL;
1880
1881 /* get PTP if non-kernel mapping */
1882 if (pmap == pmap_kernel()(&kernel_pmap_store)) {
1883 /* we never free kernel PTPs */
1884 ptp = NULL((void *)0);
1885 } else {
1886 ptp = pmap_find_ptp(pmap, va, ptppa, 1);
1887#ifdef DIAGNOSTIC1
1888 if (ptp == NULL((void *)0))
1889 panic("%s: unmanaged PTP detected", __func__);
1890#endif
1891 }
1892 pmap_remove_ptes(pmap, ptp, (vaddr_t)&PTE_BASE((pt_entry_t *) (255 * (1ULL << 39)))[pl1_i(va)(((((va) & ~0xffff000000000000)) & (((0x0000ff8000000000UL
|0x0000007fc0000000UL)|0x000000003fe00000UL)|0x00000000001ff000UL
)) >> 12)
],
1893 va, blkendva, flags, &free_pvs);
1894
1895 /* if PTP is no longer being used, free it! */
1896 if (ptp && ptp->wire_count <= 1) {
1897 pmap_free_ptp(pmap, ptp, va, &empty_ptps);
1898 }
1899 }
1900
1901 if (shootall)
1902 pmap_tlb_shoottlb(pmap, shootself);
1903 else
1904 pmap_tlb_shootrange(pmap, sva, eva, shootself);
1905
1906 pmap_unmap_ptes(pmap, scr3);
1907 pmap_tlb_shootwait();
1908
1909cleanup:
1910 while ((pve = free_pvs) != NULL((void *)0)) {
1911 free_pvs = pve->pv_next;
1912 pool_put(&pmap_pv_pool, pve);
1913 }
1914
1915 while ((ptp = TAILQ_FIRST(&empty_ptps)((&empty_ptps)->tqh_first)) != NULL((void *)0)) {
1916 TAILQ_REMOVE(&empty_ptps, ptp, pageq)do { if (((ptp)->pageq.tqe_next) != ((void *)0)) (ptp)->
pageq.tqe_next->pageq.tqe_prev = (ptp)->pageq.tqe_prev;
else (&empty_ptps)->tqh_last = (ptp)->pageq.tqe_prev
; *(ptp)->pageq.tqe_prev = (ptp)->pageq.tqe_next; ((ptp
)->pageq.tqe_prev) = ((void *)-1); ((ptp)->pageq.tqe_next
) = ((void *)-1); } while (0)
;
1917 uvm_pagefree(ptp);
1918 }
1919}
1920
1921/*
1922 * pmap_page_remove: remove a managed vm_page from all pmaps that map it
1923 *
1924 * => R/M bits are sync'd back to attrs
1925 */
1926
1927void
1928pmap_page_remove(struct vm_page *pg)
1929{
1930 struct pv_entry *pve;
1931 struct pmap *pm;
1932 pt_entry_t opte;
1933#ifdef DIAGNOSTIC1
1934 pd_entry_t pde;
1935#endif
1936 struct pg_to_free empty_ptps;
1937 struct vm_page *ptp;
1938 paddr_t scr3;
1939 int shootself;
1940
1941 TAILQ_INIT(&empty_ptps)do { (&empty_ptps)->tqh_first = ((void *)0); (&empty_ptps
)->tqh_last = &(&empty_ptps)->tqh_first; } while
(0)
;
1942
1943 mtx_enter(&pg->mdpage.pv_mtx);
1944 while ((pve = pg->mdpage.pv_list) != NULL((void *)0)) {
1945 pmap_reference(pve->pv_pmap);
1946 pm = pve->pv_pmap;
1947 mtx_leave(&pg->mdpage.pv_mtx);
1948
1949 /* XXX use direct map? */
1950 scr3 = pmap_map_ptes(pm); /* locks pmap */
1951 shootself = (scr3 == 0);
1952
1953 /*
1954 * We dropped the pvlist lock before grabbing the pmap
1955 * lock to avoid lock ordering problems. This means
1956 * we have to check the pvlist again since somebody
1957 * else might have modified it. All we care about is
1958 * that the pvlist entry matches the pmap we just
1959 * locked. If it doesn't, unlock the pmap and try
1960 * again.
1961 */
1962 mtx_enter(&pg->mdpage.pv_mtx);
1963 if ((pve = pg->mdpage.pv_list) == NULL((void *)0) ||
1964 pve->pv_pmap != pm) {
1965 mtx_leave(&pg->mdpage.pv_mtx);
1966 pmap_unmap_ptes(pm, scr3); /* unlocks pmap */
1967 pmap_destroy(pm);
1968 mtx_enter(&pg->mdpage.pv_mtx);
1969 continue;
1970 }
1971
1972 pg->mdpage.pv_list = pve->pv_next;
1973 mtx_leave(&pg->mdpage.pv_mtx);
1974
1975#ifdef DIAGNOSTIC1
1976 if (pve->pv_ptp != NULL((void *)0) && pmap_pdes_valid(pve->pv_va, &pde) &&
1977 (pde & PG_FRAME0x000ffffffffff000UL) != VM_PAGE_TO_PHYS(pve->pv_ptp)((pve->pv_ptp)->phys_addr)) {
1978 printf("%s: pg=%p: va=%lx, pv_ptp=%p\n", __func__,
1979 pg, pve->pv_va, pve->pv_ptp);
1980 printf("%s: PTP's phys addr: "
1981 "actual=%lx, recorded=%lx\n", __func__,
1982 (unsigned long)(pde & PG_FRAME0x000ffffffffff000UL),
1983 VM_PAGE_TO_PHYS(pve->pv_ptp)((pve->pv_ptp)->phys_addr));
1984 panic("%s: mapped managed page has "
1985 "invalid pv_ptp field", __func__);
1986 }
1987#endif
1988
1989 /* atomically save the old PTE and zap it */
1990 opte = pmap_pte_set(&PTE_BASE[pl1_i(pve->pv_va)], 0)_atomic_swap_64((&((pt_entry_t *) (255 * (1ULL << 39
)))[(((((pve->pv_va) & ~0xffff000000000000)) & (((
0x0000ff8000000000UL|0x0000007fc0000000UL)|0x000000003fe00000UL
)|0x00000000001ff000UL)) >> 12)]), (0))
;
1991
1992 if (opte & PG_W0x0000000000000200UL)
1993 pve->pv_pmap->pm_stats.wired_count--;
1994 pve->pv_pmap->pm_stats.resident_count--;
1995
1996 pmap_tlb_shootpage(pve->pv_pmap, pve->pv_va, shootself);
1997
1998 pmap_sync_flags_pte(pg, opte);
1999
2000 /* update the PTP reference count. free if last reference. */
2001 if (pve->pv_ptp != NULL((void *)0)) {
2002 pve->pv_ptp->wire_count--;
2003 if (pve->pv_ptp->wire_count <= 1) {
2004 pmap_free_ptp(pve->pv_pmap, pve->pv_ptp,
2005 pve->pv_va, &empty_ptps);
2006 }
2007 }
2008 pmap_unmap_ptes(pve->pv_pmap, scr3); /* unlocks pmap */
2009 pmap_destroy(pve->pv_pmap);
2010 pool_put(&pmap_pv_pool, pve);
2011 mtx_enter(&pg->mdpage.pv_mtx);
2012 }
2013 mtx_leave(&pg->mdpage.pv_mtx);
2014
2015 pmap_tlb_shootwait();
2016
2017 while ((ptp = TAILQ_FIRST(&empty_ptps)((&empty_ptps)->tqh_first)) != NULL((void *)0)) {
2018 TAILQ_REMOVE(&empty_ptps, ptp, pageq)do { if (((ptp)->pageq.tqe_next) != ((void *)0)) (ptp)->
pageq.tqe_next->pageq.tqe_prev = (ptp)->pageq.tqe_prev;
else (&empty_ptps)->tqh_last = (ptp)->pageq.tqe_prev
; *(ptp)->pageq.tqe_prev = (ptp)->pageq.tqe_next; ((ptp
)->pageq.tqe_prev) = ((void *)-1); ((ptp)->pageq.tqe_next
) = ((void *)-1); } while (0)
;
2019 uvm_pagefree(ptp);
2020 }
2021}
2022
2023/*
2024 * p m a p a t t r i b u t e f u n c t i o n s
2025 * functions that test/change managed page's attributes
2026 * since a page can be mapped multiple times we must check each PTE that
2027 * maps it by going down the pv lists.
2028 */
2029
2030/*
2031 * pmap_test_attrs: test a page's attributes
2032 */
2033
2034int
2035pmap_test_attrs(struct vm_page *pg, unsigned int testbits)
2036{
2037 struct pv_entry *pve;
2038 pt_entry_t *ptes;
2039 int level, offs;
2040 u_long mybits, testflags;
2041
2042 testflags = pmap_pte2flags(testbits);
2043
2044 if (pg->pg_flags & testflags)
2045 return 1;
2046
2047 mybits = 0;
2048 mtx_enter(&pg->mdpage.pv_mtx);
2049 for (pve = pg->mdpage.pv_list; pve != NULL((void *)0) && mybits == 0;
2050 pve = pve->pv_next) {
2051 level = pmap_find_pte_direct(pve->pv_pmap, pve->pv_va, &ptes,
2052 &offs);
2053 mybits |= (ptes[offs] & testbits);
2054 }
2055 mtx_leave(&pg->mdpage.pv_mtx);
2056
2057 if (mybits == 0)
2058 return 0;
2059
2060 atomic_setbits_intx86_atomic_setbits_u32(&pg->pg_flags, pmap_pte2flags(mybits));
2061
2062 return 1;
2063}
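
For illustration, a hedged sketch (not part of pmap.c) of the usual style of pmap_test_attrs() caller; PG_M (the modified bit) is taken from the amd64 pte definitions and does not appear in this excerpt, so treat the constant name as an assumption:

static int
example_page_is_dirty(struct vm_page *pg)	/* hypothetical helper */
{
	return pmap_test_attrs(pg, PG_M);
}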
2064
2065/*
2066 * pmap_clear_attrs: change a page's attributes
2067 *
2068 * => we return 1 if we cleared one of the bits we were asked to
2069 */
2070
2071int
2072pmap_clear_attrs(struct vm_page *pg, unsigned long clearbits)
2073{
2074 struct pv_entry *pve;
2075 pt_entry_t *ptes, opte;
2076 u_long clearflags;
2077 int result, level, offs;
2078
2079 clearflags = pmap_pte2flags(clearbits);
2080
2081 result = pg->pg_flags & clearflags;
2082 if (result)
2083 atomic_clearbits_intx86_atomic_clearbits_u32(&pg->pg_flags, clearflags);
2084
2085 mtx_enter(&pg->mdpage.pv_mtx);
2086 for (pve = pg->mdpage.pv_list; pve != NULL((void *)0); pve = pve->pv_next) {
2087 level = pmap_find_pte_direct(pve->pv_pmap, pve->pv_va, &ptes,
Value stored to 'level' is never read
2088 &offs);
2089 opte = ptes[offs];
2090 if (opte & clearbits) {
2091 result = 1;
2092 pmap_pte_clearbits(&ptes[offs], (opte & clearbits))x86_atomic_clearbits_u64(&ptes[offs], (opte & clearbits
))
;
2093 pmap_tlb_shootpage(pve->pv_pmap, pve->pv_va,
2094 pmap_is_curpmap(pve->pv_pmap));
2095 }
2096 }
2097 mtx_leave(&pg->mdpage.pv_mtx);
2098
2099 pmap_tlb_shootwait();
2100
2101 return (result != 0);
2102}
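
On the analyzer finding above (line 2087, "Value stored to 'level' is never read"): unlike pmap_test_attrs() and pmap_extract(), this loop never consults the level returned by pmap_find_pte_direct(), so the assignment is a dead store. One possible way to silence the warning, shown here only as a hedged sketch and not as the committed fix, is to drop 'level' from the declarations and discard the return value explicitly:

	/* inside the pv-list walk, with 'level' removed from the locals */
	(void)pmap_find_pte_direct(pve->pv_pmap, pve->pv_va, &ptes, &offs);
	opte = ptes[offs];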
2103
2104/*
2105 * p m a p p r o t e c t i o n f u n c t i o n s
2106 */
2107
2108/*
2109 * pmap_page_protect: change the protection of all recorded mappings
2110 * of a managed page
2111 *
2112 * => NOTE: this is an inline function in pmap.h
2113 */
2114
2115/* see pmap.h */
2116
2117/*
2118 * pmap_protect: set the protection of the pages in a pmap
2119 *
2120 * => NOTE: this is an inline function in pmap.h
2121 */
2122
2123/* see pmap.h */
2124
2125/*
2126 * pmap_write_protect: write-protect pages in a pmap
2127 */
2128
2129void
2130pmap_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot)
2131{
2132 pt_entry_t *spte, *epte;
2133 pt_entry_t clear = 0, set = 0;
2134 vaddr_t blockend;
2135 int shootall = 0, shootself;
2136 vaddr_t va;
2137 paddr_t scr3;
2138
2139 scr3 = pmap_map_ptes(pmap);
2140 shootself = (scr3 == 0);
2141
2142 /* should be ok, but just in case ... */
2143 sva &= PG_FRAME0x000ffffffffff000UL;
2144 eva &= PG_FRAME0x000ffffffffff000UL;
2145
2146 if (!(prot & PROT_READ0x01))
2147 set |= pg_xo;
2148 if (!(prot & PROT_WRITE0x02))
2149 clear = PG_RW0x0000000000000002UL;
2150 if (!(prot & PROT_EXEC0x04))
2151 set |= pg_nx;
2152
2153 if ((eva - sva > 32 * PAGE_SIZE(1 << 12)) && sva < VM_MIN_KERNEL_ADDRESS0xffff800000000000)
2154 shootall = 1;
2155
2156 for (va = sva; va < eva ; va = blockend) {
2157 blockend = (va & L2_FRAME((0x0000ff8000000000UL|0x0000007fc0000000UL)|0x000000003fe00000UL
)
) + NBPD_L2(1ULL << 21);
2158 if (blockend > eva)
2159 blockend = eva;
2160
2161 /*
2162 * XXXCDC: our PTE mappings should never be write-protected!
2163 *
2164 * long term solution is to move the PTEs out of user
2165 * address space and into kernel address space (up
2166 * with APTE); then we can set VM_MAXUSER_ADDRESS to
2167 * be VM_MAX_ADDRESS.
2168 */
2169
2170 /* XXXCDC: ugly hack to avoid freeing PDP here */
2171 if (pl_i(va, PTP_LEVELS)(((((va) & ~0xffff000000000000)) & ptp_masks[(4)-1]) >>
ptp_shifts[(4)-1])
== PDIR_SLOT_PTE255)
2172 continue;
2173
2174 /* empty block? */
2175 if (!pmap_pdes_valid(va, NULL((void *)0)))
2176 continue;
2177
2178#ifdef DIAGNOSTIC1
2179 if (va >= VM_MAXUSER_ADDRESS0x00007f7fffffc000 && va < VM_MAX_ADDRESS0x00007fbfdfeff000)
2180 panic("%s: PTE space", __func__);
2181#endif
2182
2183 spte = &PTE_BASE((pt_entry_t *) (255 * (1ULL << 39)))[pl1_i(va)(((((va) & ~0xffff000000000000)) & (((0x0000ff8000000000UL
|0x0000007fc0000000UL)|0x000000003fe00000UL)|0x00000000001ff000UL
)) >> 12)
];
2184 epte = &PTE_BASE((pt_entry_t *) (255 * (1ULL << 39)))[pl1_i(blockend)(((((blockend) & ~0xffff000000000000)) & (((0x0000ff8000000000UL
|0x0000007fc0000000UL)|0x000000003fe00000UL)|0x00000000001ff000UL
)) >> 12)
];
2185
2186 for (/*null */; spte < epte ; spte++) {
2187 if (!pmap_valid_entry(*spte)((*spte) & 0x0000000000000001UL))
2188 continue;
2189 pmap_pte_clearbits(spte, clear)x86_atomic_clearbits_u64(spte, clear);
2190 pmap_pte_setbits(spte, set)x86_atomic_setbits_u64(spte, set);
2191 }
2192 }
2193
2194 if (shootall)
2195 pmap_tlb_shoottlb(pmap, shootself);
2196 else
2197 pmap_tlb_shootrange(pmap, sva, eva, shootself);
2198
2199 pmap_unmap_ptes(pmap, scr3);
2200 pmap_tlb_shootwait();
2201}
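
For illustration, a hedged sketch (not part of pmap.c) of a pmap_write_protect() caller that revokes write access over a range; the wrapper name is hypothetical:

static void
example_revoke_write(struct pmap *pm, vaddr_t sva, vaddr_t eva)
{
	/* clears PG_RW on every valid PTE in [sva, eva), then shoots the TLB */
	pmap_write_protect(pm, sva, eva, PROT_READ | PROT_EXEC);
}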
2202
2203/*
2204 * end of protection functions
2205 */
2206
2207/*
2208 * pmap_unwire: clear the wired bit in the PTE
2209 *
2210 * => mapping should already be in map
2211 */
2212
2213void
2214pmap_unwire(struct pmap *pmap, vaddr_t va)
2215{
2216 pt_entry_t *ptes;
2217 int level, offs;
2218
2219 level = pmap_find_pte_direct(pmap, va, &ptes, &offs);
2220
2221 if (level == 0) {
2222
2223#ifdef DIAGNOSTIC1
2224 if (!pmap_valid_entry(ptes[offs])((ptes[offs]) & 0x0000000000000001UL))
2225 panic("%s: invalid (unmapped) va 0x%lx", __func__, va);
2226#endif
2227 if (__predict_true((ptes[offs] & PG_W) != 0)__builtin_expect((((ptes[offs] & 0x0000000000000200UL) !=
0) != 0), 1)
) {
2228 pmap_pte_clearbits(&ptes[offs], PG_W)x86_atomic_clearbits_u64(&ptes[offs], 0x0000000000000200UL
)
;
2229 pmap->pm_stats.wired_count--;
2230 }
2231#ifdef DIAGNOSTIC1
2232 else {
2233 printf("%s: wiring for pmap %p va 0x%lx "
2234 "didn't change!\n", __func__, pmap, va);
2235 }
2236#endif
2237 }
2238#ifdef DIAGNOSTIC1
2239 else {
2240 panic("%s: invalid PDE", __func__);
2241 }
2242#endif
2243}
2244
2245#if 0
2246/*
2247 * pmap_collect: free resources held by a pmap
2248 *
2249 * => optional function.
2250 * => called when a process is swapped out to free memory.
2251 */
2252
2253void
2254pmap_collect(struct pmap *pmap)
2255{
2256 /*
2257 * free all of the pt pages by removing the physical mappings
2258 * for its entire address space.
2259 */
2260
2261 pmap_do_remove(pmap, VM_MIN_ADDRESS(1 << 12), VM_MAX_ADDRESS0x00007fbfdfeff000,
2262 PMAP_REMOVE_SKIPWIRED1);
2263}
2264#endif
2265
2266void
2267pmap_enter_special(vaddr_t va, paddr_t pa, vm_prot_t prot)
2268{
2269 uint64_t l4idx, l3idx, l2idx, l1idx;
2270 pd_entry_t *pd, *ptp;
2271 paddr_t npa;
2272 struct pmap *pmap = pmap_kernel()(&kernel_pmap_store);
2273 pt_entry_t *ptes;
2274 int level, offs;
2275
2276 /* If CPU is secure, no need to do anything */
2277 if (!cpu_meltdown)
2278 return;
2279
2280 /* Must be kernel VA */
2281 if (va < VM_MIN_KERNEL_ADDRESS0xffff800000000000)
2282 panic("%s: invalid special mapping va 0x%lx requested",
2283 __func__, va);
2284
2285 if (pmap->pm_pdir_intel == NULL((void *)0))
2286 pmap->pm_pdir_intel = pool_get(&pmap_pdp_pool,
2287 PR_WAITOK0x0001 | PR_ZERO0x0008);
2288
2289 l4idx = (va & L4_MASK0x0000ff8000000000UL) >> L4_SHIFT39; /* PML4E idx */
2290 l3idx = (va & L3_MASK0x0000007fc0000000UL) >> L3_SHIFT30; /* PDPTE idx */
2291 l2idx = (va & L2_MASK0x000000003fe00000UL) >> L2_SHIFT21; /* PDE idx */
2292 l1idx = (va & L1_MASK0x00000000001ff000UL) >> L1_SHIFT12; /* PTE idx */
2293
2294 DPRINTF("%s: va=0x%llx pa=0x%llx l4idx=%lld l3idx=%lld "
2295 "l2idx=%lld l1idx=%lld\n", __func__, (uint64_t)va,
2296 (uint64_t)pa, l4idx, l3idx, l2idx, l1idx);
2297
2298 /* Start at PML4 / top level */
2299 pd = pmap->pm_pdir_intel;
2300
2301 if (pd == NULL((void *)0))
2302 panic("%s: PML4 not initialized for pmap @ %p", __func__,
2303 pmap);
2304
2305 /* npa = physaddr of PDPT */
2306 npa = pd[l4idx] & PMAP_PA_MASK~((paddr_t)((1 << 12) - 1));
2307
2308 /* Valid PML4e for the 512GB region containing va? */
2309 if (!npa) {
2310 /* No valid PML4E - allocate PDPT page and set PML4E */
2311
2312 ptp = pool_get(&pmap_pdp_pool, PR_WAITOK0x0001 | PR_ZERO0x0008);
2313
2314 if (!pmap_extract(pmap, (vaddr_t)ptp, &npa))
2315 panic("%s: can't locate PDPT page", __func__);
2316
2317 pd[l4idx] = (npa | PG_RW0x0000000000000002UL | PG_V0x0000000000000001UL);
2318
2319 DPRINTF("%s: allocated new PDPT page at phys 0x%llx, "
2320 "setting PML4e[%lld] = 0x%llx\n", __func__,
2321 (uint64_t)npa, l4idx, pd[l4idx]);
2322 }
2323
2324 pd = (pd_entry_t *)PMAP_DIRECT_MAP(npa)((vaddr_t)(((((511 - 4) * (1ULL << 39))) | 0xffff000000000000
)) + (npa))
;
2325 if (pd == NULL((void *)0))
2326 panic("%s: can't locate PDPT @ pa=0x%llx", __func__,
2327 (uint64_t)npa);
2328
2329 /* npa = physaddr of PD page */
2330 npa = pd[l3idx] & PMAP_PA_MASK~((paddr_t)((1 << 12) - 1));
2331
2332 /* Valid PDPTe for the 1GB region containing va? */
2333 if (!npa) {
2334 /* No valid PDPTe - allocate PD page and set PDPTe */
2335
2336 ptp = pool_get(&pmap_pdp_pool, PR_WAITOK0x0001 | PR_ZERO0x0008);
2337
2338 if (!pmap_extract(pmap, (vaddr_t)ptp, &npa))
2339 panic("%s: can't locate PD page", __func__);
2340
2341 pd[l3idx] = (npa | PG_RW0x0000000000000002UL | PG_V0x0000000000000001UL);
2342
2343 DPRINTF("%s: allocated new PD page at phys 0x%llx, "
2344 "setting PDPTe[%lld] = 0x%llx\n", __func__,
2345 (uint64_t)npa, l3idx, pd[l3idx]);
2346 }
2347
2348 pd = (pd_entry_t *)PMAP_DIRECT_MAP(npa)((vaddr_t)(((((511 - 4) * (1ULL << 39))) | 0xffff000000000000
)) + (npa))
;
2349 if (pd == NULL((void *)0))
2350 panic("%s: can't locate PD page @ pa=0x%llx", __func__,
2351 (uint64_t)npa);
2352
2353 /* npa = physaddr of PT page */
2354 npa = pd[l2idx] & PMAP_PA_MASK~((paddr_t)((1 << 12) - 1));
2355
2356 /* Valid PDE for the 2MB region containing va? */
2357 if (!npa) {
2358 /* No valid PDE - allocate PT page and set PDE */
2359
2360 ptp = pool_get(&pmap_pdp_pool, PR_WAITOK0x0001 | PR_ZERO0x0008);
2361
2362 if (!pmap_extract(pmap, (vaddr_t)ptp, &npa))
2363 panic("%s: can't locate PT page", __func__);
2364
2365 pd[l2idx] = (npa | PG_RW0x0000000000000002UL | PG_V0x0000000000000001UL);
2366
2367 DPRINTF("%s: allocated new PT page at phys 0x%llx, "
2368 "setting PDE[%lld] = 0x%llx\n", __func__,
2369 (uint64_t)npa, l2idx, pd[l2idx]);
2370 }
2371
2372 pd = (pd_entry_t *)PMAP_DIRECT_MAP(npa)((vaddr_t)(((((511 - 4) * (1ULL << 39))) | 0xffff000000000000
)) + (npa))
;
2373 if (pd == NULL((void *)0))
2374 panic("%s: can't locate PT page @ pa=0x%llx", __func__,
2375 (uint64_t)npa);
2376
2377 DPRINTF("%s: setting PTE, PT page @ phys 0x%llx virt 0x%llx prot "
2378 "0x%llx was 0x%llx\n", __func__, (uint64_t)npa, (uint64_t)pd,
2379 (uint64_t)prot, (uint64_t)pd[l1idx]);
2380
2381 pd[l1idx] = pa | protection_codes[prot] | PG_V0x0000000000000001UL | PG_W0x0000000000000200UL;
2382
2383 /*
2384 * Look up the corresponding U+K entry. If we're installing the
2385 * same PA into the U-K map then set the PG_G bit on both and copy
2386 * the cache-control bits from the U+K entry to the U-K entry.
2387 */
2388 level = pmap_find_pte_direct(pmap, va, &ptes, &offs);
2389 if (__predict_true(level == 0 && pmap_valid_entry(ptes[offs]))__builtin_expect(((level == 0 && ((ptes[offs]) & 0x0000000000000001UL
)) != 0), 1)
) {
2390 if (((pd[l1idx] ^ ptes[offs]) & PG_FRAME0x000ffffffffff000UL) == 0) {
2391 pd[l1idx] |= PG_G0x0000000000000100UL | (ptes[offs] & (PG_N0x0000000000000010UL | PG_WT0x0000000000000008UL));
2392 ptes[offs] |= PG_G0x0000000000000100UL;
2393 } else {
2394 DPRINTF("%s: special diffing mapping at %llx\n",
2395 __func__, (long long)va);
2396 }
2397 } else
2398 DPRINTF("%s: no U+K mapping for special mapping?\n", __func__);
2399
2400 DPRINTF("%s: setting PTE[%lld] = 0x%llx\n", __func__, l1idx, pd[l1idx]);
2401}
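
A worked example (not part of pmap.c) of the l4idx/l3idx/l2idx/l1idx decomposition used above and again in the EPT routines below, for a hypothetical kernel VA:

	/*
	 * va = 0xffff800000001000 (hypothetical)
	 * l4idx = (va & L4_MASK) >> 39 = 256	(PML4 slot)
	 * l3idx = (va & L3_MASK) >> 30 = 0	(PDPT slot)
	 * l2idx = (va & L2_MASK) >> 21 = 0	(PD slot)
	 * l1idx = (va & L1_MASK) >> 12 = 1	(PT slot)
	 */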
2402
2403void
2404pmap_remove_ept(struct pmap *pmap, vaddr_t sgpa, vaddr_t egpa)
2405{
2406 vaddr_t v;
2407#if NVMM1 > 0
2408 struct vmx_invept_descriptor vid;
2409#endif /* NVMM > 0 */
2410
2411 DPRINTF("%s: sgpa=0x%llx egpa=0x%llx\n", __func__, (uint64_t)sgpa,
2412 (uint64_t)egpa);
2413 for (v = sgpa; v < egpa + PAGE_SIZE(1 << 12); v += PAGE_SIZE(1 << 12))
2414 pmap_do_remove_ept(pmap, v);
2415
2416#if NVMM1 > 0
2417 if (pmap->eptp != 0) {
2418 memset(&vid, 0, sizeof(vid))__builtin_memset((&vid), (0), (sizeof(vid)));
2419 vid.vid_eptp = pmap->eptp;
2420 DPRINTF("%s: flushing EPT TLB for EPTP 0x%llx\n", __func__,
2421 vid.vid_eptp);
2422 invept(IA32_VMX_INVEPT_SINGLE_CTX0x1, &vid);
2423 }
2424#endif /* NVMM > 0 */
2425}
2426
2427void
2428pmap_do_remove_ept(struct pmap *pmap, paddr_t gpa)
2429{
2430 uint64_t l4idx, l3idx, l2idx, l1idx;
2431 struct vm_page *pg3, *pg2, *pg1;
2432 paddr_t npa3, npa2, npa1;
2433 pd_entry_t *pd4, *pd3, *pd2, *pd1;
2434 pd_entry_t *pptes;
2435
2436 l4idx = (gpa & L4_MASK0x0000ff8000000000UL) >> L4_SHIFT39; /* PML4E idx */
2437 l3idx = (gpa & L3_MASK0x0000007fc0000000UL) >> L3_SHIFT30; /* PDPTE idx */
2438 l2idx = (gpa & L2_MASK0x000000003fe00000UL) >> L2_SHIFT21; /* PDE idx */
2439 l1idx = (gpa & L1_MASK0x00000000001ff000UL) >> L1_SHIFT12; /* PTE idx */
2440
2441 /* Start at PML4 / top level */
2442 pd4 = (pd_entry_t *)pmap->pm_pdir;
2443
2444 if (pd4 == NULL((void *)0))
2445 return;
2446
2447 /* npa3 = physaddr of PDPT */
2448 npa3 = pd4[l4idx] & PMAP_PA_MASK~((paddr_t)((1 << 12) - 1));
2449 if (!npa3)
2450 return;
2451 pd3 = (pd_entry_t *)PMAP_DIRECT_MAP(npa3)((vaddr_t)(((((511 - 4) * (1ULL << 39))) | 0xffff000000000000
)) + (npa3))
;
2452 pg3 = PHYS_TO_VM_PAGE(npa3);
2453
2454 /* npa2 = physaddr of PD page */
2455 npa2 = pd3[l3idx] & PMAP_PA_MASK~((paddr_t)((1 << 12) - 1));
2456 if (!npa2)
2457 return;
2458 pd2 = (pd_entry_t *)PMAP_DIRECT_MAP(npa2)((vaddr_t)(((((511 - 4) * (1ULL << 39))) | 0xffff000000000000
)) + (npa2))
;
2459 pg2 = PHYS_TO_VM_PAGE(npa2);
2460
2461 /* npa1 = physaddr of PT page */
2462 npa1 = pd2[l2idx] & PMAP_PA_MASK~((paddr_t)((1 << 12) - 1));
2463 if (!npa1)
2464 return;
2465 pd1 = (pd_entry_t *)PMAP_DIRECT_MAP(npa1)((vaddr_t)(((((511 - 4) * (1ULL << 39))) | 0xffff000000000000
)) + (npa1))
;
2466 pg1 = PHYS_TO_VM_PAGE(npa1);
2467
2468 if (pd1[l1idx] == 0)
2469 return;
2470
2471 pd1[l1idx] = 0;
2472 pg1->wire_count--;
2473 pmap->pm_stats.resident_count--;
2474
2475 if (pg1->wire_count > 1)
2476 return;
2477
2478 pg1->wire_count = 0;
2479 pptes = (pd_entry_t *)PMAP_DIRECT_MAP(npa2)((vaddr_t)(((((511 - 4) * (1ULL << 39))) | 0xffff000000000000
)) + (npa2))
;
2480 pptes[l2idx] = 0;
2481 uvm_pagefree(pg1);
2482 pmap->pm_stats.resident_count--;
2483
2484 pg2->wire_count--;
2485 if (pg2->wire_count > 1)
2486 return;
2487
2488 pg2->wire_count = 0;
2489 pptes = (pd_entry_t *)PMAP_DIRECT_MAP(npa3)((vaddr_t)(((((511 - 4) * (1ULL << 39))) | 0xffff000000000000
)) + (npa3))
;
2490 pptes[l3idx] = 0;
2491 uvm_pagefree(pg2);
2492 pmap->pm_stats.resident_count--;
2493
2494 pg3->wire_count--;
2495 if (pg3->wire_count > 1)
2496 return;
2497
2498 pg3->wire_count = 0;
2499 pptes = pd4;
2500 pptes[l4idx] = 0;
2501 uvm_pagefree(pg3);
2502 pmap->pm_stats.resident_count--;
2503}
2504
2505int
2506pmap_enter_ept(struct pmap *pmap, paddr_t gpa, paddr_t hpa, vm_prot_t prot)
2507{
2508 uint64_t l4idx, l3idx, l2idx, l1idx;
2509 pd_entry_t *pd, npte;
2510 struct vm_page *ptp, *pptp;
2511 paddr_t npa;
2512 struct uvm_object *obj;
2513
2514 if (gpa > MAXDSIZ((paddr_t)128*1024*1024*1024))
2515 return ENOMEM12;
2516
2517 l4idx = (gpa & L4_MASK0x0000ff8000000000UL) >> L4_SHIFT39; /* PML4E idx */
2518 l3idx = (gpa & L3_MASK0x0000007fc0000000UL) >> L3_SHIFT30; /* PDPTE idx */
2519 l2idx = (gpa & L2_MASK0x000000003fe00000UL) >> L2_SHIFT21; /* PDE idx */
2520 l1idx = (gpa & L1_MASK0x00000000001ff000UL) >> L1_SHIFT12; /* PTE idx */
2521
2522 /* Start at PML4 / top level */
2523 pd = (pd_entry_t *)pmap->pm_pdir;
2524
2525 if (pd == NULL((void *)0))
2526 return ENOMEM12;
2527
2528 /* npa = physaddr of PDPT */
2529 npa = pd[l4idx] & PMAP_PA_MASK~((paddr_t)((1 << 12) - 1));
2530
2531 /* Valid PML4e for the 512GB region containing gpa? */
2532 if (!npa) {
2533 /* No valid PML4e - allocate PDPT page and set PML4e */
2534 obj = &pmap->pm_obj[2]; /* PML4 UVM object */
2535 ptp = uvm_pagealloc(obj, ptp_va2o(gpa, 3)((((((gpa) & ~0xffff000000000000)) & ptp_masks[((3)+1
)-1]) >> ptp_shifts[((3)+1)-1]) * (1 << 12))
, NULL((void *)0),
2536 UVM_PGA_USERESERVE0x0001|UVM_PGA_ZERO0x0002);
2537
2538 if (ptp == NULL((void *)0))
2539 return ENOMEM12;
2540
2541 /*
2542 * New PDPT page - we are setting the first entry, so set
2543 * the wired count to 1
2544 */
2545 ptp->wire_count = 1;
2546
2547 /* Calculate phys address of this new PDPT page */
2548 npa = VM_PAGE_TO_PHYS(ptp)((ptp)->phys_addr);
2549
2550 /*
2551 * Higher levels get full perms; specific permissions are
2552 * entered at the lowest level.
2553 */
2554 pd[l4idx] = (npa | EPT_R(1ULL << 0) | EPT_W(1ULL << 1) | EPT_X(1ULL << 2));
2555
2556 pmap->pm_stats.resident_count++;
2557
2558 pptp = ptp;
2559 } else {
2560 /* Already allocated PML4e */
2561 pptp = PHYS_TO_VM_PAGE(npa);
2562 }
2563
2564 pd = (pd_entry_t *)PMAP_DIRECT_MAP(npa)((vaddr_t)(((((511 - 4) * (1ULL << 39))) | 0xffff000000000000
)) + (npa))
;
2565 if (pd == NULL((void *)0))
2566 panic("%s: can't locate PDPT @ pa=0x%llx", __func__,
2567 (uint64_t)npa);
2568
2569 /* npa = physaddr of PD page */
2570 npa = pd[l3idx] & PMAP_PA_MASK~((paddr_t)((1 << 12) - 1));
2571
2572 /* Valid PDPTe for the 1GB region containing gpa? */
2573 if (!npa) {
2574 /* No valid PDPTe - allocate PD page and set PDPTe */
2575 obj = &pmap->pm_obj[1]; /* PDPT UVM object */
2576 ptp = uvm_pagealloc(obj, ptp_va2o(gpa, 2)((((((gpa) & ~0xffff000000000000)) & ptp_masks[((2)+1
)-1]) >> ptp_shifts[((2)+1)-1]) * (1 << 12))
, NULL((void *)0),
2577 UVM_PGA_USERESERVE0x0001|UVM_PGA_ZERO0x0002);
2578
2579 if (ptp == NULL((void *)0))
2580 return ENOMEM12;
2581
2582 /*
2583 * New PD page - we are setting the first entry, so set
2584 * the wired count to 1
2585 */
2586 ptp->wire_count = 1;
2587 pptp->wire_count++;
2588
2589 npa = VM_PAGE_TO_PHYS(ptp)((ptp)->phys_addr);
2590
2591 /*
2592 * Higher levels get full perms; specific permissions are
2593 * entered at the lowest level.
2594 */
2595 pd[l3idx] = (npa | EPT_R(1ULL << 0) | EPT_W(1ULL << 1) | EPT_X(1ULL << 2));
2596
2597 pmap->pm_stats.resident_count++;
2598
2599 pptp = ptp;
2600 } else {
2601 /* Already allocated PDPTe */
2602 pptp = PHYS_TO_VM_PAGE(npa);
2603 }
2604
2605 pd = (pd_entry_t *)PMAP_DIRECT_MAP(npa)((vaddr_t)(((((511 - 4) * (1ULL << 39))) | 0xffff000000000000
)) + (npa))
;
2606 if (pd == NULL((void *)0))
2607 panic("%s: can't locate PD page @ pa=0x%llx", __func__,
2608 (uint64_t)npa);
2609
2610 /* npa = physaddr of PT page */
2611 npa = pd[l2idx] & PMAP_PA_MASK~((paddr_t)((1 << 12) - 1));
2612
2613 /* Valid PDE for the 2MB region containing gpa? */
2614 if (!npa) {
2615 /* No valid PDE - allocate PT page and set PDE */
2616 obj = &pmap->pm_obj[0]; /* PDE UVM object */
2617 ptp = uvm_pagealloc(obj, ptp_va2o(gpa, 1)((((((gpa) & ~0xffff000000000000)) & ptp_masks[((1)+1
)-1]) >> ptp_shifts[((1)+1)-1]) * (1 << 12))
, NULL((void *)0),
2618 UVM_PGA_USERESERVE0x0001|UVM_PGA_ZERO0x0002);
2619
2620 if (ptp == NULL((void *)0))
2621 return ENOMEM12;
2622
2623 pptp->wire_count++;
2624
2625 npa = VM_PAGE_TO_PHYS(ptp)((ptp)->phys_addr);
2626
2627 /*
2628 * Higher levels get full perms; specific permissions are
2629 * entered at the lowest level.
2630 */
2631 pd[l2idx] = (npa | EPT_R(1ULL << 0) | EPT_W(1ULL << 1) | EPT_X(1ULL << 2));
2632
2633 pmap->pm_stats.resident_count++;
2634
2635 } else {
2636 /* Find final ptp */
2637 ptp = PHYS_TO_VM_PAGE(npa);
2638 if (ptp == NULL((void *)0))
2639 panic("%s: ptp page vanished?", __func__);
2640 }
2641
2642 pd = (pd_entry_t *)PMAP_DIRECT_MAP(npa)((vaddr_t)(((((511 - 4) * (1ULL << 39))) | 0xffff000000000000
)) + (npa))
;
2643 if (pd == NULL((void *)0))
2644 panic("%s: can't locate PT page @ pa=0x%llx", __func__,
2645 (uint64_t)npa);
2646
2647 npte = hpa | EPT_WB(6ULL << 3);
2648 if (prot & PROT_READ0x01)
2649 npte |= EPT_R(1ULL << 0);
2650 if (prot & PROT_WRITE0x02)
2651 npte |= EPT_W(1ULL << 1);
2652 if (prot & PROT_EXEC0x04)
2653 npte |= EPT_X(1ULL << 2);
2654
2655 if (pd[l1idx] == 0) {
2656 ptp->wire_count++;
2657 pmap->pm_stats.resident_count++;
2658 } else {
2659 /* XXX flush ept */
2660 }
2661
2662 pd[l1idx] = npte;
2663
2664 return 0;
2665}
2666
2667/*
2668 * pmap_enter: enter a mapping into a pmap
2669 *
2670 * => must be done "now" ... no lazy-evaluation
2671 */
2672
2673int
2674pmap_enter(struct pmap *pmap, vaddr_t va, paddr_t pa, vm_prot_t prot, int flags)
2675{
2676 pt_entry_t opte, npte;
2677 struct vm_page *ptp, *pg = NULL((void *)0);
2678 struct pv_entry *pve, *opve = NULL((void *)0);
2679 int ptpdelta, wireddelta, resdelta;
2680 int wired = (flags & PMAP_WIRED0x00000010) != 0;
2681 int nocache = (pa & PMAP_NOCACHE0x1) != 0;
2682 int wc = (pa & PMAP_WC0x2) != 0;
2683 int error, shootself;
2684 paddr_t scr3;
2685
2686 if (pmap->pm_type == PMAP_TYPE_EPT2)
2687 return pmap_enter_ept(pmap, va, pa, prot);
2688
2689 KASSERT(!(wc && nocache))((!(wc && nocache)) ? (void)0 : __assert("diagnostic "
, "/usr/src/sys/arch/amd64/amd64/pmap.c", 2689, "!(wc && nocache)"
))
;
2690 pa &= PMAP_PA_MASK~((paddr_t)((1 << 12) - 1));
2691
2692#ifdef DIAGNOSTIC1
2693 if (va == (vaddr_t) PDP_BASE((pd_entry_t *)((char *)((pd_entry_t *)((char *)((pd_entry_t *
)((char *)((pt_entry_t *) (255 * (1ULL << 39))) + 255 *
(1ULL << 30))) + 255 * (1ULL << 21))) + 255 * (1ULL
<< 12)))
)
2694 panic("%s: trying to map over PDP!", __func__);
2695
2696 /* sanity check: kernel PTPs should already have been pre-allocated */
2697 if (va >= VM_MIN_KERNEL_ADDRESS0xffff800000000000 &&
2698 !pmap_valid_entry(pmap->pm_pdir[pl_i(va, PTP_LEVELS)])((pmap->pm_pdir[(((((va) & ~0xffff000000000000)) &
ptp_masks[(4)-1]) >> ptp_shifts[(4)-1])]) & 0x0000000000000001UL
)
)
2699 panic("%s: missing kernel PTP for va %lx!", __func__, va);
2700
2701#endif
2702
2703 pve = pool_get(&pmap_pv_pool, PR_NOWAIT0x0002);
2704 if (pve == NULL((void *)0)) {
2705 if (flags & PMAP_CANFAIL0x00000020) {
2706 error = ENOMEM12;
2707 goto out;
2708 }
2709 panic("%s: no pv entries available", __func__);
2710 }
2711
2712 /*
2713 * map in ptes and get a pointer to our PTP (unless we are the kernel)
2714 */
2715
2716 scr3 = pmap_map_ptes(pmap);
2717 shootself = (scr3 == 0);
2718 if (pmap == pmap_kernel()(&kernel_pmap_store)) {
2719 ptp = NULL((void *)0);
2720 } else {
2721 ptp = pmap_get_ptp(pmap, va);
2722 if (ptp == NULL((void *)0)) {
2723 if (flags & PMAP_CANFAIL0x00000020) {
2724 pmap_unmap_ptes(pmap, scr3);
2725 error = ENOMEM12;
2726 goto out;
2727 }
2728 panic("%s: get ptp failed", __func__);
2729 }
2730 }
2731 opte = PTE_BASE((pt_entry_t *) (255 * (1ULL << 39)))[pl1_i(va)(((((va) & ~0xffff000000000000)) & (((0x0000ff8000000000UL
|0x0000007fc0000000UL)|0x000000003fe00000UL)|0x00000000001ff000UL
)) >> 12)
]; /* old PTE */
2732
2733 /*
2734 * is there currently a valid mapping at our VA?
2735 */
2736
2737 if (pmap_valid_entry(opte)((opte) & 0x0000000000000001UL)) {
2738 /*
2739 * first, calculate pm_stats updates. resident count will not
2740 * change since we are replacing/changing a valid mapping.
2741 * wired count might change...
2742 */
2743
2744 resdelta = 0;
2745 if (wired && (opte & PG_W0x0000000000000200UL) == 0)
2746 wireddelta = 1;
2747 else if (!wired && (opte & PG_W0x0000000000000200UL) != 0)
2748 wireddelta = -1;
2749 else
2750 wireddelta = 0;
2751 ptpdelta = 0;
2752
2753 /*
2754 * is the currently mapped PA the same as the one we
2755 * want to map?
2756 */
2757
2758 if ((opte & PG_FRAME0x000ffffffffff000UL) == pa) {
2759
2760 /* if this is on the PVLIST, sync R/M bit */
2761 if (opte & PG_PVLIST0x0000000000000400UL) {
2762 pg = PHYS_TO_VM_PAGE(pa);
2763#ifdef DIAGNOSTIC1
2764 if (pg == NULL((void *)0))
2765 panic("%s: same pa, PG_PVLIST "
2766 "mapping with unmanaged page: "
2767 "va 0x%lx, opte 0x%llx, pa 0x%lx",
2768 __func__, va, opte, pa);
2769#endif
2770 pmap_sync_flags_pte(pg, opte);
2771 } else {
2772#ifdef DIAGNOSTIC1
2773 if (PHYS_TO_VM_PAGE(pa) != NULL((void *)0))
2774 panic("%s: same pa, no PG_PVLIST "
2775 "mapping with managed page: "
2776 "va 0x%lx, opte 0x%llx, pa 0x%lx",
2777 __func__, va, opte, pa);
2778#endif
2779 }
2780 goto enter_now;
2781 }
2782
2783 /*
2784 * changing PAs: we must remove the old one first
2785 */
2786
2787 /*
2788 * if current mapping is on a pvlist,
2789 * remove it (sync R/M bits)
2790 */
2791
2792 if (opte & PG_PVLIST0x0000000000000400UL) {
2793 pg = PHYS_TO_VM_PAGE(opte & PG_FRAME0x000ffffffffff000UL);
2794#ifdef DIAGNOSTIC1
2795 if (pg == NULL((void *)0))
2796 panic("%s: PG_PVLIST mapping with unmanaged "
2797 "page: va 0x%lx, opte 0x%llx, pa 0x%lx",
2798 __func__, va, opte, pa);
2799#endif
2800 pmap_sync_flags_pte(pg, opte);
2801 opve = pmap_remove_pv(pg, pmap, va);
2802 pg = NULL((void *)0); /* This is not the page we are looking for */
2803 }
2804 } else { /* opte not valid */
2805 resdelta = 1;
2806 if (wired)
2807 wireddelta = 1;
2808 else
2809 wireddelta = 0;
2810 if (ptp != NULL((void *)0))
2811 ptpdelta = 1;
2812 else
2813 ptpdelta = 0;
2814 }
2815
2816 /*
2817 * pve is either NULL or points to a now-free pv_entry structure
2818 * (the latter case is if we called pmap_remove_pv above).
2819 *
2820 * if this entry is to be on a pvlist, enter it now.
2821 */
2822
2823 if (pmap_initialized)
2824 pg = PHYS_TO_VM_PAGE(pa);
2825
2826 if (pg != NULL((void *)0)) {
2827 pmap_enter_pv(pg, pve, pmap, va, ptp);
2828 pve = NULL((void *)0);
2829 }
2830
2831enter_now:
2832 /*
2833 * at this point pg is !NULL if we want the PG_PVLIST bit set
2834 */
2835
2836 pmap->pm_stats.resident_count += resdelta;
2837 pmap->pm_stats.wired_count += wireddelta;
2838 if (ptp != NULL((void *)0))
2839 ptp->wire_count += ptpdelta;
2840
2841 KASSERT(pg == PHYS_TO_VM_PAGE(pa))((pg == PHYS_TO_VM_PAGE(pa)) ? (void)0 : __assert("diagnostic "
, "/usr/src/sys/arch/amd64/amd64/pmap.c", 2841, "pg == PHYS_TO_VM_PAGE(pa)"
))
;
2842
2843 npte = pa | protection_codes[prot] | PG_V0x0000000000000001UL;
2844 if (pg != NULL((void *)0)) {
2845 npte |= PG_PVLIST0x0000000000000400UL;
2846 /*
2847 * make sure that if the page is write combined all
2848 * instances of pmap_enter make it so.
2849 */
2850 if (pg->pg_flags & PG_PMAP_WC0x04000000) {
2851 KASSERT(nocache == 0)((nocache == 0) ? (void)0 : __assert("diagnostic ", "/usr/src/sys/arch/amd64/amd64/pmap.c"
, 2851, "nocache == 0"))
;
2852 wc = 1;
2853 }
2854 }
2855 if (wc)
2856 npte |= pmap_pg_wc;
2857 if (wired)
2858 npte |= PG_W0x0000000000000200UL;
2859 if (nocache)
2860 npte |= PG_N0x0000000000000010UL;
2861 if (va < VM_MAXUSER_ADDRESS0x00007f7fffffc000)
2862 npte |= ((flags & PMAP_EFI0x00000040) ? 0 : PG_u0x0000000000000004UL);
2863 else if (va < VM_MAX_ADDRESS0x00007fbfdfeff000)
2864 npte |= (PG_u0x0000000000000004UL | PG_RW0x0000000000000002UL); /* XXXCDC: no longer needed? */
2865 if (pmap == pmap_kernel()(&kernel_pmap_store))
2866 npte |= pg_g_kern;
2867
2868 /*
2869 * If the old entry wasn't valid, we can just update it and
2870 * go. If it was valid, and this isn't a read->write
2871 * transition, then we can safely just update it and flush
2872 * any old TLB entries.
2873 *
2874 * If it _was_ valid and this _is_ a read->write transition,
2875 * then this could be a CoW resolution and we need to make
2876 * sure no CPU can see the new writable mapping while another
2877 * still has the old mapping in its TLB, so insert a correct
2878 * but unwritable mapping, flush any old TLB entries, then
2879 * make it writable.
2880 */
2881 if (! pmap_valid_entry(opte)((opte) & 0x0000000000000001UL)) {
2882 PTE_BASE((pt_entry_t *) (255 * (1ULL << 39)))[pl1_i(va)(((((va) & ~0xffff000000000000)) & (((0x0000ff8000000000UL
|0x0000007fc0000000UL)|0x000000003fe00000UL)|0x00000000001ff000UL
)) >> 12)
] = npte;
2883 } else if ((opte | (npte ^ PG_RW0x0000000000000002UL)) & PG_RW0x0000000000000002UL) {
2884 /* previously writable or not making writable */
2885 PTE_BASE((pt_entry_t *) (255 * (1ULL << 39)))[pl1_i(va)(((((va) & ~0xffff000000000000)) & (((0x0000ff8000000000UL
|0x0000007fc0000000UL)|0x000000003fe00000UL)|0x00000000001ff000UL
)) >> 12)
] = npte;
2886 if (nocache && (opte & PG_N0x0000000000000010UL) == 0)
2887 wbinvd_on_all_cpus();
2888 pmap_tlb_shootpage(pmap, va, shootself);
2889 } else {
2890 PTE_BASE((pt_entry_t *) (255 * (1ULL << 39)))[pl1_i(va)(((((va) & ~0xffff000000000000)) & (((0x0000ff8000000000UL
|0x0000007fc0000000UL)|0x000000003fe00000UL)|0x00000000001ff000UL
)) >> 12)
] = npte ^ PG_RW0x0000000000000002UL;
2891 if (nocache && (opte & PG_N0x0000000000000010UL) == 0) /* XXX impossible? */
2892 wbinvd_on_all_cpus();
2893 pmap_tlb_shootpage(pmap, va, shootself);
2894 pmap_tlb_shootwait();
2895 PTE_BASE((pt_entry_t *) (255 * (1ULL << 39)))[pl1_i(va)(((((va) & ~0xffff000000000000)) & (((0x0000ff8000000000UL
|0x0000007fc0000000UL)|0x000000003fe00000UL)|0x00000000001ff000UL
)) >> 12)
] = npte;
2896 }
2897
2898 pmap_unmap_ptes(pmap, scr3);
2899 pmap_tlb_shootwait();
2900
2901 error = 0;
2902
2903out:
2904 if (pve != NULL((void *)0))
2905 pool_put(&pmap_pv_pool, pve);
2906 if (opve != NULL((void *)0))
2907 pool_put(&pmap_pv_pool, opve);
2908
2909 return error;
2910}
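
For illustration, a hedged sketch (not part of pmap.c) of a typical kernel caller of pmap_enter(), combining the PMAP_NOCACHE hint carried in the low bits of pa with the PMAP_WIRED and PMAP_CANFAIL flags; the wrapper name is hypothetical:

static int
example_map_device_page(vaddr_t va, paddr_t pa)
{
	/* wired, uncached mapping; returns ENOMEM instead of panicking */
	return pmap_enter(pmap_kernel(), va, pa | PMAP_NOCACHE,
	    PROT_READ | PROT_WRITE, PMAP_WIRED | PMAP_CANFAIL);
}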
2911
2912int
2913pmap_get_physpage(vaddr_t va, int level, paddr_t *paddrp)
2914{
2915 struct vm_page *ptp;
2916 struct pmap *kpm = pmap_kernel()(&kernel_pmap_store);
2917
2918 if (uvm.page_init_done == 0) {
2919 vaddr_t va;
2920
2921 /*
2922 * we're growing the kernel pmap early (from
2923 * uvm_pageboot_alloc()). this case must be
2924 * handled a little differently.
2925 */
2926
2927 va = pmap_steal_memory(PAGE_SIZE(1 << 12), NULL((void *)0), NULL((void *)0));
2928 *paddrp = PMAP_DIRECT_UNMAP(va)((paddr_t)(va) - (((((511 - 4) * (1ULL << 39))) | 0xffff000000000000
)))
;
2929 } else {
2930 ptp = uvm_pagealloc(&kpm->pm_obj[level - 1],
2931 ptp_va2o(va, level)((((((va) & ~0xffff000000000000)) & ptp_masks[((level
)+1)-1]) >> ptp_shifts[((level)+1)-1]) * (1 << 12
))
, NULL((void *)0),
2932 UVM_PGA_USERESERVE0x0001|UVM_PGA_ZERO0x0002);
2933 if (ptp == NULL((void *)0))
2934 panic("%s: out of memory", __func__);
2935 atomic_clearbits_intx86_atomic_clearbits_u32(&ptp->pg_flags, PG_BUSY0x00000001);
2936 ptp->wire_count = 1;
2937 *paddrp = VM_PAGE_TO_PHYS(ptp)((ptp)->phys_addr);
2938 }
2939 kpm->pm_stats.resident_count++;
2940 return 1;
2941}
2942
2943/*
2944 * Allocate the amount of specified ptps for a ptp level, and populate
2945 * all levels below accordingly, mapping virtual addresses starting at
2946 * kva.
2947 *
2948 * Used by pmap_growkernel.
2949 */
2950void
2951pmap_alloc_level(vaddr_t kva, int lvl, long *needed_ptps)
2952{
2953 unsigned long i;
2954 vaddr_t va;
2955 paddr_t pa;
2956 unsigned long index, endindex;
2957 int level;
2958 pd_entry_t *pdep;
2959
2960 for (level = lvl; level > 1; level--) {
2961 if (level == PTP_LEVELS4)
2962 pdep = pmap_kernel()(&kernel_pmap_store)->pm_pdir;
2963 else
2964 pdep = normal_pdes[level - 2];
2965 va = kva;
2966 index = pl_i(kva, level)(((((kva) & ~0xffff000000000000)) & ptp_masks[(level)
-1]) >> ptp_shifts[(level)-1])
;
2967 endindex = index + needed_ptps[level - 1];
2968 /*
2969 * XXX special case for first time call.
2970 */
2971 if (nkptp[level - 1] != 0)
2972 index++;
2973 else
2974 endindex--;
2975
2976 for (i = index; i <= endindex; i++) {
2977 pmap_get_physpage(va, level - 1, &pa);
2978 pdep[i] = pa | PG_RW0x0000000000000002UL | PG_V0x0000000000000001UL | pg_nx;
2979 nkptp[level - 1]++;
2980 va += nbpd[level - 1];
2981 }
2982 }
2983}
2984
2985/*
2986 * pmap_growkernel: increase usage of KVM space
2987 *
2988 * => we allocate new PTPs for the kernel and install them in all
2989 * the pmaps on the system.
2990 */
2991
2992static vaddr_t pmap_maxkvaddr = VM_MIN_KERNEL_ADDRESS0xffff800000000000;
2993
2994vaddr_t
2995pmap_growkernel(vaddr_t maxkvaddr)
2996{
2997 struct pmap *kpm = pmap_kernel()(&kernel_pmap_store), *pm;
2998 int s, i;
2999 unsigned newpdes;
3000 long needed_kptp[PTP_LEVELS4], target_nptp, old;
3001
3002 if (maxkvaddr <= pmap_maxkvaddr)
3003 return pmap_maxkvaddr;
3004
3005 maxkvaddr = x86_round_pdr(maxkvaddr)((((unsigned long)(maxkvaddr)) + ((1ULL << 21) - 1)) &
~((1ULL << 21) - 1))
;
3006 old = nkptp[PTP_LEVELS4 - 1];
3007 /*
3008 * This loop could be optimized more, but pmap_growkernel()
3009 * is called infrequently.
3010 */
3011 for (i = PTP_LEVELS4 - 1; i >= 1; i--) {
3012 target_nptp = pl_i(maxkvaddr, i + 1)(((((maxkvaddr) & ~0xffff000000000000)) & ptp_masks[(
i + 1)-1]) >> ptp_shifts[(i + 1)-1])
-
3013 pl_i(VM_MIN_KERNEL_ADDRESS, i + 1)(((((0xffff800000000000) & ~0xffff000000000000)) & ptp_masks
[(i + 1)-1]) >> ptp_shifts[(i + 1)-1])
;
3014 /*
3015 * XXX only need to check toplevel.
3016 */
3017 if (target_nptp > nkptpmax[i])
3018 panic("%s: out of KVA space", __func__);
3019 needed_kptp[i] = target_nptp - nkptp[i] + 1;
3020 }
3021
3022
3023 s = splhigh()splraise(0xd); /* to be safe */
3024 pmap_alloc_level(pmap_maxkvaddr, PTP_LEVELS4, needed_kptp);
3025
3026 /*
3027 * If the number of top level entries changed, update all
3028 * pmaps.
3029 */
3030 if (needed_kptp[PTP_LEVELS4 - 1] != 0) {
3031 newpdes = nkptp[PTP_LEVELS4 - 1] - old;
3032 mtx_enter(&pmaps_lock);
3033 LIST_FOREACH(pm, &pmaps, pm_list)for((pm) = ((&pmaps)->lh_first); (pm)!= ((void *)0); (
pm) = ((pm)->pm_list.le_next))
{
3034 memcpy(&pm->pm_pdir[PDIR_SLOT_KERN + old],__builtin_memcpy((&pm->pm_pdir[256 + old]), (&kpm->
pm_pdir[256 + old]), (newpdes * sizeof (pd_entry_t)))
3035 &kpm->pm_pdir[PDIR_SLOT_KERN + old],__builtin_memcpy((&pm->pm_pdir[256 + old]), (&kpm->
pm_pdir[256 + old]), (newpdes * sizeof (pd_entry_t)))
3036 newpdes * sizeof (pd_entry_t))__builtin_memcpy((&pm->pm_pdir[256 + old]), (&kpm->
pm_pdir[256 + old]), (newpdes * sizeof (pd_entry_t)))
;
3037 }
3038 mtx_leave(&pmaps_lock);
3039 }
3040 pmap_maxkvaddr = maxkvaddr;
3041 splx(s)spllower(s);
3042
3043 return maxkvaddr;
3044}
3045
3046vaddr_t
3047pmap_steal_memory(vsize_t size, vaddr_t *start, vaddr_t *end)
3048{
3049 int segno;
3050 u_int npg;
3051 vaddr_t va;
3052 paddr_t pa;
3053 struct vm_physseg *seg;
3054
3055 size = round_page(size)(((size) + ((1 << 12) - 1)) & ~((1 << 12) - 1
))
;
3056 npg = atop(size)((size) >> 12);
3057
3058 for (segno = 0, seg = vm_physmem; segno < vm_nphysseg; segno++, seg++) {
3059 if (seg->avail_end - seg->avail_start < npg)
3060 continue;
3061 /*
3062 * We can only steal at an ``unused'' segment boundary,
3063 * i.e. either at the start or at the end.
3064 */
3065 if (seg->avail_start == seg->start ||
3066 seg->avail_end == seg->end)
3067 break;
3068 }
3069 if (segno == vm_nphysseg) {
3070 panic("%s: out of memory", __func__);
3071 } else {
3072 if (seg->avail_start == seg->start) {
3073 pa = ptoa(seg->avail_start)((paddr_t)(seg->avail_start) << 12);
3074 seg->avail_start += npg;
3075 seg->start += npg;
3076 } else {
3077 pa = ptoa(seg->avail_end)((paddr_t)(seg->avail_end) << 12) - size;
3078 seg->avail_end -= npg;
3079 seg->end -= npg;
3080 }
3081 /*
3082 * If all the segment has been consumed now, remove it.
3083 * Note that the crash dump code still knows about it
3084 * and will dump it correctly.
3085 */
3086 if (seg->start == seg->end) {
3087 if (vm_nphysseg-- == 1)
3088 panic("%s: out of memory", __func__);
3089 while (segno < vm_nphysseg) {
3090 seg[0] = seg[1]; /* struct copy */
3091 seg++;
3092 segno++;
3093 }
3094 }
3095
3096 va = PMAP_DIRECT_MAP(pa)((vaddr_t)(((((511 - 4) * (1ULL << 39))) | 0xffff000000000000
)) + (pa))
;
3097 memset((void *)va, 0, size)__builtin_memset(((void *)va), (0), (size));
3098 }
3099
3100 if (start != NULL((void *)0))
3101 *start = virtual_avail;
3102 if (end != NULL((void *)0))
3103 *end = VM_MAX_KERNEL_ADDRESS0xffff800100000000;
3104
3105 return (va);
3106}
3107
3108void
3109pmap_virtual_space(vaddr_t *vstartp, vaddr_t *vendp)
3110{
3111 *vstartp = virtual_avail;
3112 *vendp = VM_MAX_KERNEL_ADDRESS0xffff800100000000;
3113}
3114
3115/*
3116 * pmap_convert
3117 *
3118 * Converts 'pmap' to the new 'mode'.
3119 *
3120 * Parameters:
3121 * pmap: the pmap to convert
3122 * mode: the new mode (see pmap.h, PMAP_TYPE_xxx)
3123 */
3124void
3125pmap_convert(struct pmap *pmap, int mode)
3126{
3127 pt_entry_t *pte;
3128
3129 pmap->pm_type = mode;
3130
3131 if (mode == PMAP_TYPE_EPT2) {
3132 /* Clear PML4 */
3133 pte = (pt_entry_t *)pmap->pm_pdir;
3134 memset(pte, 0, PAGE_SIZE)__builtin_memset((pte), (0), ((1 << 12)));
3135
3136 /* Give back the meltdown pdir */
3137 if (pmap->pm_pdir_intel != NULL((void *)0)) {
3138 pool_put(&pmap_pdp_pool, pmap->pm_pdir_intel);
3139 pmap->pm_pdir_intel = NULL((void *)0);
3140 }
3141 }
3142}
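
For illustration, a hedged sketch (not part of pmap.c) of switching a guest pmap to EPT mode before populating it; once pm_type is PMAP_TYPE_EPT, pmap_enter() above dispatches to pmap_enter_ept(). The wrapper name is hypothetical:

static void
example_make_guest_pmap_ept(struct pmap *pm)
{
	pmap_convert(pm, PMAP_TYPE_EPT);
	/* later pmap_enter(pm, gpa, hpa, prot, 0) calls build EPT entries */
}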
3143
3144#ifdef MULTIPROCESSOR1
3145/*
3146 * Locking for tlb shootdown.
3147 *
3148 * We lock by setting tlb_shoot_wait to the number of cpus that will
3149 * receive our tlb shootdown. After sending the IPIs, we don't need to
3150 * worry about locking order or interrupts spinning for the lock because
3151 * the call that grabs the "lock" isn't the one that releases it. And
3152 * there is nothing that can block the IPI that releases the lock.
3153 *
3154 * The functions are organized so that we first count the number of
3155 * cpus we need to send the IPI to, then we grab the counter, then
3156 * we send the IPIs, then we finally do our own shootdown.
3157 *
3158 * We do our own shootdown last so that it runs in parallel with the
3159 * other cpus and shortens the overall spin time.
3160 *
3161 * Notice that we depend on failures to send IPIs only being able to
3162 * happen during boot. If they happen later, the above assumption
3163 * doesn't hold, since we can end up in situations where no one will
3164 * release the lock if we get an interrupt at a bad moment.
3165 */
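The editorial outline below condenses that sequence; the real implementations follow and are authoritative.

/*
 * Editorial sketch of the shape shared by the pmap_tlb_shoot* functions:
 *
 *	wait = 0; mask = 0;
 *	CPU_INFO_FOREACH(cii, ci)		   1. count the target cpus
 *		if (ci needs this shootdown) { mask |= bit(ci); wait++; }
 *	s = pmap_start_tlb_shoot(wait, __func__);  2. grab the counter
 *	... store the shootdown arguments ...
 *	CPU_INFO_FOREACH(cii, ci)		   3. send the IPIs
 *		if (mask & bit(ci)) x86_fast_ipi(ci, ...);
 *	splx(s);
 *	... do our own shootdown last ...	   4. run in parallel
 */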
3166#ifdef MP_LOCKDEBUG
3167#include <ddb/db_output.h>
3168extern int __mp_lock_spinout;
3169#endif
3170
3171volatile long tlb_shoot_wait __attribute__((section(".kudata")));
3172
3173volatile vaddr_t tlb_shoot_addr1 __attribute__((section(".kudata")));
3174volatile vaddr_t tlb_shoot_addr2 __attribute__((section(".kudata")));
3175volatile int tlb_shoot_first_pcid __attribute__((section(".kudata")));
3176
3177
3178/* Obtain the "lock" for TLB shooting */
3179static inline int
3180pmap_start_tlb_shoot(long wait, const char *func)
3181{
3182 int s = splvm();
3183
3184 while (atomic_cas_ulong(&tlb_shoot_wait, 0, wait) != 0) {
3185#ifdef MP_LOCKDEBUG
3186 int nticks = __mp_lock_spinout;
3187#endif
3188 while (tlb_shoot_wait != 0) {
3189 CPU_BUSY_CYCLE();
3190#ifdef MP_LOCKDEBUG
3191 if (--nticks <= 0) {
3192 db_printf("%s: spun out", func);
3193 db_enter();
3194 nticks = __mp_lock_spinout;
3195 }
3196#endif
3197 }
3198 }
3199
3200 return s;
3201}
3202
3203void
3204pmap_tlb_shootpage(struct pmap *pm, vaddr_t va, int shootself)
3205{
3206 struct cpu_info *ci, *self = curcpu();
3207 CPU_INFO_ITERATOR cii;
3208 long wait = 0;
3209 u_int64_t mask = 0;
3210 int is_kva = va >= VM_MIN_KERNEL_ADDRESS;
3211
3212 CPU_INFO_FOREACH(cii, ci) {
3213 if (ci == self || !(ci->ci_flags & CPUF_RUNNING))
3214 continue;
3215 if (!is_kva && !pmap_is_active(pm, ci))
3216 continue;
3217 mask |= (1ULL << ci->ci_cpuid);
3218 wait++;
3219 }
3220
3221 if (wait > 0) {
3222 int s = pmap_start_tlb_shoot(wait, __func__);
3223
3224 tlb_shoot_first_pcid = is_kva ? PCID_KERN : PCID_PROC;
3225 tlb_shoot_addr1 = va;
3226 CPU_INFO_FOREACH(cii, ci) {
3227 if ((mask & (1ULL << ci->ci_cpuid)) == 0)
3228 continue;
3229 if (x86_fast_ipi(ci, LAPIC_IPI_INVLPG) != 0)
3230 panic("%s: ipi failed", __func__);
3231 }
3232 splx(s);
3233 }
3234
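/*
 * Editorial note: the local shootdown happens last, as described in the
 * locking comment above.  A kernel VA is invalidated under both
 * PCID_PROC and PCID_KERN; a user VA under PCID_PROC and, when the
 * Meltdown workaround is active, PCID_PROC_INTEL as well.
 */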
3235 if (!pmap_use_pcid) {
3236 if (shootself)
3237 pmap_update_pg(va);
3238 } else if (is_kva) {
3239 invpcid(INVPCID_ADDR, PCID_PROC, va);
3240 invpcid(INVPCID_ADDR, PCID_KERN, va);
3241 } else if (shootself) {
3242 invpcid(INVPCID_ADDR, PCID_PROC, va);
3243 if (cpu_meltdown)
3244 invpcid(INVPCID_ADDR, PCID_PROC_INTEL, va);
3245 }
3246}
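For orientation, a hedged sketch of how callers elsewhere in pmap.c pair a shootdown request with the wait primitive; the surrounding steps are simplified.

/*
 * Editorial sketch (call site simplified):
 *
 *	... change or remove the PTE ...
 *	pmap_tlb_shootpage(pmap, va, shootself);
 *	pmap_tlb_shootwait();	spin until the remote cpus have flushed
 */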
3247
3248void
3249pmap_tlb_shootrange(struct pmap *pm, vaddr_t sva, vaddr_t eva, int shootself)
3250{
3251 struct cpu_info *ci, *self = curcpu();
3252 CPU_INFO_ITERATOR cii;
3253 long wait = 0;
3254 u_int64_t mask = 0;
3255 int is_kva = sva >= VM_MIN_KERNEL_ADDRESS;
3256 vaddr_t va;
3257
3258 CPU_INFO_FOREACH(cii, ci) {
3259 if (ci == self || !(ci->ci_flags & CPUF_RUNNING))
3260 continue;
3261 if (!is_kva && !pmap_is_active(pm, ci))
3262 continue;
3263 mask |= (1ULL << ci->ci_cpuid);
3264 wait++;
3265 }
3266
3267 if (wait > 0) {
3268 int s = pmap_start_tlb_shoot(wait, __func__);
3269
3270 tlb_shoot_first_pcid = is_kva ? PCID_KERN : PCID_PROC;
3271 tlb_shoot_addr1 = sva;
3272 tlb_shoot_addr2 = eva;
3273 CPU_INFO_FOREACH(cii, ci) {
3274 if ((mask & (1ULL << ci->ci_cpuid)) == 0)
3275 continue;
3276 if (x86_fast_ipi(ci, LAPIC_IPI_INVLRANGE) != 0)
3277 panic("%s: ipi failed", __func__);
3278 }
3279 splx(s);
3280 }
3281
3282 if (!pmap_use_pcid) {
3283 if (shootself) {
3284 for (va = sva; va < eva; va += PAGE_SIZE)
3285 pmap_update_pg(va);
3286 }
3287 } else if (is_kva) {
3288 for (va = sva; va < eva; va += PAGE_SIZE) {
3289 invpcid(INVPCID_ADDR, PCID_PROC, va);
3290 invpcid(INVPCID_ADDR, PCID_KERN, va);
3291 }
3292 } else if (shootself) {
3293 if (cpu_meltdown) {
3294 for (va = sva; va < eva; va += PAGE_SIZE) {
3295 invpcid(INVPCID_ADDR, PCID_PROC, va);
3296 invpcid(INVPCID_ADDR, PCID_PROC_INTEL, va);
3297 }
3298 } else {
3299 for (va = sva; va < eva; va += PAGE_SIZE)
3300 invpcid(INVPCID_ADDR, PCID_PROC, va);
3301 }
3302 }
3303}
3304
3305void
3306pmap_tlb_shoottlb(struct pmap *pm, int shootself)
3307{
3308 struct cpu_info *ci, *self = curcpu();
3309 CPU_INFO_ITERATOR cii;
3310 long wait = 0;
3311 u_int64_t mask = 0;
3312
3313 KASSERT(pm != pmap_kernel());
3314
3315 CPU_INFO_FOREACH(cii, ci) {
3316 if (ci == self || !pmap_is_active(pm, ci) ||
3317 !(ci->ci_flags & CPUF_RUNNING))
3318 continue;
3319 mask |= (1ULL << ci->ci_cpuid);
3320 wait++;
3321 }
3322
3323 if (wait) {
3324 int s = pmap_start_tlb_shoot(wait, __func__);
3325
3326 CPU_INFO_FOREACH(cii, ci) {
3327 if ((mask & (1ULL << ci->ci_cpuid)) == 0)
3328 continue;
3329 if (x86_fast_ipi(ci, LAPIC_IPI_INVLTLB) != 0)
3330 panic("%s: ipi failed", __func__);
3331 }
3332 splx(s);
3333 }
3334
3335 if (shootself) {
3336 if (!pmap_use_pcid)
3337 tlbflush();
3338 else {
3339 invpcid(INVPCID_PCID, PCID_PROC, 0);
3340 if (cpu_meltdown)
3341 invpcid(INVPCID_PCID, PCID_PROC_INTEL, 0);
3342 }
3343 }
3344}
3345
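/*
 * Editorial note: pmap_tlb_shootwait() spins until tlb_shoot_wait drops
 * back to zero.  The sender never releases it itself; the cpus handling
 * the shootdown IPIs do, as explained in the locking comment above.
 */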
3346void
3347pmap_tlb_shootwait(void)
3348{
3349#ifdef MP_LOCKDEBUG
3350 int nticks = __mp_lock_spinout;
3351#endif
3352 while (tlb_shoot_wait != 0) {
3353 CPU_BUSY_CYCLE();
3354#ifdef MP_LOCKDEBUG
3355 if (--nticks <= 0) {
3356 db_printf("%s: spun out", __func__);
3357 db_enter();
3358 nticks = __mp_lock_spinout;
3359 }
3360#endif
3361 }
3362}
3363
3364#else /* MULTIPROCESSOR */
3365
3366void
3367pmap_tlb_shootpage(struct pmap *pm, vaddr_t va, int shootself)
3368{
3369 if (!pmap_use_pcid) {
3370 if (shootself)
3371 pmap_update_pg(va);
3372 } else if (va >= VM_MIN_KERNEL_ADDRESS) {
3373 invpcid(INVPCID_ADDR, PCID_PROC, va);
3374 invpcid(INVPCID_ADDR, PCID_KERN, va);
3375 } else if (shootself) {
3376 invpcid(INVPCID_ADDR, PCID_PROC, va);
3377 if (cpu_meltdown)
3378 invpcid(INVPCID_ADDR, PCID_PROC_INTEL, va);
3379 }
3380}
3381
3382void
3383pmap_tlb_shootrange(struct pmap *pm, vaddr_t sva, vaddr_t eva, int shootself)
3384{
3385 vaddr_t va;
3386
3387 if (!pmap_use_pcid) {
3388 if (shootself) {
3389 for (va = sva; va < eva; va += PAGE_SIZE)
3390 pmap_update_pg(va);
3391 }
3392 } else if (sva >= VM_MIN_KERNEL_ADDRESS) {
3393 for (va = sva; va < eva; va += PAGE_SIZE) {
3394 invpcid(INVPCID_ADDR, PCID_PROC, va);
3395 invpcid(INVPCID_ADDR, PCID_KERN, va);
3396 }
3397 } else if (shootself) {
3398 if (cpu_meltdown) {
3399 for (va = sva; va < eva; va += PAGE_SIZE) {
3400 invpcid(INVPCID_ADDR, PCID_PROC, va);
3401 invpcid(INVPCID_ADDR, PCID_PROC_INTEL, va);
3402 }
3403 } else {
3404 for (va = sva; va < eva; va += PAGE_SIZE)
3405 invpcid(INVPCID_ADDR, PCID_PROC, va);
3406 }
3407 }
3408}
3409
3410void
3411pmap_tlb_shoottlb(struct pmap *pm, int shootself)
3412{
3413 if (shootself) {
3414 if (!pmap_use_pcid)
3415 tlbflush();
3416 else {
3417 invpcid(INVPCID_PCID, PCID_PROC, 0);
3418 if (cpu_meltdown)
3419 invpcid(INVPCID_PCID, PCID_PROC_INTEL, 0);
3420 }
3421 }
3422}
3423#endif /* MULTIPROCESSOR */