File: | uvm/uvm_swap.c |
Warning: | line 1883, column 4 1st function call argument is an uninitialized value |
Press '?' to see keyboard shortcuts
Keyboard shortcuts:
1 | /* $OpenBSD: uvm_swap.c,v 1.168 2023/12/05 15:50:45 claudio Exp $ */ | ||||
2 | /* $NetBSD: uvm_swap.c,v 1.40 2000/11/17 11:39:39 mrg Exp $ */ | ||||
3 | |||||
4 | /* | ||||
5 | * Copyright (c) 1995, 1996, 1997 Matthew R. Green | ||||
6 | * All rights reserved. | ||||
7 | * | ||||
8 | * Redistribution and use in source and binary forms, with or without | ||||
9 | * modification, are permitted provided that the following conditions | ||||
10 | * are met: | ||||
11 | * 1. Redistributions of source code must retain the above copyright | ||||
12 | * notice, this list of conditions and the following disclaimer. | ||||
13 | * 2. Redistributions in binary form must reproduce the above copyright | ||||
14 | * notice, this list of conditions and the following disclaimer in the | ||||
15 | * documentation and/or other materials provided with the distribution. | ||||
16 | * | ||||
17 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR | ||||
18 | * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES | ||||
19 | * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. | ||||
20 | * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, | ||||
21 | * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, | ||||
22 | * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | ||||
23 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED | ||||
24 | * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | ||||
25 | * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | ||||
26 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | ||||
27 | * SUCH DAMAGE. | ||||
28 | * | ||||
29 | * from: NetBSD: vm_swap.c,v 1.52 1997/12/02 13:47:37 pk Exp | ||||
30 | * from: Id: uvm_swap.c,v 1.1.2.42 1998/02/02 20:38:06 chuck Exp | ||||
31 | */ | ||||
32 | |||||
33 | #include <sys/param.h> | ||||
34 | #include <sys/systm.h> | ||||
35 | #include <sys/buf.h> | ||||
36 | #include <sys/conf.h> | ||||
37 | #include <sys/proc.h> | ||||
38 | #include <sys/namei.h> | ||||
39 | #include <sys/disklabel.h> | ||||
40 | #include <sys/errno.h> | ||||
41 | #include <sys/kernel.h> | ||||
42 | #include <sys/malloc.h> | ||||
43 | #include <sys/vnode.h> | ||||
44 | #include <sys/fcntl.h> | ||||
45 | #include <sys/extent.h> | ||||
46 | #include <sys/blist.h> | ||||
47 | #include <sys/mount.h> | ||||
48 | #include <sys/mutex.h> | ||||
49 | #include <sys/pool.h> | ||||
50 | #include <sys/syscallargs.h> | ||||
51 | #include <sys/swap.h> | ||||
52 | #include <sys/disk.h> | ||||
53 | #include <sys/task.h> | ||||
54 | #include <sys/pledge.h> | ||||
55 | #if defined(NFSCLIENT1) | ||||
56 | #include <sys/socket.h> | ||||
57 | #include <netinet/in.h> | ||||
58 | #include <nfs/nfsproto.h> | ||||
59 | #include <nfs/nfsdiskless.h> | ||||
60 | #endif | ||||
61 | |||||
62 | #include <uvm/uvm.h> | ||||
63 | #ifdef UVM_SWAP_ENCRYPT1 | ||||
64 | #include <uvm/uvm_swap_encrypt.h> | ||||
65 | #endif | ||||
66 | |||||
67 | #include <sys/specdev.h> | ||||
68 | |||||
69 | #include "vnd.h" | ||||
70 | |||||
71 | /* | ||||
72 | * uvm_swap.c: manage configuration and i/o to swap space. | ||||
73 | */ | ||||
74 | |||||
75 | /* | ||||
76 | * swap space is managed in the following way: | ||||
77 | * | ||||
78 | * each swap partition or file is described by a "swapdev" structure. | ||||
79 | * each "swapdev" structure contains a "swapent" structure which contains | ||||
80 | * information that is passed up to the user (via system calls). | ||||
81 | * | ||||
82 | * each swap partition is assigned a "priority" (int) which controls | ||||
83 | * swap partition usage. | ||||
84 | * | ||||
85 | * the system maintains a global data structure describing all swap | ||||
86 | * partitions/files. there is a sorted LIST of "swappri" structures | ||||
87 | * which describe "swapdev"'s at that priority. this LIST is headed | ||||
88 | * by the "swap_priority" global var. each "swappri" contains a | ||||
89 | * TAILQ of "swapdev" structures at that priority. | ||||
90 | * | ||||
91 | * locking: | ||||
92 | * - swap_syscall_lock (sleep lock): this lock serializes the swapctl | ||||
93 | * system call and prevents the swap priority list from changing | ||||
94 | * while we are in the middle of a system call (e.g. SWAP_STATS). | ||||
95 | * - uvm_swap_data_lock (mutex): this lock protects all swap data | ||||
96 | * structures including the priority list, the swapdev structures, | ||||
97 | * and the swapmap arena. | ||||
98 | * | ||||
99 | * each swap device has the following info: | ||||
100 | * - swap device in use (could be disabled, preventing future use) | ||||
101 | * - swap enabled (allows new allocations on swap) | ||||
102 | * - map info in /dev/drum | ||||
103 | * - vnode pointer | ||||
104 | * for swap files only: | ||||
105 | * - block size | ||||
106 | * - max byte count in buffer | ||||
107 | * - buffer | ||||
108 | * - credentials to use when doing i/o to file | ||||
109 | * | ||||
110 | * userland controls and configures swap with the swapctl(2) system call. | ||||
111 | * the sys_swapctl performs the following operations: | ||||
112 | * [1] SWAP_NSWAP: returns the number of swap devices currently configured | ||||
113 | * [2] SWAP_STATS: given a pointer to an array of swapent structures | ||||
114 | * (passed in via "arg") of a size passed in via "misc" ... we load | ||||
115 | * the current swap config into the array. | ||||
116 | * [3] SWAP_ON: given a pathname in arg (could be device or file) and a | ||||
117 | * priority in "misc", start swapping on it. | ||||
118 | * [4] SWAP_OFF: as SWAP_ON, but stops swapping to a device | ||||
119 | * [5] SWAP_CTL: changes the priority of a swap device (new priority in | ||||
120 | * "misc") | ||||
121 | */ | ||||
122 | |||||
123 | /* | ||||
124 | * swapdev: describes a single swap partition/file | ||||
125 | * | ||||
126 | * note the following should be true: | ||||
127 | * swd_inuse <= swd_nblks [number of blocks in use is <= total blocks] | ||||
128 | * swd_nblks <= swd_mapsize [because mapsize includes disklabel] | ||||
129 | */ | ||||
130 | struct swapdev { | ||||
131 | struct swapent swd_se; | ||||
132 | #define swd_devswd_se.se_dev swd_se.se_dev /* device id */ | ||||
133 | #define swd_flagsswd_se.se_flags swd_se.se_flags /* flags:inuse/enable/fake */ | ||||
134 | #define swd_priorityswd_se.se_priority swd_se.se_priority /* our priority */ | ||||
135 | #define swd_inuseswd_se.se_inuse swd_se.se_inuse /* blocks used */ | ||||
136 | #define swd_nblksswd_se.se_nblks swd_se.se_nblks /* total blocks */ | ||||
137 | char *swd_path; /* saved pathname of device */ | ||||
138 | int swd_pathlen; /* length of pathname */ | ||||
139 | int swd_npages; /* #pages we can use */ | ||||
140 | int swd_npginuse; /* #pages in use */ | ||||
141 | int swd_npgbad; /* #pages bad */ | ||||
142 | int swd_drumoffset; /* page0 offset in drum */ | ||||
143 | int swd_drumsize; /* #pages in drum */ | ||||
144 | blist_t swd_blist; /* blist for this swapdev */ | ||||
145 | struct vnode *swd_vp; /* backing vnode */ | ||||
146 | TAILQ_ENTRY(swapdev)struct { struct swapdev *tqe_next; struct swapdev **tqe_prev; } swd_next; /* priority tailq */ | ||||
147 | |||||
148 | int swd_bsize; /* blocksize (bytes) */ | ||||
149 | int swd_maxactive; /* max active i/o reqs */ | ||||
150 | int swd_active; /* # of active i/o reqs */ | ||||
151 | struct bufq swd_bufq; | ||||
152 | struct ucred *swd_cred; /* cred for file access */ | ||||
153 | #ifdef UVM_SWAP_ENCRYPT1 | ||||
154 | #define SWD_KEY_SHIFT7 7 /* One key per 0.5 MByte */ | ||||
155 | #define SWD_KEY(x,y)&((x)->swd_keys[((y) - (x)->swd_drumoffset) >> 7]) &((x)->swd_keys[((y) - (x)->swd_drumoffset) >> SWD_KEY_SHIFT7]) | ||||
156 | #define SWD_KEY_SIZE(x)(((x) + (1 << 7) - 1) >> 7) (((x) + (1 << SWD_KEY_SHIFT7) - 1) >> SWD_KEY_SHIFT7) | ||||
157 | |||||
158 | #define SWD_DCRYPT_SHIFT5 5 | ||||
159 | #define SWD_DCRYPT_BITS32 32 | ||||
160 | #define SWD_DCRYPT_MASK(32 - 1) (SWD_DCRYPT_BITS32 - 1) | ||||
161 | #define SWD_DCRYPT_OFF(x)((x) >> 5) ((x) >> SWD_DCRYPT_SHIFT5) | ||||
162 | #define SWD_DCRYPT_BIT(x)((x) & (32 - 1)) ((x) & SWD_DCRYPT_MASK(32 - 1)) | ||||
163 | #define SWD_DCRYPT_SIZE(x)((((x) + (32 - 1)) >> 5) * sizeof(u_int32_t)) (SWD_DCRYPT_OFF((x) + SWD_DCRYPT_MASK)(((x) + (32 - 1)) >> 5) * sizeof(u_int32_t)) | ||||
164 | u_int32_t *swd_decrypt; /* bitmap for decryption */ | ||||
165 | struct swap_key *swd_keys; /* keys for different parts */ | ||||
166 | #endif | ||||
167 | }; | ||||
168 | |||||
169 | /* | ||||
170 | * swap device priority entry; the list is kept sorted on `spi_priority'. | ||||
171 | */ | ||||
172 | struct swappri { | ||||
173 | int spi_priority; /* priority */ | ||||
174 | TAILQ_HEAD(spi_swapdev, swapdev)struct spi_swapdev { struct swapdev *tqh_first; struct swapdev **tqh_last; } spi_swapdev; | ||||
175 | /* tailq of swapdevs at this priority */ | ||||
176 | LIST_ENTRY(swappri)struct { struct swappri *le_next; struct swappri **le_prev; } spi_swappri; /* global list of pri's */ | ||||
177 | }; | ||||
178 | |||||
/*
 * vndxfer and vndbuf track data transfers on swap devices that are backed
 * by regular files.
 * NOTE: this code is more or less a copy of vnd.c; the structure names are
 * kept the same to ease porting.
 */
struct vndxfer {
	struct buf	*vx_bp;		/* Pointer to parent buffer */
	struct swapdev	*vx_sdp;	/* swap device of this transfer */
	int		vx_error;
	int		vx_pending;	/* # of pending aux buffers */
	int		vx_flags;	/* VX_* bits below */
#define VX_BUSY		1
#define VX_DEAD		2
};
194 | |||||
195 | struct vndbuf { | ||||
196 | struct buf vb_buf; | ||||
197 | struct vndxfer *vb_vnx; | ||||
198 | struct task vb_task; | ||||
199 | }; | ||||
200 | |||||
201 | /* | ||||
202 | * We keep a of pool vndbuf's and vndxfer structures. | ||||
203 | */ | ||||
204 | struct pool vndxfer_pool; | ||||
205 | struct pool vndbuf_pool; | ||||
206 | |||||
207 | |||||
208 | /* | ||||
209 | * local variables | ||||
210 | */ | ||||
211 | struct extent *swapmap; /* controls the mapping of /dev/drum */ | ||||
212 | |||||
213 | /* list of all active swap devices [by priority] */ | ||||
214 | LIST_HEAD(swap_priority, swappri)struct swap_priority { struct swappri *lh_first; }; | ||||
215 | struct swap_priority swap_priority; /* [S] */ | ||||
216 | |||||
217 | /* locks */ | ||||
218 | struct mutex uvm_swap_data_lock = MUTEX_INITIALIZER(IPL_MPFLOOR){ ((void *)0), ((((0x9)) > 0x0 && ((0x9)) < 0x9 ) ? 0x9 : ((0x9))), 0x0 }; | ||||
219 | struct rwlock swap_syscall_lock = RWLOCK_INITIALIZER("swplk"){ 0, "swplk" }; | ||||
220 | |||||
221 | struct mutex oommtx = MUTEX_INITIALIZER(IPL_VM){ ((void *)0), ((((0xa)) > 0x0 && ((0xa)) < 0x9 ) ? 0x9 : ((0xa))), 0x0 }; | ||||
222 | struct vm_page *oompps[SWCLUSTPAGES((64 * 1024) >> 12)]; | ||||
223 | int oom = 0; | ||||
224 | |||||
/*
 * prototypes
 */

/* /dev/drum address space */
void		 swapdrum_add(struct swapdev *, int);
struct swapdev	*swapdrum_getsdp(int);

/* global priority list */
struct swapdev	*swaplist_find(struct vnode *, int);
void		 swaplist_insert(struct swapdev *, struct swappri *, int);
void		 swaplist_trim(void);

/* swapctl(2) helpers */
int		 swap_on(struct proc *, struct swapdev *);
int		 swap_off(struct proc *, struct swapdev *);

/* i/o to swap devices backed by regular files */
void		 sw_reg_strategy(struct swapdev *, struct buf *, int);
void		 sw_reg_iodone(struct buf *);
void		 sw_reg_iodone_internal(void *);
void		 sw_reg_start(struct swapdev *);

int		 uvm_swap_io(struct vm_page **, int, int, int);

void		 swapmount(void);
int		 uvm_swap_allocpages(struct vm_page **, int, int);

#ifdef UVM_SWAP_ENCRYPT
/* for swap encrypt */
void		 uvm_swap_markdecrypt(struct swapdev *, int, int, int);
boolean_t	 uvm_swap_needdecrypt(struct swapdev *, int);
void		 uvm_swap_initcrypt(struct swapdev *, int);
#endif
255 | |||||
256 | /* | ||||
257 | * uvm_swap_init: init the swap system data structures and locks | ||||
258 | * | ||||
259 | * => called at boot time from init_main.c after the filesystems | ||||
260 | * are brought up (which happens after uvm_init()) | ||||
261 | */ | ||||
262 | void | ||||
263 | uvm_swap_init(void) | ||||
264 | { | ||||
265 | int error; | ||||
266 | |||||
267 | /* | ||||
268 | * first, init the swap list, its counter, and its lock. | ||||
269 | * then get a handle on the vnode for /dev/drum by using | ||||
270 | * the its dev_t number ("swapdev", from MD conf.c). | ||||
271 | */ | ||||
272 | LIST_INIT(&swap_priority)do { ((&swap_priority)->lh_first) = ((void *)0); } while (0); | ||||
273 | uvmexp.nswapdev = 0; | ||||
274 | |||||
275 | if (!swapdev_vp && bdevvp(swapdev, &swapdev_vp)) | ||||
276 | panic("uvm_swap_init: can't get vnode for swap device"); | ||||
277 | |||||
278 | /* | ||||
279 | * create swap block extent to map /dev/drum. The extent spans | ||||
280 | * 1 to INT_MAX allows 2 gigablocks of swap space. Note that | ||||
281 | * block 0 is reserved (used to indicate an allocation failure, | ||||
282 | * or no allocation). | ||||
283 | */ | ||||
284 | swapmap = extent_create("swapmap", 1, INT_MAX0x7fffffff, | ||||
285 | M_VMSWAP92, 0, 0, EX_NOWAIT0x0000); | ||||
286 | if (swapmap == 0) | ||||
287 | panic("uvm_swap_init: extent_create failed"); | ||||
288 | |||||
289 | /* allocate pools for structures used for swapping to files. */ | ||||
290 | pool_init(&vndxfer_pool, sizeof(struct vndxfer), 0, IPL_BIO0x3, 0, | ||||
291 | "swp vnx", NULL((void *)0)); | ||||
292 | pool_init(&vndbuf_pool, sizeof(struct vndbuf), 0, IPL_BIO0x3, 0, | ||||
293 | "swp vnd", NULL((void *)0)); | ||||
294 | |||||
295 | /* allocate pages for OOM situations. */ | ||||
296 | error = uvm_swap_allocpages(oompps, SWCLUSTPAGES((64 * 1024) >> 12), UVM_PLA_NOWAIT0x0002); | ||||
297 | KASSERT(error == 0)((error == 0) ? (void)0 : __assert("diagnostic ", "/usr/src/sys/uvm/uvm_swap.c" , 297, "error == 0")); | ||||
298 | |||||
299 | /* Setup the initial swap partition */ | ||||
300 | swapmount(); | ||||
301 | } | ||||
302 | |||||
303 | #ifdef UVM_SWAP_ENCRYPT1 | ||||
304 | void | ||||
305 | uvm_swap_initcrypt_all(void) | ||||
306 | { | ||||
307 | struct swapdev *sdp; | ||||
308 | struct swappri *spp; | ||||
309 | int npages; | ||||
310 | |||||
311 | |||||
312 | LIST_FOREACH(spp, &swap_priority, spi_swappri)for((spp) = ((&swap_priority)->lh_first); (spp)!= ((void *)0); (spp) = ((spp)->spi_swappri.le_next)) { | ||||
313 | TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next)for((sdp) = ((&spp->spi_swapdev)->tqh_first); (sdp) != ((void *)0); (sdp) = ((sdp)->swd_next.tqe_next)) { | ||||
314 | if (sdp->swd_decrypt == NULL((void *)0)) { | ||||
315 | npages = dbtob((uint64_t)sdp->swd_nblks)(((uint64_t)sdp->swd_se.se_nblks) << 9) >> | ||||
316 | PAGE_SHIFT12; | ||||
317 | uvm_swap_initcrypt(sdp, npages); | ||||
318 | } | ||||
319 | } | ||||
320 | } | ||||
321 | } | ||||
322 | |||||
323 | void | ||||
324 | uvm_swap_initcrypt(struct swapdev *sdp, int npages) | ||||
325 | { | ||||
326 | /* | ||||
327 | * keep information if a page needs to be decrypted when we get it | ||||
328 | * from the swap device. | ||||
329 | * We cannot chance a malloc later, if we are doing ASYNC puts, | ||||
330 | * we may not call malloc with M_WAITOK. This consumes only | ||||
331 | * 8KB memory for a 256MB swap partition. | ||||
332 | */ | ||||
333 | sdp->swd_decrypt = malloc(SWD_DCRYPT_SIZE(npages)((((npages) + (32 - 1)) >> 5) * sizeof(u_int32_t)), M_VMSWAP92, | ||||
334 | M_WAITOK0x0001|M_ZERO0x0008); | ||||
335 | sdp->swd_keys = mallocarray(SWD_KEY_SIZE(npages)(((npages) + (1 << 7) - 1) >> 7), | ||||
336 | sizeof(struct swap_key), M_VMSWAP92, M_WAITOK0x0001|M_ZERO0x0008); | ||||
337 | } | ||||
338 | |||||
339 | #endif /* UVM_SWAP_ENCRYPT */ | ||||
340 | |||||
341 | int | ||||
342 | uvm_swap_allocpages(struct vm_page **pps, int npages, int flags) | ||||
343 | { | ||||
344 | struct pglist pgl; | ||||
345 | int error, i; | ||||
346 | |||||
347 | KASSERT(npages <= SWCLUSTPAGES)((npages <= ((64 * 1024) >> 12)) ? (void)0 : __assert ("diagnostic ", "/usr/src/sys/uvm/uvm_swap.c", 347, "npages <= SWCLUSTPAGES" )); | ||||
348 | |||||
349 | TAILQ_INIT(&pgl)do { (&pgl)->tqh_first = ((void *)0); (&pgl)->tqh_last = &(&pgl)->tqh_first; } while (0); | ||||
350 | again: | ||||
351 | error = uvm_pglistalloc(npages * PAGE_SIZE(1 << 12), dma_constraint.ucr_low, | ||||
352 | dma_constraint.ucr_high, 0, 0, &pgl, npages, flags); | ||||
353 | if (error && (curproc({struct cpu_info *__ci; asm volatile("movq %%gs:%P1,%0" : "=r" (__ci) :"n" (__builtin_offsetof(struct cpu_info, ci_self))); __ci;})->ci_curproc == uvm.pagedaemon_proc)) { | ||||
354 | mtx_enter(&oommtx); | ||||
355 | if (oom) { | ||||
356 | msleep_nsec(&oom, &oommtx, PVM4 | PNORELOCK0x200, | ||||
357 | "oom", INFSLP0xffffffffffffffffULL); | ||||
358 | goto again; | ||||
359 | } | ||||
360 | oom = 1; | ||||
361 | for (i = 0; i < npages; i++) { | ||||
362 | pps[i] = oompps[i]; | ||||
363 | atomic_setbits_intx86_atomic_setbits_u32(&pps[i]->pg_flags, PG_BUSY0x00000001); | ||||
364 | } | ||||
365 | mtx_leave(&oommtx); | ||||
366 | return 0; | ||||
367 | } | ||||
368 | if (error) | ||||
369 | return error; | ||||
370 | |||||
371 | for (i = 0; i < npages; i++) { | ||||
372 | pps[i] = TAILQ_FIRST(&pgl)((&pgl)->tqh_first); | ||||
373 | /* *sigh* */ | ||||
374 | atomic_setbits_intx86_atomic_setbits_u32(&pps[i]->pg_flags, PG_BUSY0x00000001); | ||||
375 | TAILQ_REMOVE(&pgl, pps[i], pageq)do { if (((pps[i])->pageq.tqe_next) != ((void *)0)) (pps[i ])->pageq.tqe_next->pageq.tqe_prev = (pps[i])->pageq .tqe_prev; else (&pgl)->tqh_last = (pps[i])->pageq. tqe_prev; *(pps[i])->pageq.tqe_prev = (pps[i])->pageq.tqe_next ; ((pps[i])->pageq.tqe_prev) = ((void *)-1); ((pps[i])-> pageq.tqe_next) = ((void *)-1); } while (0); | ||||
376 | } | ||||
377 | |||||
378 | return 0; | ||||
379 | } | ||||
380 | |||||
381 | void | ||||
382 | uvm_swap_freepages(struct vm_page **pps, int npages) | ||||
383 | { | ||||
384 | int i; | ||||
385 | |||||
386 | if (pps[0] == oompps[0]) { | ||||
387 | for (i = 0; i < npages; i++) | ||||
388 | uvm_pageclean(pps[i]); | ||||
389 | |||||
390 | mtx_enter(&oommtx); | ||||
391 | KASSERT(oom == 1)((oom == 1) ? (void)0 : __assert("diagnostic ", "/usr/src/sys/uvm/uvm_swap.c" , 391, "oom == 1")); | ||||
392 | oom = 0; | ||||
393 | mtx_leave(&oommtx); | ||||
394 | wakeup(&oom); | ||||
395 | return; | ||||
396 | } | ||||
397 | |||||
398 | uvm_lock_pageq()mtx_enter(&uvm.pageqlock); | ||||
399 | for (i = 0; i < npages; i++) | ||||
400 | uvm_pagefree(pps[i]); | ||||
401 | uvm_unlock_pageq()mtx_leave(&uvm.pageqlock); | ||||
402 | |||||
403 | } | ||||
404 | |||||
405 | #ifdef UVM_SWAP_ENCRYPT1 | ||||
406 | /* | ||||
407 | * Mark pages on the swap device for later decryption | ||||
408 | */ | ||||
409 | |||||
410 | void | ||||
411 | uvm_swap_markdecrypt(struct swapdev *sdp, int startslot, int npages, | ||||
412 | int decrypt) | ||||
413 | { | ||||
414 | int pagestart, i; | ||||
415 | int off, bit; | ||||
416 | |||||
417 | if (!sdp) | ||||
418 | return; | ||||
419 | |||||
420 | pagestart = startslot - sdp->swd_drumoffset; | ||||
421 | for (i = 0; i < npages; i++, pagestart++) { | ||||
422 | off = SWD_DCRYPT_OFF(pagestart)((pagestart) >> 5); | ||||
423 | bit = SWD_DCRYPT_BIT(pagestart)((pagestart) & (32 - 1)); | ||||
424 | if (decrypt) | ||||
425 | /* pages read need decryption */ | ||||
426 | sdp->swd_decrypt[off] |= 1 << bit; | ||||
427 | else | ||||
428 | /* pages read do not need decryption */ | ||||
429 | sdp->swd_decrypt[off] &= ~(1 << bit); | ||||
430 | } | ||||
431 | } | ||||
432 | |||||
433 | /* | ||||
434 | * Check if the page that we got from disk needs to be decrypted | ||||
435 | */ | ||||
436 | |||||
437 | boolean_t | ||||
438 | uvm_swap_needdecrypt(struct swapdev *sdp, int off) | ||||
439 | { | ||||
440 | if (!sdp) | ||||
441 | return FALSE0; | ||||
442 | |||||
443 | off -= sdp->swd_drumoffset; | ||||
444 | return sdp->swd_decrypt[SWD_DCRYPT_OFF(off)((off) >> 5)] & (1 << SWD_DCRYPT_BIT(off)((off) & (32 - 1))) ? | ||||
445 | TRUE1 : FALSE0; | ||||
446 | } | ||||
447 | |||||
448 | void | ||||
449 | uvm_swap_finicrypt_all(void) | ||||
450 | { | ||||
451 | struct swapdev *sdp; | ||||
452 | struct swappri *spp; | ||||
453 | struct swap_key *key; | ||||
454 | unsigned int nkeys; | ||||
455 | |||||
456 | LIST_FOREACH(spp, &swap_priority, spi_swappri)for((spp) = ((&swap_priority)->lh_first); (spp)!= ((void *)0); (spp) = ((spp)->spi_swappri.le_next)) { | ||||
457 | TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next)for((sdp) = ((&spp->spi_swapdev)->tqh_first); (sdp) != ((void *)0); (sdp) = ((sdp)->swd_next.tqe_next)) { | ||||
458 | if (sdp->swd_decrypt == NULL((void *)0)) | ||||
459 | continue; | ||||
460 | |||||
461 | nkeys = dbtob((uint64_t)sdp->swd_nblks)(((uint64_t)sdp->swd_se.se_nblks) << 9) >> PAGE_SHIFT12; | ||||
462 | key = sdp->swd_keys + (SWD_KEY_SIZE(nkeys)(((nkeys) + (1 << 7) - 1) >> 7) - 1); | ||||
463 | do { | ||||
464 | if (key->refcount != 0) | ||||
465 | swap_key_delete(key); | ||||
466 | } while (key-- != sdp->swd_keys); | ||||
467 | } | ||||
468 | } | ||||
469 | } | ||||
470 | #endif /* UVM_SWAP_ENCRYPT */ | ||||
471 | |||||
472 | /* | ||||
473 | * swaplist functions: functions that operate on the list of swap | ||||
474 | * devices on the system. | ||||
475 | */ | ||||
476 | |||||
477 | /* | ||||
478 | * swaplist_insert: insert swap device "sdp" into the global list | ||||
479 | * | ||||
480 | * => caller must hold both swap_syscall_lock and uvm_swap_data_lock | ||||
481 | * => caller must provide a newly allocated swappri structure (we will | ||||
482 | * FREE it if we don't need it... this it to prevent allocation | ||||
483 | * blocking here while adding swap) | ||||
484 | */ | ||||
485 | void | ||||
486 | swaplist_insert(struct swapdev *sdp, struct swappri *newspp, int priority) | ||||
487 | { | ||||
488 | struct swappri *spp, *pspp; | ||||
489 | |||||
490 | KASSERT(rw_write_held(&swap_syscall_lock))((rw_write_held(&swap_syscall_lock)) ? (void)0 : __assert ("diagnostic ", "/usr/src/sys/uvm/uvm_swap.c", 490, "rw_write_held(&swap_syscall_lock)" )); | ||||
491 | MUTEX_ASSERT_LOCKED(&uvm_swap_data_lock)do { if (((&uvm_swap_data_lock)->mtx_owner != ({struct cpu_info *__ci; asm volatile("movq %%gs:%P1,%0" : "=r" (__ci ) :"n" (__builtin_offsetof(struct cpu_info, ci_self))); __ci; })) && !(panicstr || db_active)) panic("mutex %p not held in %s" , (&uvm_swap_data_lock), __func__); } while (0); | ||||
492 | |||||
493 | /* | ||||
494 | * find entry at or after which to insert the new device. | ||||
495 | */ | ||||
496 | pspp = NULL((void *)0); | ||||
497 | LIST_FOREACH(spp, &swap_priority, spi_swappri)for((spp) = ((&swap_priority)->lh_first); (spp)!= ((void *)0); (spp) = ((spp)->spi_swappri.le_next)) { | ||||
498 | if (priority <= spp->spi_priority) | ||||
499 | break; | ||||
500 | pspp = spp; | ||||
501 | } | ||||
502 | |||||
503 | /* | ||||
504 | * new priority? | ||||
505 | */ | ||||
506 | if (spp == NULL((void *)0) || spp->spi_priority != priority) { | ||||
507 | spp = newspp; /* use newspp! */ | ||||
508 | |||||
509 | spp->spi_priority = priority; | ||||
510 | TAILQ_INIT(&spp->spi_swapdev)do { (&spp->spi_swapdev)->tqh_first = ((void *)0); ( &spp->spi_swapdev)->tqh_last = &(&spp->spi_swapdev )->tqh_first; } while (0); | ||||
511 | |||||
512 | if (pspp) | ||||
513 | LIST_INSERT_AFTER(pspp, spp, spi_swappri)do { if (((spp)->spi_swappri.le_next = (pspp)->spi_swappri .le_next) != ((void *)0)) (pspp)->spi_swappri.le_next-> spi_swappri.le_prev = &(spp)->spi_swappri.le_next; (pspp )->spi_swappri.le_next = (spp); (spp)->spi_swappri.le_prev = &(pspp)->spi_swappri.le_next; } while (0); | ||||
514 | else | ||||
515 | LIST_INSERT_HEAD(&swap_priority, spp, spi_swappri)do { if (((spp)->spi_swappri.le_next = (&swap_priority )->lh_first) != ((void *)0)) (&swap_priority)->lh_first ->spi_swappri.le_prev = &(spp)->spi_swappri.le_next ; (&swap_priority)->lh_first = (spp); (spp)->spi_swappri .le_prev = &(&swap_priority)->lh_first; } while (0 ); | ||||
516 | } else { | ||||
517 | /* we don't need a new priority structure, free it */ | ||||
518 | free(newspp, M_VMSWAP92, sizeof(*newspp)); | ||||
519 | } | ||||
520 | |||||
521 | /* | ||||
522 | * priority found (or created). now insert on the priority's | ||||
523 | * tailq list and bump the total number of swapdevs. | ||||
524 | */ | ||||
525 | sdp->swd_priorityswd_se.se_priority = priority; | ||||
526 | TAILQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next)do { (sdp)->swd_next.tqe_next = ((void *)0); (sdp)->swd_next .tqe_prev = (&spp->spi_swapdev)->tqh_last; *(&spp ->spi_swapdev)->tqh_last = (sdp); (&spp->spi_swapdev )->tqh_last = &(sdp)->swd_next.tqe_next; } while (0 ); | ||||
527 | uvmexp.nswapdev++; | ||||
528 | } | ||||
529 | |||||
530 | /* | ||||
531 | * swaplist_find: find and optionally remove a swap device from the | ||||
532 | * global list. | ||||
533 | * | ||||
534 | * => caller must hold both swap_syscall_lock and uvm_swap_data_lock | ||||
535 | * => we return the swapdev we found (and removed) | ||||
536 | */ | ||||
537 | struct swapdev * | ||||
538 | swaplist_find(struct vnode *vp, boolean_t remove) | ||||
539 | { | ||||
540 | struct swapdev *sdp; | ||||
541 | struct swappri *spp; | ||||
542 | |||||
543 | KASSERT(rw_write_held(&swap_syscall_lock))((rw_write_held(&swap_syscall_lock)) ? (void)0 : __assert ("diagnostic ", "/usr/src/sys/uvm/uvm_swap.c", 543, "rw_write_held(&swap_syscall_lock)" )); | ||||
544 | MUTEX_ASSERT_LOCKED(&uvm_swap_data_lock)do { if (((&uvm_swap_data_lock)->mtx_owner != ({struct cpu_info *__ci; asm volatile("movq %%gs:%P1,%0" : "=r" (__ci ) :"n" (__builtin_offsetof(struct cpu_info, ci_self))); __ci; })) && !(panicstr || db_active)) panic("mutex %p not held in %s" , (&uvm_swap_data_lock), __func__); } while (0); | ||||
545 | |||||
546 | /* | ||||
547 | * search the lists for the requested vp | ||||
548 | */ | ||||
549 | LIST_FOREACH(spp, &swap_priority, spi_swappri)for((spp) = ((&swap_priority)->lh_first); (spp)!= ((void *)0); (spp) = ((spp)->spi_swappri.le_next)) { | ||||
550 | TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next)for((sdp) = ((&spp->spi_swapdev)->tqh_first); (sdp) != ((void *)0); (sdp) = ((sdp)->swd_next.tqe_next)) { | ||||
551 | if (sdp->swd_vp != vp) | ||||
552 | continue; | ||||
553 | if (remove) { | ||||
554 | TAILQ_REMOVE(&spp->spi_swapdev, sdp, swd_next)do { if (((sdp)->swd_next.tqe_next) != ((void *)0)) (sdp)-> swd_next.tqe_next->swd_next.tqe_prev = (sdp)->swd_next. tqe_prev; else (&spp->spi_swapdev)->tqh_last = (sdp )->swd_next.tqe_prev; *(sdp)->swd_next.tqe_prev = (sdp) ->swd_next.tqe_next; ((sdp)->swd_next.tqe_prev) = ((void *)-1); ((sdp)->swd_next.tqe_next) = ((void *)-1); } while (0); | ||||
555 | uvmexp.nswapdev--; | ||||
556 | } | ||||
557 | return (sdp); | ||||
558 | } | ||||
559 | } | ||||
560 | return (NULL((void *)0)); | ||||
561 | } | ||||
562 | |||||
563 | |||||
564 | /* | ||||
565 | * swaplist_trim: scan priority list for empty priority entries and kill | ||||
566 | * them. | ||||
567 | * | ||||
568 | * => caller must hold both swap_syscall_lock and uvm_swap_data_lock | ||||
569 | */ | ||||
570 | void | ||||
571 | swaplist_trim(void) | ||||
572 | { | ||||
573 | struct swappri *spp, *nextspp; | ||||
574 | |||||
575 | KASSERT(rw_write_held(&swap_syscall_lock))((rw_write_held(&swap_syscall_lock)) ? (void)0 : __assert ("diagnostic ", "/usr/src/sys/uvm/uvm_swap.c", 575, "rw_write_held(&swap_syscall_lock)" )); | ||||
576 | MUTEX_ASSERT_LOCKED(&uvm_swap_data_lock)do { if (((&uvm_swap_data_lock)->mtx_owner != ({struct cpu_info *__ci; asm volatile("movq %%gs:%P1,%0" : "=r" (__ci ) :"n" (__builtin_offsetof(struct cpu_info, ci_self))); __ci; })) && !(panicstr || db_active)) panic("mutex %p not held in %s" , (&uvm_swap_data_lock), __func__); } while (0); | ||||
577 | |||||
578 | LIST_FOREACH_SAFE(spp, &swap_priority, spi_swappri, nextspp)for ((spp) = ((&swap_priority)->lh_first); (spp) && ((nextspp) = ((spp)->spi_swappri.le_next), 1); (spp) = (nextspp )) { | ||||
579 | if (!TAILQ_EMPTY(&spp->spi_swapdev)(((&spp->spi_swapdev)->tqh_first) == ((void *)0))) | ||||
580 | continue; | ||||
581 | LIST_REMOVE(spp, spi_swappri)do { if ((spp)->spi_swappri.le_next != ((void *)0)) (spp)-> spi_swappri.le_next->spi_swappri.le_prev = (spp)->spi_swappri .le_prev; *(spp)->spi_swappri.le_prev = (spp)->spi_swappri .le_next; ((spp)->spi_swappri.le_prev) = ((void *)-1); ((spp )->spi_swappri.le_next) = ((void *)-1); } while (0); | ||||
582 | free(spp, M_VMSWAP92, sizeof(*spp)); | ||||
583 | } | ||||
584 | } | ||||
585 | |||||
586 | /* | ||||
587 | * swapdrum_add: add a "swapdev"'s blocks into /dev/drum's area. | ||||
588 | * | ||||
589 | * => caller must hold swap_syscall_lock | ||||
590 | * => uvm_swap_data_lock should be unlocked (we may sleep) | ||||
591 | */ | ||||
592 | void | ||||
593 | swapdrum_add(struct swapdev *sdp, int npages) | ||||
594 | { | ||||
595 | u_long result; | ||||
596 | |||||
597 | if (extent_alloc(swapmap, npages, EX_NOALIGN, 0, EX_NOBOUNDARY,extent_alloc_subregion((swapmap), (swapmap)->ex_start, (swapmap )->ex_end, (npages), (1), (0), (0), (0x0001), (&result )) | ||||
598 | EX_WAITOK, &result)extent_alloc_subregion((swapmap), (swapmap)->ex_start, (swapmap )->ex_end, (npages), (1), (0), (0), (0x0001), (&result ))) | ||||
599 | panic("swapdrum_add"); | ||||
600 | |||||
601 | sdp->swd_drumoffset = result; | ||||
602 | sdp->swd_drumsize = npages; | ||||
603 | } | ||||
604 | |||||
605 | /* | ||||
606 | * swapdrum_getsdp: given a page offset in /dev/drum, convert it back | ||||
607 | * to the "swapdev" that maps that section of the drum. | ||||
608 | * | ||||
609 | * => each swapdev takes one big contig chunk of the drum | ||||
610 | * => caller must hold uvm_swap_data_lock | ||||
611 | */ | ||||
612 | struct swapdev * | ||||
613 | swapdrum_getsdp(int pgno) | ||||
614 | { | ||||
615 | struct swapdev *sdp; | ||||
616 | struct swappri *spp; | ||||
617 | |||||
618 | MUTEX_ASSERT_LOCKED(&uvm_swap_data_lock)do { if (((&uvm_swap_data_lock)->mtx_owner != ({struct cpu_info *__ci; asm volatile("movq %%gs:%P1,%0" : "=r" (__ci ) :"n" (__builtin_offsetof(struct cpu_info, ci_self))); __ci; })) && !(panicstr || db_active)) panic("mutex %p not held in %s" , (&uvm_swap_data_lock), __func__); } while (0); | ||||
619 | |||||
620 | LIST_FOREACH(spp, &swap_priority, spi_swappri)for((spp) = ((&swap_priority)->lh_first); (spp)!= ((void *)0); (spp) = ((spp)->spi_swappri.le_next)) { | ||||
621 | TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next)for((sdp) = ((&spp->spi_swapdev)->tqh_first); (sdp) != ((void *)0); (sdp) = ((sdp)->swd_next.tqe_next)) { | ||||
622 | if (pgno >= sdp->swd_drumoffset && | ||||
623 | pgno < (sdp->swd_drumoffset + sdp->swd_drumsize)) { | ||||
624 | return sdp; | ||||
625 | } | ||||
626 | } | ||||
627 | } | ||||
628 | return NULL((void *)0); | ||||
629 | } | ||||
630 | |||||
631 | |||||
632 | /* | ||||
633 | * sys_swapctl: main entry point for swapctl(2) system call | ||||
634 | * [with two helper functions: swap_on and swap_off] | ||||
635 | */ | ||||
636 | int | ||||
637 | sys_swapctl(struct proc *p, void *v, register_t *retval) | ||||
638 | { | ||||
639 | struct sys_swapctl_args /* { | ||||
640 | syscallarg(int) cmd; | ||||
641 | syscallarg(void *) arg; | ||||
642 | syscallarg(int) misc; | ||||
643 | } */ *uap = (struct sys_swapctl_args *)v; | ||||
644 | struct vnode *vp; | ||||
645 | struct nameidata nd; | ||||
646 | struct swappri *spp; | ||||
647 | struct swapdev *sdp; | ||||
648 | struct swapent *sep; | ||||
649 | char userpath[MAXPATHLEN1024]; | ||||
650 | size_t len; | ||||
651 | int count, error, misc; | ||||
652 | int priority; | ||||
653 | |||||
654 | misc = SCARG(uap, misc)((uap)->misc.le.datum); | ||||
655 | |||||
656 | if ((error = pledge_swapctl(p, SCARG(uap, cmd)((uap)->cmd.le.datum)))) | ||||
657 | return error; | ||||
658 | |||||
659 | /* | ||||
660 | * ensure serialized syscall access by grabbing the swap_syscall_lock | ||||
661 | */ | ||||
662 | rw_enter_write(&swap_syscall_lock); | ||||
663 | |||||
664 | /* | ||||
665 | * we handle the non-priv NSWAP and STATS request first. | ||||
666 | * | ||||
667 | * SWAP_NSWAP: return number of config'd swap devices | ||||
668 | * [can also be obtained with uvmexp sysctl] | ||||
669 | */ | ||||
670 | if (SCARG(uap, cmd)((uap)->cmd.le.datum) == SWAP_NSWAP3) { | ||||
671 | *retval = uvmexp.nswapdev; | ||||
672 | error = 0; | ||||
673 | goto out; | ||||
674 | } | ||||
675 | |||||
676 | /* | ||||
677 | * SWAP_STATS: get stats on current # of configured swap devs | ||||
678 | * | ||||
679 | * note that the swap_priority list can't change as long | ||||
680 | * as we are holding the swap_syscall_lock. we don't want | ||||
681 | * to grab the uvm_swap_data_lock because we may fault&sleep during | ||||
682 | * copyout() and we don't want to be holding that lock then! | ||||
683 | */ | ||||
684 | if (SCARG(uap, cmd)((uap)->cmd.le.datum) == SWAP_STATS4) { | ||||
685 | sep = (struct swapent *)SCARG(uap, arg)((uap)->arg.le.datum); | ||||
686 | count = 0; | ||||
687 | |||||
688 | LIST_FOREACH(spp, &swap_priority, spi_swappri)for((spp) = ((&swap_priority)->lh_first); (spp)!= ((void *)0); (spp) = ((spp)->spi_swappri.le_next)) { | ||||
689 | TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next)for((sdp) = ((&spp->spi_swapdev)->tqh_first); (sdp) != ((void *)0); (sdp) = ((sdp)->swd_next.tqe_next)) { | ||||
690 | if (count >= misc) | ||||
691 | continue; | ||||
692 | |||||
693 | sdp->swd_inuseswd_se.se_inuse = | ||||
694 | btodb((u_int64_t)sdp->swd_npginuse <<(((u_int64_t)sdp->swd_npginuse << 12) >> 9) | ||||
695 | PAGE_SHIFT)(((u_int64_t)sdp->swd_npginuse << 12) >> 9); | ||||
696 | error = copyout(&sdp->swd_se, sep, | ||||
697 | sizeof(struct swapent)); | ||||
698 | if (error) | ||||
699 | goto out; | ||||
700 | |||||
701 | /* now copy out the path if necessary */ | ||||
702 | error = copyoutstr(sdp->swd_path, | ||||
703 | sep->se_path, sizeof(sep->se_path), NULL((void *)0)); | ||||
704 | if (error) | ||||
705 | goto out; | ||||
706 | |||||
707 | count++; | ||||
708 | sep++; | ||||
709 | } | ||||
710 | } | ||||
711 | |||||
712 | *retval = count; | ||||
713 | error = 0; | ||||
714 | goto out; | ||||
715 | } | ||||
716 | |||||
717 | /* all other requests require superuser privs. verify. */ | ||||
718 | if ((error = suser(p))) | ||||
719 | goto out; | ||||
720 | |||||
721 | /* | ||||
722 | * at this point we expect a path name in arg. we will | ||||
723 | * use namei() to gain a vnode reference (vref), and lock | ||||
724 | * the vnode (VOP_LOCK). | ||||
725 | */ | ||||
726 | error = copyinstr(SCARG(uap, arg)((uap)->arg.le.datum), userpath, sizeof(userpath), &len); | ||||
727 | if (error) | ||||
728 | goto out; | ||||
729 | disk_map(userpath, userpath, sizeof(userpath), DM_OPENBLCK0x2); | ||||
730 | NDINIT(&nd, LOOKUP, FOLLOW|LOCKLEAF, UIO_SYSSPACE, userpath, p)ndinitat(&nd, 0, 0x0040|0x0004, UIO_SYSSPACE, -100, userpath , p); | ||||
731 | if ((error = namei(&nd))) | ||||
732 | goto out; | ||||
733 | vp = nd.ni_vp; | ||||
734 | /* note: "vp" is referenced and locked */ | ||||
735 | |||||
736 | error = 0; /* assume no error */ | ||||
737 | switch(SCARG(uap, cmd)((uap)->cmd.le.datum)) { | ||||
738 | case SWAP_DUMPDEV7: | ||||
739 | if (vp->v_type != VBLK) { | ||||
740 | error = ENOTBLK15; | ||||
741 | break; | ||||
742 | } | ||||
743 | dumpdev = vp->v_rdevv_un.vu_specinfo->si_rdev; | ||||
744 | break; | ||||
745 | case SWAP_CTL5: | ||||
746 | /* | ||||
747 | * get new priority, remove old entry (if any) and then | ||||
748 | * reinsert it in the correct place. finally, prune out | ||||
749 | * any empty priority structures. | ||||
750 | */ | ||||
751 | priority = SCARG(uap, misc)((uap)->misc.le.datum); | ||||
752 | spp = malloc(sizeof *spp, M_VMSWAP92, M_WAITOK0x0001); | ||||
753 | mtx_enter(&uvm_swap_data_lock); | ||||
754 | if ((sdp = swaplist_find(vp, 1)) == NULL((void *)0)) { | ||||
755 | error = ENOENT2; | ||||
756 | } else { | ||||
757 | swaplist_insert(sdp, spp, priority); | ||||
758 | swaplist_trim(); | ||||
759 | } | ||||
760 | mtx_leave(&uvm_swap_data_lock); | ||||
761 | if (error) | ||||
762 | free(spp, M_VMSWAP92, sizeof(*spp)); | ||||
763 | break; | ||||
764 | case SWAP_ON1: | ||||
765 | /* | ||||
766 | * If the device is a regular file, make sure the filesystem | ||||
767 | * can be used for swapping. | ||||
768 | */ | ||||
769 | if (vp->v_type == VREG && | ||||
770 | (vp->v_mount->mnt_flag & MNT_SWAPPABLE0x00200000) == 0) { | ||||
771 | error = ENOTSUP91; | ||||
772 | break; | ||||
773 | } | ||||
774 | |||||
775 | /* | ||||
776 | * check for duplicates. if none found, then insert a | ||||
777 | * dummy entry on the list to prevent someone else from | ||||
778 | * trying to enable this device while we are working on | ||||
779 | * it. | ||||
780 | */ | ||||
781 | |||||
782 | priority = SCARG(uap, misc)((uap)->misc.le.datum); | ||||
783 | sdp = malloc(sizeof *sdp, M_VMSWAP92, M_WAITOK0x0001|M_ZERO0x0008); | ||||
784 | spp = malloc(sizeof *spp, M_VMSWAP92, M_WAITOK0x0001); | ||||
785 | sdp->swd_flagsswd_se.se_flags = SWF_FAKE0x00000008; /* placeholder only */ | ||||
786 | sdp->swd_vp = vp; | ||||
787 | sdp->swd_devswd_se.se_dev = (vp->v_type == VBLK) ? vp->v_rdevv_un.vu_specinfo->si_rdev : NODEV(dev_t)(-1); | ||||
788 | |||||
789 | /* | ||||
790 | * XXX Is NFS elaboration necessary? | ||||
791 | */ | ||||
792 | if (vp->v_type == VREG) { | ||||
793 | sdp->swd_cred = crdup(p->p_ucred); | ||||
794 | } | ||||
795 | |||||
796 | mtx_enter(&uvm_swap_data_lock); | ||||
797 | if (swaplist_find(vp, 0) != NULL((void *)0)) { | ||||
798 | error = EBUSY16; | ||||
799 | mtx_leave(&uvm_swap_data_lock); | ||||
800 | if (vp->v_type == VREG) { | ||||
801 | crfree(sdp->swd_cred); | ||||
802 | } | ||||
803 | free(sdp, M_VMSWAP92, sizeof *sdp); | ||||
804 | free(spp, M_VMSWAP92, sizeof *spp); | ||||
805 | break; | ||||
806 | } | ||||
807 | swaplist_insert(sdp, spp, priority); | ||||
808 | mtx_leave(&uvm_swap_data_lock); | ||||
809 | |||||
810 | sdp->swd_pathlen = len; | ||||
811 | sdp->swd_path = malloc(sdp->swd_pathlen, M_VMSWAP92, M_WAITOK0x0001); | ||||
812 | strlcpy(sdp->swd_path, userpath, len); | ||||
813 | |||||
814 | /* | ||||
815 | * we've now got a FAKE placeholder in the swap list. | ||||
816 | * now attempt to enable swap on it. if we fail, undo | ||||
817 | * what we've done and kill the fake entry we just inserted. | ||||
818 | * if swap_on is a success, it will clear the SWF_FAKE flag | ||||
819 | */ | ||||
820 | |||||
821 | if ((error = swap_on(p, sdp)) != 0) { | ||||
822 | mtx_enter(&uvm_swap_data_lock); | ||||
823 | (void) swaplist_find(vp, 1); /* kill fake entry */ | ||||
824 | swaplist_trim(); | ||||
825 | mtx_leave(&uvm_swap_data_lock); | ||||
826 | if (vp->v_type == VREG) { | ||||
827 | crfree(sdp->swd_cred); | ||||
828 | } | ||||
829 | free(sdp->swd_path, M_VMSWAP92, sdp->swd_pathlen); | ||||
830 | free(sdp, M_VMSWAP92, sizeof(*sdp)); | ||||
831 | break; | ||||
832 | } | ||||
833 | break; | ||||
834 | case SWAP_OFF2: | ||||
835 | mtx_enter(&uvm_swap_data_lock); | ||||
836 | if ((sdp = swaplist_find(vp, 0)) == NULL((void *)0)) { | ||||
837 | mtx_leave(&uvm_swap_data_lock); | ||||
838 | error = ENXIO6; | ||||
839 | break; | ||||
840 | } | ||||
841 | |||||
842 | /* | ||||
843 | * If a device isn't in use or enabled, we | ||||
844 | * can't stop swapping from it (again). | ||||
845 | */ | ||||
846 | if ((sdp->swd_flagsswd_se.se_flags & (SWF_INUSE0x00000001|SWF_ENABLE0x00000002)) == 0) { | ||||
847 | mtx_leave(&uvm_swap_data_lock); | ||||
848 | error = EBUSY16; | ||||
849 | break; | ||||
850 | } | ||||
851 | |||||
852 | /* | ||||
853 | * do the real work. | ||||
854 | */ | ||||
855 | error = swap_off(p, sdp); | ||||
856 | break; | ||||
857 | default: | ||||
858 | error = EINVAL22; | ||||
859 | } | ||||
860 | |||||
861 | /* done! release the ref gained by namei() and unlock. */ | ||||
862 | vput(vp); | ||||
863 | |||||
864 | out: | ||||
865 | rw_exit_write(&swap_syscall_lock); | ||||
866 | |||||
867 | return (error); | ||||
868 | } | ||||
869 | |||||
870 | /* | ||||
871 | * swap_on: attempt to enable a swapdev for swapping. note that the | ||||
872 | * swapdev is already on the global list, but disabled (marked | ||||
873 | * SWF_FAKE). | ||||
874 | * | ||||
875 | * => we avoid the start of the disk (to protect disk labels) | ||||
876 | * => caller should leave uvm_swap_data_lock unlocked, we may lock it | ||||
877 | * if needed. | ||||
878 | */ | ||||
879 | int | ||||
880 | swap_on(struct proc *p, struct swapdev *sdp) | ||||
881 | { | ||||
882 | struct vnode *vp; | ||||
883 | int error, npages, nblocks, size; | ||||
884 | long addr; | ||||
885 | struct vattr va; | ||||
886 | #if defined(NFSCLIENT1) | ||||
887 | extern const struct vops nfs_vops; | ||||
888 | #endif /* defined(NFSCLIENT) */ | ||||
889 | dev_t dev; | ||||
890 | |||||
891 | /* | ||||
892 | * we want to enable swapping on sdp. the swd_vp contains | ||||
893 | * the vnode we want (locked and ref'd), and the swd_dev | ||||
894 | * contains the dev_t of the file, if it a block device. | ||||
895 | */ | ||||
896 | |||||
897 | vp = sdp->swd_vp; | ||||
898 | dev = sdp->swd_devswd_se.se_dev; | ||||
899 | |||||
900 | #if NVND1 > 0 | ||||
901 | /* no swapping to vnds. */ | ||||
902 | if (bdevsw[major(dev)(((unsigned)(dev) >> 8) & 0xff)].d_strategy == vndstrategy) | ||||
903 | return (EOPNOTSUPP45); | ||||
904 | #endif | ||||
905 | |||||
906 | /* | ||||
907 | * open the swap file (mostly useful for block device files to | ||||
908 | * let device driver know what is up). | ||||
909 | * | ||||
910 | * we skip the open/close for root on swap because the root | ||||
911 | * has already been opened when root was mounted (mountroot). | ||||
912 | */ | ||||
913 | if (vp != rootvp) { | ||||
914 | if ((error = VOP_OPEN(vp, FREAD0x0001|FWRITE0x0002, p->p_ucred, p))) | ||||
915 | return (error); | ||||
916 | } | ||||
917 | |||||
918 | /* XXX this only works for block devices */ | ||||
919 | /* | ||||
920 | * we now need to determine the size of the swap area. for | ||||
921 | * block specials we can call the d_psize function. | ||||
922 | * for normal files, we must stat [get attrs]. | ||||
923 | * | ||||
924 | * we put the result in nblks. | ||||
925 | * for normal files, we also want the filesystem block size | ||||
926 | * (which we get with statfs). | ||||
927 | */ | ||||
928 | switch (vp->v_type) { | ||||
929 | case VBLK: | ||||
930 | if (bdevsw[major(dev)(((unsigned)(dev) >> 8) & 0xff)].d_psize == 0 || | ||||
931 | (nblocks = (*bdevsw[major(dev)(((unsigned)(dev) >> 8) & 0xff)].d_psize)(dev)) == -1) { | ||||
932 | error = ENXIO6; | ||||
933 | goto bad; | ||||
934 | } | ||||
935 | break; | ||||
936 | |||||
937 | case VREG: | ||||
938 | if ((error = VOP_GETATTR(vp, &va, p->p_ucred, p))) | ||||
939 | goto bad; | ||||
940 | nblocks = (int)btodb(va.va_size)((va.va_size) >> 9); | ||||
941 | if ((error = | ||||
942 | VFS_STATFS(vp->v_mount, &vp->v_mount->mnt_stat, p)(*(vp->v_mount)->mnt_op->vfs_statfs)(vp->v_mount, &vp->v_mount->mnt_stat, p)) != 0) | ||||
943 | goto bad; | ||||
944 | |||||
945 | sdp->swd_bsize = vp->v_mount->mnt_stat.f_iosize; | ||||
946 | /* | ||||
947 | * limit the max # of outstanding I/O requests we issue | ||||
948 | * at any one time. take it easy on NFS servers. | ||||
949 | */ | ||||
950 | #if defined(NFSCLIENT1) | ||||
951 | if (vp->v_op == &nfs_vops) | ||||
952 | sdp->swd_maxactive = 2; /* XXX */ | ||||
953 | else | ||||
954 | #endif /* defined(NFSCLIENT) */ | ||||
955 | sdp->swd_maxactive = 8; /* XXX */ | ||||
956 | bufq_init(&sdp->swd_bufq, BUFQ_FIFO0); | ||||
957 | break; | ||||
958 | |||||
959 | default: | ||||
960 | error = ENXIO6; | ||||
961 | goto bad; | ||||
962 | } | ||||
963 | |||||
964 | /* | ||||
965 | * save nblocks in a safe place and convert to pages. | ||||
966 | */ | ||||
967 | |||||
968 | sdp->swd_nblksswd_se.se_nblks = nblocks; | ||||
969 | npages = dbtob((u_int64_t)nblocks)(((u_int64_t)nblocks) << 9) >> PAGE_SHIFT12; | ||||
970 | |||||
971 | /* | ||||
972 | * for block special files, we want to make sure that leave | ||||
973 | * the disklabel and bootblocks alone, so we arrange to skip | ||||
974 | * over them (arbitrarily choosing to skip PAGE_SIZE bytes). | ||||
975 | * note that because of this the "size" can be less than the | ||||
976 | * actual number of blocks on the device. | ||||
977 | */ | ||||
978 | if (vp->v_type == VBLK) { | ||||
979 | /* we use pages 1 to (size - 1) [inclusive] */ | ||||
980 | size = npages - 1; | ||||
981 | addr = 1; | ||||
982 | } else { | ||||
983 | /* we use pages 0 to (size - 1) [inclusive] */ | ||||
984 | size = npages; | ||||
985 | addr = 0; | ||||
986 | } | ||||
987 | |||||
988 | /* | ||||
989 | * make sure we have enough blocks for a reasonable sized swap | ||||
990 | * area. we want at least one page. | ||||
991 | */ | ||||
992 | |||||
993 | if (size < 1) { | ||||
994 | error = EINVAL22; | ||||
995 | goto bad; | ||||
996 | } | ||||
997 | |||||
998 | /* | ||||
999 | * now we need to allocate a blist to manage this swap device | ||||
1000 | */ | ||||
1001 | sdp->swd_blist = blist_create(npages); | ||||
1002 | /* mark all expect the `saved' region free. */ | ||||
1003 | blist_free(sdp->swd_blist, addr, size); | ||||
1004 | |||||
1005 | #ifdef HIBERNATE1 | ||||
1006 | /* | ||||
1007 | * Lock down the last region of primary disk swap, in case | ||||
1008 | * hibernate needs to place a signature there. | ||||
1009 | */ | ||||
1010 | if (dev == swdevt[0].sw_dev && vp->v_type == VBLK && size > 3 ) { | ||||
1011 | if (blist_fill(sdp->swd_blist, npages - 1, 1) != 1) | ||||
1012 | panic("hibernate reserve"); | ||||
1013 | } | ||||
1014 | #endif | ||||
1015 | |||||
1016 | /* add a ref to vp to reflect usage as a swap device. */ | ||||
1017 | vref(vp); | ||||
1018 | |||||
1019 | #ifdef UVM_SWAP_ENCRYPT1 | ||||
1020 | if (uvm_doswapencrypt) | ||||
1021 | uvm_swap_initcrypt(sdp, npages); | ||||
1022 | #endif | ||||
1023 | /* now add the new swapdev to the drum and enable. */ | ||||
1024 | swapdrum_add(sdp, npages); | ||||
1025 | sdp->swd_npages = size; | ||||
1026 | mtx_enter(&uvm_swap_data_lock); | ||||
1027 | sdp->swd_flagsswd_se.se_flags &= ~SWF_FAKE0x00000008; /* going live */ | ||||
1028 | sdp->swd_flagsswd_se.se_flags |= (SWF_INUSE0x00000001|SWF_ENABLE0x00000002); | ||||
1029 | uvmexp.swpages += size; | ||||
1030 | mtx_leave(&uvm_swap_data_lock); | ||||
1031 | return (0); | ||||
1032 | |||||
1033 | /* | ||||
1034 | * failure: clean up and return error. | ||||
1035 | */ | ||||
1036 | |||||
1037 | bad: | ||||
1038 | if (vp != rootvp) | ||||
1039 | (void)VOP_CLOSE(vp, FREAD0x0001|FWRITE0x0002, p->p_ucred, p); | ||||
1040 | return (error); | ||||
1041 | } | ||||
1042 | |||||
1043 | /* | ||||
1044 | * swap_off: stop swapping on swapdev | ||||
1045 | * | ||||
1046 | * => swap data should be locked, we will unlock. | ||||
1047 | */ | ||||
1048 | int | ||||
1049 | swap_off(struct proc *p, struct swapdev *sdp) | ||||
1050 | { | ||||
1051 | int npages = sdp->swd_npages; | ||||
1052 | int error = 0; | ||||
1053 | |||||
1054 | KASSERT(rw_write_held(&swap_syscall_lock))((rw_write_held(&swap_syscall_lock)) ? (void)0 : __assert ("diagnostic ", "/usr/src/sys/uvm/uvm_swap.c", 1054, "rw_write_held(&swap_syscall_lock)" )); | ||||
1055 | MUTEX_ASSERT_LOCKED(&uvm_swap_data_lock)do { if (((&uvm_swap_data_lock)->mtx_owner != ({struct cpu_info *__ci; asm volatile("movq %%gs:%P1,%0" : "=r" (__ci ) :"n" (__builtin_offsetof(struct cpu_info, ci_self))); __ci; })) && !(panicstr || db_active)) panic("mutex %p not held in %s" , (&uvm_swap_data_lock), __func__); } while (0); | ||||
1056 | |||||
1057 | /* disable the swap area being removed */ | ||||
1058 | sdp->swd_flagsswd_se.se_flags &= ~SWF_ENABLE0x00000002; | ||||
1059 | mtx_leave(&uvm_swap_data_lock); | ||||
1060 | |||||
1061 | /* | ||||
1062 | * the idea is to find all the pages that are paged out to this | ||||
1063 | * device, and page them all in. in uvm, swap-backed pageable | ||||
1064 | * memory can take two forms: aobjs and anons. call the | ||||
1065 | * swapoff hook for each subsystem to bring in pages. | ||||
1066 | */ | ||||
1067 | |||||
1068 | if (uao_swap_off(sdp->swd_drumoffset, | ||||
1069 | sdp->swd_drumoffset + sdp->swd_drumsize) || | ||||
1070 | amap_swap_off(sdp->swd_drumoffset, | ||||
1071 | sdp->swd_drumoffset + sdp->swd_drumsize)) { | ||||
1072 | error = ENOMEM12; | ||||
1073 | } else if (sdp->swd_npginuse > sdp->swd_npgbad) { | ||||
1074 | error = EBUSY16; | ||||
1075 | } | ||||
1076 | |||||
1077 | if (error) { | ||||
1078 | mtx_enter(&uvm_swap_data_lock); | ||||
1079 | sdp->swd_flagsswd_se.se_flags |= SWF_ENABLE0x00000002; | ||||
1080 | mtx_leave(&uvm_swap_data_lock); | ||||
1081 | return error; | ||||
1082 | } | ||||
1083 | |||||
1084 | /* | ||||
1085 | * done with the vnode and saved creds. | ||||
1086 | * drop our ref on the vnode before calling VOP_CLOSE() | ||||
1087 | * so that spec_close() can tell if this is the last close. | ||||
1088 | */ | ||||
1089 | if (sdp->swd_vp->v_type == VREG) { | ||||
1090 | crfree(sdp->swd_cred); | ||||
1091 | } | ||||
1092 | vrele(sdp->swd_vp); | ||||
1093 | if (sdp->swd_vp != rootvp) { | ||||
1094 | (void) VOP_CLOSE(sdp->swd_vp, FREAD0x0001|FWRITE0x0002, p->p_ucred, p); | ||||
1095 | } | ||||
1096 | |||||
1097 | mtx_enter(&uvm_swap_data_lock); | ||||
1098 | uvmexp.swpages -= npages; | ||||
1099 | |||||
1100 | if (swaplist_find(sdp->swd_vp, 1) == NULL((void *)0)) | ||||
1101 | panic("swap_off: swapdev not in list"); | ||||
1102 | swaplist_trim(); | ||||
1103 | mtx_leave(&uvm_swap_data_lock); | ||||
1104 | |||||
1105 | /* | ||||
1106 | * free all resources! | ||||
1107 | */ | ||||
1108 | extent_free(swapmap, sdp->swd_drumoffset, sdp->swd_drumsize, | ||||
1109 | EX_WAITOK0x0001); | ||||
1110 | blist_destroy(sdp->swd_blist); | ||||
1111 | /* free sdp->swd_path ? */ | ||||
1112 | free(sdp, M_VMSWAP92, sizeof(*sdp)); | ||||
1113 | return (0); | ||||
1114 | } | ||||
1115 | |||||
1116 | /* | ||||
1117 | * /dev/drum interface and i/o functions | ||||
1118 | */ | ||||
1119 | |||||
1120 | /* | ||||
1121 | * swstrategy: perform I/O on the drum | ||||
1122 | * | ||||
1123 | * => we must map the i/o request from the drum to the correct swapdev. | ||||
1124 | */ | ||||
1125 | void | ||||
1126 | swstrategy(struct buf *bp) | ||||
1127 | { | ||||
1128 | struct swapdev *sdp; | ||||
1129 | int s, pageno, bn; | ||||
1130 | |||||
1131 | /* | ||||
1132 | * convert block number to swapdev. note that swapdev can't | ||||
1133 | * be yanked out from under us because we are holding resources | ||||
1134 | * in it (i.e. the blocks we are doing I/O on). | ||||
1135 | */ | ||||
1136 | pageno = dbtob((u_int64_t)bp->b_blkno)(((u_int64_t)bp->b_blkno) << 9) >> PAGE_SHIFT12; | ||||
1137 | mtx_enter(&uvm_swap_data_lock); | ||||
1138 | sdp = swapdrum_getsdp(pageno); | ||||
1139 | mtx_leave(&uvm_swap_data_lock); | ||||
1140 | if (sdp == NULL((void *)0)) { | ||||
1141 | bp->b_error = EINVAL22; | ||||
1142 | bp->b_flags |= B_ERROR0x00000400; | ||||
1143 | s = splbio()splraise(0x3); | ||||
1144 | biodone(bp); | ||||
1145 | splx(s)spllower(s); | ||||
1146 | return; | ||||
1147 | } | ||||
1148 | |||||
1149 | /* convert drum page number to block number on this swapdev. */ | ||||
1150 | pageno -= sdp->swd_drumoffset; /* page # on swapdev */ | ||||
1151 | bn = btodb((u_int64_t)pageno << PAGE_SHIFT)(((u_int64_t)pageno << 12) >> 9); /* convert to diskblock */ | ||||
1152 | |||||
1153 | /* | ||||
1154 | * for block devices we finish up here. | ||||
1155 | * for regular files we have to do more work which we delegate | ||||
1156 | * to sw_reg_strategy(). | ||||
1157 | */ | ||||
1158 | switch (sdp->swd_vp->v_type) { | ||||
1159 | default: | ||||
1160 | panic("swstrategy: vnode type 0x%x", sdp->swd_vp->v_type); | ||||
1161 | case VBLK: | ||||
1162 | /* | ||||
1163 | * must convert "bp" from an I/O on /dev/drum to an I/O | ||||
1164 | * on the swapdev (sdp). | ||||
1165 | */ | ||||
1166 | s = splbio()splraise(0x3); | ||||
1167 | buf_replacevnode(bp, sdp->swd_vp); | ||||
1168 | |||||
1169 | bp->b_blkno = bn; | ||||
1170 | splx(s)spllower(s); | ||||
1171 | VOP_STRATEGY(bp->b_vp, bp); | ||||
1172 | return; | ||||
1173 | case VREG: | ||||
1174 | /* delegate to sw_reg_strategy function. */ | ||||
1175 | sw_reg_strategy(sdp, bp, bn); | ||||
1176 | return; | ||||
1177 | } | ||||
1178 | /* NOTREACHED */ | ||||
1179 | } | ||||
1180 | |||||
1181 | /* | ||||
1182 | * sw_reg_strategy: handle swap i/o to regular files | ||||
1183 | */ | ||||
1184 | void | ||||
1185 | sw_reg_strategy(struct swapdev *sdp, struct buf *bp, int bn) | ||||
1186 | { | ||||
1187 | struct vnode *vp; | ||||
1188 | struct vndxfer *vnx; | ||||
1189 | daddr_t nbn; | ||||
1190 | caddr_t addr; | ||||
1191 | off_t byteoff; | ||||
1192 | int s, off, nra, error, sz, resid; | ||||
1193 | |||||
1194 | /* | ||||
1195 | * allocate a vndxfer head for this transfer and point it to | ||||
1196 | * our buffer. | ||||
1197 | */ | ||||
1198 | vnx = pool_get(&vndxfer_pool, PR_WAITOK0x0001); | ||||
1199 | vnx->vx_flags = VX_BUSY1; | ||||
1200 | vnx->vx_error = 0; | ||||
1201 | vnx->vx_pending = 0; | ||||
1202 | vnx->vx_bp = bp; | ||||
1203 | vnx->vx_sdp = sdp; | ||||
1204 | |||||
1205 | /* | ||||
1206 | * setup for main loop where we read filesystem blocks into | ||||
1207 | * our buffer. | ||||
1208 | */ | ||||
1209 | error = 0; | ||||
1210 | bp->b_resid = bp->b_bcount; /* nothing transferred yet! */ | ||||
1211 | addr = bp->b_data; /* current position in buffer */ | ||||
1212 | byteoff = dbtob((u_int64_t)bn)(((u_int64_t)bn) << 9); | ||||
1213 | |||||
1214 | for (resid = bp->b_resid; resid; resid -= sz) { | ||||
1215 | struct vndbuf *nbp; | ||||
1216 | /* | ||||
1217 | * translate byteoffset into block number. return values: | ||||
1218 | * vp = vnode of underlying device | ||||
1219 | * nbn = new block number (on underlying vnode dev) | ||||
1220 | * nra = num blocks we can read-ahead (excludes requested | ||||
1221 | * block) | ||||
1222 | */ | ||||
1223 | nra = 0; | ||||
1224 | error = VOP_BMAP(sdp->swd_vp, byteoff / sdp->swd_bsize, | ||||
1225 | &vp, &nbn, &nra); | ||||
1226 | |||||
1227 | if (error == 0 && nbn == -1) { | ||||
1228 | /* | ||||
1229 | * this used to just set error, but that doesn't | ||||
1230 | * do the right thing. Instead, it causes random | ||||
1231 | * memory errors. The panic() should remain until | ||||
1232 | * this condition doesn't destabilize the system. | ||||
1233 | */ | ||||
1234 | #if 1 | ||||
1235 | panic("sw_reg_strategy: swap to sparse file"); | ||||
1236 | #else | ||||
1237 | error = EIO5; /* failure */ | ||||
1238 | #endif | ||||
1239 | } | ||||
1240 | |||||
1241 | /* | ||||
1242 | * punt if there was an error or a hole in the file. | ||||
1243 | * we must wait for any i/o ops we have already started | ||||
1244 | * to finish before returning. | ||||
1245 | * | ||||
1246 | * XXX we could deal with holes here but it would be | ||||
1247 | * a hassle (in the write case). | ||||
1248 | */ | ||||
1249 | if (error) { | ||||
1250 | s = splbio()splraise(0x3); | ||||
1251 | vnx->vx_error = error; /* pass error up */ | ||||
1252 | goto out; | ||||
1253 | } | ||||
1254 | |||||
1255 | /* | ||||
1256 | * compute the size ("sz") of this transfer (in bytes). | ||||
1257 | */ | ||||
1258 | off = byteoff % sdp->swd_bsize; | ||||
1259 | sz = (1 + nra) * sdp->swd_bsize - off; | ||||
1260 | if (sz > resid) | ||||
1261 | sz = resid; | ||||
1262 | |||||
1263 | /* | ||||
1264 | * now get a buf structure. note that the vb_buf is | ||||
1265 | * at the front of the nbp structure so that you can | ||||
1266 | * cast pointers between the two structure easily. | ||||
1267 | */ | ||||
1268 | nbp = pool_get(&vndbuf_pool, PR_WAITOK0x0001); | ||||
1269 | nbp->vb_buf.b_flags = bp->b_flags | B_CALL0x00000040; | ||||
1270 | nbp->vb_buf.b_bcount = sz; | ||||
1271 | nbp->vb_buf.b_bufsize = sz; | ||||
1272 | nbp->vb_buf.b_error = 0; | ||||
1273 | nbp->vb_buf.b_data = addr; | ||||
1274 | nbp->vb_buf.b_bq = NULL((void *)0); | ||||
1275 | nbp->vb_buf.b_blkno = nbn + btodb(off)((off) >> 9); | ||||
1276 | nbp->vb_buf.b_proc = bp->b_proc; | ||||
1277 | nbp->vb_buf.b_iodone = sw_reg_iodone; | ||||
1278 | nbp->vb_buf.b_vp = NULLVP((struct vnode *)((void *)0)); | ||||
1279 | nbp->vb_buf.b_vnbufs.le_next = NOLIST((struct buf *)0x87654321); | ||||
1280 | LIST_INIT(&nbp->vb_buf.b_dep)do { ((&nbp->vb_buf.b_dep)->lh_first) = ((void *)0) ; } while (0); | ||||
1281 | |||||
1282 | /* | ||||
1283 | * set b_dirtyoff/end and b_validoff/end. this is | ||||
1284 | * required by the NFS client code (otherwise it will | ||||
1285 | * just discard our I/O request). | ||||
1286 | */ | ||||
1287 | if (bp->b_dirtyend == 0) { | ||||
1288 | nbp->vb_buf.b_dirtyoff = 0; | ||||
1289 | nbp->vb_buf.b_dirtyend = sz; | ||||
1290 | } else { | ||||
1291 | nbp->vb_buf.b_dirtyoff = | ||||
1292 | max(0, bp->b_dirtyoff - (bp->b_bcount-resid)); | ||||
1293 | nbp->vb_buf.b_dirtyend = | ||||
1294 | min(sz, | ||||
1295 | max(0, bp->b_dirtyend - (bp->b_bcount-resid))); | ||||
1296 | } | ||||
1297 | if (bp->b_validend == 0) { | ||||
1298 | nbp->vb_buf.b_validoff = 0; | ||||
1299 | nbp->vb_buf.b_validend = sz; | ||||
1300 | } else { | ||||
1301 | nbp->vb_buf.b_validoff = | ||||
1302 | max(0, bp->b_validoff - (bp->b_bcount-resid)); | ||||
1303 | nbp->vb_buf.b_validend = | ||||
1304 | min(sz, | ||||
1305 | max(0, bp->b_validend - (bp->b_bcount-resid))); | ||||
1306 | } | ||||
1307 | |||||
1308 | /* patch it back to the vnx */ | ||||
1309 | nbp->vb_vnx = vnx; | ||||
1310 | task_set(&nbp->vb_task, sw_reg_iodone_internal, nbp); | ||||
1311 | |||||
1312 | s = splbio()splraise(0x3); | ||||
1313 | if (vnx->vx_error != 0) { | ||||
1314 | pool_put(&vndbuf_pool, nbp); | ||||
1315 | goto out; | ||||
1316 | } | ||||
1317 | vnx->vx_pending++; | ||||
1318 | |||||
1319 | /* assoc new buffer with underlying vnode */ | ||||
1320 | bgetvp(vp, &nbp->vb_buf); | ||||
1321 | |||||
1322 | /* start I/O if we are not over our limit */ | ||||
1323 | bufq_queue(&sdp->swd_bufq, &nbp->vb_buf); | ||||
1324 | sw_reg_start(sdp); | ||||
1325 | splx(s)spllower(s); | ||||
1326 | |||||
1327 | /* | ||||
1328 | * advance to the next I/O | ||||
1329 | */ | ||||
1330 | byteoff += sz; | ||||
1331 | addr += sz; | ||||
1332 | } | ||||
1333 | |||||
1334 | s = splbio()splraise(0x3); | ||||
1335 | |||||
1336 | out: /* Arrive here at splbio */ | ||||
1337 | vnx->vx_flags &= ~VX_BUSY1; | ||||
1338 | if (vnx->vx_pending == 0) { | ||||
1339 | if (vnx->vx_error != 0) { | ||||
1340 | bp->b_error = vnx->vx_error; | ||||
1341 | bp->b_flags |= B_ERROR0x00000400; | ||||
1342 | } | ||||
1343 | pool_put(&vndxfer_pool, vnx); | ||||
1344 | biodone(bp); | ||||
1345 | } | ||||
1346 | splx(s)spllower(s); | ||||
1347 | } | ||||
1348 | |||||
1349 | /* sw_reg_start: start an I/O request on the requested swapdev. */ | ||||
1350 | void | ||||
1351 | sw_reg_start(struct swapdev *sdp) | ||||
1352 | { | ||||
1353 | struct buf *bp; | ||||
1354 | |||||
1355 | /* XXX: recursion control */ | ||||
1356 | if ((sdp->swd_flagsswd_se.se_flags & SWF_BUSY0x00000004) != 0) | ||||
1357 | return; | ||||
1358 | |||||
1359 | sdp->swd_flagsswd_se.se_flags |= SWF_BUSY0x00000004; | ||||
1360 | |||||
1361 | while (sdp->swd_active < sdp->swd_maxactive) { | ||||
1362 | bp = bufq_dequeue(&sdp->swd_bufq); | ||||
1363 | if (bp == NULL((void *)0)) | ||||
1364 | break; | ||||
1365 | |||||
1366 | sdp->swd_active++; | ||||
1367 | |||||
1368 | if ((bp->b_flags & B_READ0x00008000) == 0) | ||||
1369 | bp->b_vp->v_numoutput++; | ||||
1370 | |||||
1371 | VOP_STRATEGY(bp->b_vp, bp); | ||||
1372 | } | ||||
1373 | sdp->swd_flagsswd_se.se_flags &= ~SWF_BUSY0x00000004; | ||||
1374 | } | ||||
1375 | |||||
1376 | /* | ||||
1377 | * sw_reg_iodone: one of our i/o's has completed and needs post-i/o cleanup | ||||
1378 | * | ||||
1379 | * => note that we can recover the vndbuf struct by casting the buf ptr | ||||
1380 | * | ||||
1381 | * XXX: | ||||
1382 | * We only put this onto a taskq here, because of the maxactive game since | ||||
1383 | * it basically requires us to call back into VOP_STRATEGY() (where we must | ||||
1384 | * be able to sleep) via sw_reg_start(). | ||||
1385 | */ | ||||
1386 | void | ||||
1387 | sw_reg_iodone(struct buf *bp) | ||||
1388 | { | ||||
1389 | struct vndbuf *vbp = (struct vndbuf *)bp; | ||||
1390 | task_add(systq, &vbp->vb_task); | ||||
1391 | } | ||||
1392 | |||||
1393 | void | ||||
1394 | sw_reg_iodone_internal(void *xvbp) | ||||
1395 | { | ||||
1396 | struct vndbuf *vbp = xvbp; | ||||
1397 | struct vndxfer *vnx = vbp->vb_vnx; | ||||
1398 | struct buf *pbp = vnx->vx_bp; /* parent buffer */ | ||||
1399 | struct swapdev *sdp = vnx->vx_sdp; | ||||
1400 | int resid, s; | ||||
1401 | |||||
1402 | s = splbio()splraise(0x3); | ||||
1403 | |||||
1404 | resid = vbp->vb_buf.b_bcount - vbp->vb_buf.b_resid; | ||||
1405 | pbp->b_resid -= resid; | ||||
1406 | vnx->vx_pending--; | ||||
1407 | |||||
1408 | /* pass error upward */ | ||||
1409 | if (vbp->vb_buf.b_error) | ||||
1410 | vnx->vx_error = vbp->vb_buf.b_error; | ||||
1411 | |||||
1412 | /* disassociate this buffer from the vnode (if any). */ | ||||
1413 | if (vbp->vb_buf.b_vp != NULL((void *)0)) { | ||||
1414 | brelvp(&vbp->vb_buf); | ||||
1415 | } | ||||
1416 | |||||
1417 | /* kill vbp structure */ | ||||
1418 | pool_put(&vndbuf_pool, vbp); | ||||
1419 | |||||
1420 | /* | ||||
1421 | * wrap up this transaction if it has run to completion or, in | ||||
1422 | * case of an error, when all auxiliary buffers have returned. | ||||
1423 | */ | ||||
1424 | if (vnx->vx_error != 0) { | ||||
1425 | /* pass error upward */ | ||||
1426 | pbp->b_flags |= B_ERROR0x00000400; | ||||
1427 | pbp->b_error = vnx->vx_error; | ||||
1428 | if ((vnx->vx_flags & VX_BUSY1) == 0 && vnx->vx_pending == 0) { | ||||
1429 | pool_put(&vndxfer_pool, vnx); | ||||
1430 | biodone(pbp); | ||||
1431 | } | ||||
1432 | } else if (pbp->b_resid == 0) { | ||||
1433 | KASSERT(vnx->vx_pending == 0)((vnx->vx_pending == 0) ? (void)0 : __assert("diagnostic " , "/usr/src/sys/uvm/uvm_swap.c", 1433, "vnx->vx_pending == 0" )); | ||||
1434 | if ((vnx->vx_flags & VX_BUSY1) == 0) { | ||||
1435 | pool_put(&vndxfer_pool, vnx); | ||||
1436 | biodone(pbp); | ||||
1437 | } | ||||
1438 | } | ||||
1439 | |||||
1440 | /* | ||||
1441 | * done! start next swapdev I/O if one is pending | ||||
1442 | */ | ||||
1443 | sdp->swd_active--; | ||||
1444 | sw_reg_start(sdp); | ||||
1445 | splx(s)spllower(s); | ||||
1446 | } | ||||
1447 | |||||
1448 | |||||
1449 | /* | ||||
1450 | * uvm_swap_alloc: allocate space on swap | ||||
1451 | * | ||||
1452 | * => allocation is done "round robin" down the priority list, as we | ||||
1453 | * allocate in a priority we "rotate" the tail queue. | ||||
1454 | * => space can be freed with uvm_swap_free | ||||
1455 | * => we return the page slot number in /dev/drum (0 == invalid slot) | ||||
1456 | * => we lock uvm_swap_data_lock | ||||
1457 | * => XXXMRG: "LESSOK" INTERFACE NEEDED TO EXTENT SYSTEM | ||||
1458 | */ | ||||
1459 | int | ||||
1460 | uvm_swap_alloc(int *nslots, boolean_t lessok) | ||||
1461 | { | ||||
1462 | struct swapdev *sdp; | ||||
1463 | struct swappri *spp; | ||||
1464 | |||||
1465 | /* | ||||
1466 | * no swap devices configured yet? definite failure. | ||||
1467 | */ | ||||
1468 | if (uvmexp.nswapdev < 1) | ||||
1469 | return 0; | ||||
1470 | |||||
1471 | /* | ||||
1472 | * lock data lock, convert slots into blocks, and enter loop | ||||
1473 | */ | ||||
1474 | KERNEL_ASSERT_LOCKED()((_kernel_lock_held()) ? (void)0 : __assert("diagnostic ", "/usr/src/sys/uvm/uvm_swap.c" , 1474, "_kernel_lock_held()")); | ||||
1475 | mtx_enter(&uvm_swap_data_lock); | ||||
1476 | |||||
1477 | ReTry: /* XXXMRG */ | ||||
1478 | LIST_FOREACH(spp, &swap_priority, spi_swappri)for((spp) = ((&swap_priority)->lh_first); (spp)!= ((void *)0); (spp) = ((spp)->spi_swappri.le_next)) { | ||||
1479 | TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next)for((sdp) = ((&spp->spi_swapdev)->tqh_first); (sdp) != ((void *)0); (sdp) = ((sdp)->swd_next.tqe_next)) { | ||||
1480 | swblk_t result; | ||||
1481 | |||||
1482 | /* if it's not enabled, then we can't swap from it */ | ||||
1483 | if ((sdp->swd_flagsswd_se.se_flags & SWF_ENABLE0x00000002) == 0) | ||||
1484 | continue; | ||||
1485 | if (sdp->swd_npginuse + *nslots > sdp->swd_npages) | ||||
1486 | continue; | ||||
1487 | result = blist_alloc(sdp->swd_blist, *nslots); | ||||
1488 | if (result == SWAPBLK_NONE((swblk_t)-1)) { | ||||
1489 | continue; | ||||
1490 | } | ||||
1491 | KASSERT(result < sdp->swd_drumsize)((result < sdp->swd_drumsize) ? (void)0 : __assert("diagnostic " , "/usr/src/sys/uvm/uvm_swap.c", 1491, "result < sdp->swd_drumsize" )); | ||||
1492 | |||||
1493 | /* | ||||
1494 | * successful allocation! now rotate the tailq. | ||||
1495 | */ | ||||
1496 | TAILQ_REMOVE(&spp->spi_swapdev, sdp, swd_next)do { if (((sdp)->swd_next.tqe_next) != ((void *)0)) (sdp)-> swd_next.tqe_next->swd_next.tqe_prev = (sdp)->swd_next. tqe_prev; else (&spp->spi_swapdev)->tqh_last = (sdp )->swd_next.tqe_prev; *(sdp)->swd_next.tqe_prev = (sdp) ->swd_next.tqe_next; ((sdp)->swd_next.tqe_prev) = ((void *)-1); ((sdp)->swd_next.tqe_next) = ((void *)-1); } while (0); | ||||
1497 | TAILQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next)do { (sdp)->swd_next.tqe_next = ((void *)0); (sdp)->swd_next .tqe_prev = (&spp->spi_swapdev)->tqh_last; *(&spp ->spi_swapdev)->tqh_last = (sdp); (&spp->spi_swapdev )->tqh_last = &(sdp)->swd_next.tqe_next; } while (0 ); | ||||
1498 | sdp->swd_npginuse += *nslots; | ||||
1499 | uvmexp.swpginuse += *nslots; | ||||
1500 | mtx_leave(&uvm_swap_data_lock); | ||||
1501 | /* done! return drum slot number */ | ||||
1502 | return result + sdp->swd_drumoffset; | ||||
1503 | } | ||||
1504 | } | ||||
1505 | |||||
1506 | /* XXXMRG: BEGIN HACK */ | ||||
1507 | if (*nslots > 1 && lessok) { | ||||
1508 | *nslots = 1; | ||||
1509 | /* XXXMRG: ugh! blist should support this for us */ | ||||
1510 | goto ReTry; | ||||
1511 | } | ||||
1512 | /* XXXMRG: END HACK */ | ||||
1513 | |||||
1514 | mtx_leave(&uvm_swap_data_lock); | ||||
1515 | return 0; /* failed */ | ||||
1516 | } | ||||
1517 | |||||
1518 | /* | ||||
1519 | * uvm_swapisfilled: return true if the amount of free space in swap is | ||||
1520 | * smaller than the size of a cluster. | ||||
1521 | * | ||||
1522 | * As long as some swap slots are being used by pages currently in memory, | ||||
1523 | * it is possible to reuse them. Even if the swap space has been completly | ||||
1524 | * filled we do not consider it full. | ||||
1525 | */ | ||||
1526 | int | ||||
1527 | uvm_swapisfilled(void) | ||||
1528 | { | ||||
1529 | int result; | ||||
1530 | |||||
1531 | mtx_enter(&uvm_swap_data_lock); | ||||
1532 | KASSERT(uvmexp.swpginuse <= uvmexp.swpages)((uvmexp.swpginuse <= uvmexp.swpages) ? (void)0 : __assert ("diagnostic ", "/usr/src/sys/uvm/uvm_swap.c", 1532, "uvmexp.swpginuse <= uvmexp.swpages" )); | ||||
1533 | result = (uvmexp.swpginuse + SWCLUSTPAGES((64 * 1024) >> 12)) >= uvmexp.swpages; | ||||
1534 | mtx_leave(&uvm_swap_data_lock); | ||||
1535 | |||||
1536 | return result; | ||||
1537 | } | ||||
1538 | |||||
1539 | /* | ||||
1540 | * uvm_swapisfull: return true if the amount of pages only in swap | ||||
1541 | * accounts for more than 99% of the total swap space. | ||||
1542 | * | ||||
1543 | */ | ||||
1544 | int | ||||
1545 | uvm_swapisfull(void) | ||||
1546 | { | ||||
1547 | int result; | ||||
1548 | |||||
1549 | mtx_enter(&uvm_swap_data_lock); | ||||
1550 | KASSERT(uvmexp.swpgonly <= uvmexp.swpages)((uvmexp.swpgonly <= uvmexp.swpages) ? (void)0 : __assert( "diagnostic ", "/usr/src/sys/uvm/uvm_swap.c", 1550, "uvmexp.swpgonly <= uvmexp.swpages" )); | ||||
1551 | result = (uvmexp.swpgonly >= ((long)uvmexp.swpages * 99 / 100)); | ||||
1552 | mtx_leave(&uvm_swap_data_lock); | ||||
1553 | |||||
1554 | return result; | ||||
1555 | } | ||||
1556 | |||||
1557 | /* | ||||
1558 | * uvm_swap_markbad: keep track of swap ranges where we've had i/o errors | ||||
1559 | * | ||||
1560 | * => we lock uvm_swap_data_lock | ||||
1561 | */ | ||||
1562 | void | ||||
1563 | uvm_swap_markbad(int startslot, int nslots) | ||||
1564 | { | ||||
1565 | struct swapdev *sdp; | ||||
1566 | |||||
1567 | mtx_enter(&uvm_swap_data_lock); | ||||
1568 | sdp = swapdrum_getsdp(startslot); | ||||
1569 | if (sdp != NULL((void *)0)) { | ||||
1570 | /* | ||||
1571 | * we just keep track of how many pages have been marked bad | ||||
1572 | * in this device, to make everything add up in swap_off(). | ||||
1573 | * we assume here that the range of slots will all be within | ||||
1574 | * one swap device. | ||||
1575 | */ | ||||
1576 | sdp->swd_npgbad += nslots; | ||||
1577 | } | ||||
1578 | mtx_leave(&uvm_swap_data_lock); | ||||
1579 | } | ||||
1580 | |||||
1581 | /* | ||||
1582 | * uvm_swap_free: free swap slots | ||||
1583 | * | ||||
1584 | * => this can be all or part of an allocation made by uvm_swap_alloc | ||||
1585 | * => we lock uvm_swap_data_lock | ||||
1586 | */ | ||||
1587 | void | ||||
1588 | uvm_swap_free(int startslot, int nslots) | ||||
1589 | { | ||||
1590 | struct swapdev *sdp; | ||||
1591 | |||||
1592 | /* | ||||
1593 | * ignore attempts to free the "bad" slot. | ||||
1594 | */ | ||||
1595 | |||||
1596 | if (startslot == SWSLOT_BAD(-1)) { | ||||
1597 | return; | ||||
1598 | } | ||||
1599 | |||||
1600 | /* | ||||
1601 | * convert drum slot offset back to sdp, free the blocks | ||||
1602 | * in the extent, and return. must hold pri lock to do | ||||
1603 | * lookup and access the extent. | ||||
1604 | */ | ||||
1605 | KERNEL_LOCK()_kernel_lock(); | ||||
1606 | mtx_enter(&uvm_swap_data_lock); | ||||
1607 | sdp = swapdrum_getsdp(startslot); | ||||
1608 | KASSERT(uvmexp.nswapdev >= 1)((uvmexp.nswapdev >= 1) ? (void)0 : __assert("diagnostic " , "/usr/src/sys/uvm/uvm_swap.c", 1608, "uvmexp.nswapdev >= 1" )); | ||||
1609 | KASSERT(sdp != NULL)((sdp != ((void *)0)) ? (void)0 : __assert("diagnostic ", "/usr/src/sys/uvm/uvm_swap.c" , 1609, "sdp != NULL")); | ||||
1610 | KASSERT(sdp->swd_npginuse >= nslots)((sdp->swd_npginuse >= nslots) ? (void)0 : __assert("diagnostic " , "/usr/src/sys/uvm/uvm_swap.c", 1610, "sdp->swd_npginuse >= nslots" )); | ||||
1611 | blist_free(sdp->swd_blist, startslot - sdp->swd_drumoffset, nslots); | ||||
1612 | sdp->swd_npginuse -= nslots; | ||||
1613 | uvmexp.swpginuse -= nslots; | ||||
1614 | mtx_leave(&uvm_swap_data_lock); | ||||
1615 | |||||
1616 | #ifdef UVM_SWAP_ENCRYPT1 | ||||
1617 | { | ||||
1618 | int i; | ||||
1619 | if (swap_encrypt_initialized) { | ||||
1620 | /* Dereference keys */ | ||||
1621 | for (i = 0; i < nslots; i++) | ||||
1622 | if (uvm_swap_needdecrypt(sdp, startslot + i)) { | ||||
1623 | struct swap_key *key; | ||||
1624 | |||||
1625 | key = SWD_KEY(sdp, startslot + i)&((sdp)->swd_keys[((startslot + i) - (sdp)->swd_drumoffset ) >> 7]); | ||||
1626 | if (key->refcount != 0) | ||||
1627 | SWAP_KEY_PUT(sdp, key)do { (key)->refcount--; if ((key)->refcount == 0) { swap_key_delete (key); } } while(0);; | ||||
1628 | } | ||||
1629 | |||||
1630 | /* Mark range as not decrypt */ | ||||
1631 | uvm_swap_markdecrypt(sdp, startslot, nslots, 0); | ||||
1632 | } | ||||
1633 | } | ||||
1634 | #endif /* UVM_SWAP_ENCRYPT */ | ||||
1635 | KERNEL_UNLOCK()_kernel_unlock(); | ||||
1636 | } | ||||
1637 | |||||
1638 | /* | ||||
1639 | * uvm_swap_put: put any number of pages into a contig place on swap | ||||
1640 | * | ||||
1641 | * => can be sync or async | ||||
1642 | */ | ||||
1643 | int | ||||
1644 | uvm_swap_put(int swslot, struct vm_page **ppsp, int npages, int flags) | ||||
1645 | { | ||||
1646 | int result; | ||||
1647 | |||||
1648 | result = uvm_swap_io(ppsp, swslot, npages, B_WRITE0x00000000 | | ||||
1649 | ((flags & PGO_SYNCIO0x002) ? 0 : B_ASYNC0x00000004)); | ||||
1650 | |||||
1651 | return (result); | ||||
1652 | } | ||||
1653 | |||||
1654 | /* | ||||
1655 | * uvm_swap_get: get a single page from swap | ||||
1656 | * | ||||
1657 | * => usually a sync op (from fault) | ||||
1658 | */ | ||||
1659 | int | ||||
1660 | uvm_swap_get(struct vm_page *page, int swslot, int flags) | ||||
1661 | { | ||||
1662 | int result; | ||||
1663 | |||||
1664 | atomic_inc_int(&uvmexp.nswget)_atomic_inc_int(&uvmexp.nswget); | ||||
1665 | KASSERT(flags & PGO_SYNCIO)((flags & 0x002) ? (void)0 : __assert("diagnostic ", "/usr/src/sys/uvm/uvm_swap.c" , 1665, "flags & PGO_SYNCIO")); | ||||
1666 | if (swslot == SWSLOT_BAD(-1)) { | ||||
1667 | return VM_PAGER_ERROR4; | ||||
1668 | } | ||||
1669 | |||||
1670 | KERNEL_LOCK()_kernel_lock(); | ||||
1671 | result = uvm_swap_io(&page, swslot, 1, B_READ0x00008000); | ||||
1672 | KERNEL_UNLOCK()_kernel_unlock(); | ||||
1673 | |||||
1674 | if (result == VM_PAGER_OK0 || result == VM_PAGER_PEND3) { | ||||
1675 | /* | ||||
1676 | * this page is no longer only in swap. | ||||
1677 | */ | ||||
1678 | atomic_dec_int(&uvmexp.swpgonly)_atomic_dec_int(&uvmexp.swpgonly); | ||||
1679 | } | ||||
1680 | return (result); | ||||
1681 | } | ||||
1682 | |||||
1683 | /* | ||||
1684 | * uvm_swap_io: do an i/o operation to swap | ||||
1685 | */ | ||||
1686 | |||||
1687 | int | ||||
1688 | uvm_swap_io(struct vm_page **pps, int startslot, int npages, int flags) | ||||
1689 | { | ||||
1690 | daddr_t startblk; | ||||
1691 | struct buf *bp; | ||||
1692 | vaddr_t kva; | ||||
1693 | int result, s, mapinflags, pflag, bounce = 0, i; | ||||
1694 | boolean_t write, async; | ||||
1695 | vaddr_t bouncekva; | ||||
1696 | struct vm_page *tpps[SWCLUSTPAGES((64 * 1024) >> 12)]; | ||||
1697 | int pdaemon = (curproc({struct cpu_info *__ci; asm volatile("movq %%gs:%P1,%0" : "=r" (__ci) :"n" (__builtin_offsetof(struct cpu_info, ci_self))); __ci;})->ci_curproc == uvm.pagedaemon_proc); | ||||
1698 | #ifdef UVM_SWAP_ENCRYPT1 | ||||
1699 | struct swapdev *sdp; | ||||
| |||||
1700 | int encrypt = 0; | ||||
1701 | #endif | ||||
1702 | |||||
1703 | KERNEL_ASSERT_LOCKED()((_kernel_lock_held()) ? (void)0 : __assert("diagnostic ", "/usr/src/sys/uvm/uvm_swap.c" , 1703, "_kernel_lock_held()")); | ||||
1704 | |||||
1705 | write = (flags & B_READ0x00008000) == 0; | ||||
1706 | async = (flags & B_ASYNC0x00000004) != 0; | ||||
1707 | |||||
1708 | /* convert starting drum slot to block number */ | ||||
1709 | startblk = btodb((u_int64_t)startslot << PAGE_SHIFT)(((u_int64_t)startslot << 12) >> 9); | ||||
1710 | |||||
1711 | pflag = (async
| ||||
1712 | bp = pool_get(&bufpool, pflag | PR_ZERO0x0008); | ||||
1713 | if (bp == NULL((void *)0)) | ||||
1714 | return (VM_PAGER_AGAIN5); | ||||
1715 | |||||
1716 | /* | ||||
1717 | * map the pages into the kernel (XXX: currently required | ||||
1718 | * by buffer system). | ||||
1719 | */ | ||||
1720 | mapinflags = !write
| ||||
1721 | if (!async
| ||||
1722 | mapinflags |= UVMPAGER_MAPIN_WAITOK0x01; | ||||
1723 | kva = uvm_pagermapin(pps, npages, mapinflags); | ||||
1724 | if (kva == 0) { | ||||
1725 | pool_put(&bufpool, bp); | ||||
1726 | return (VM_PAGER_AGAIN5); | ||||
1727 | } | ||||
1728 | |||||
1729 | #ifdef UVM_SWAP_ENCRYPT1 | ||||
1730 | if (write
| ||||
1731 | /* | ||||
1732 | * Check if we need to do swap encryption on old pages. | ||||
1733 | * Later we need a different scheme, that swap encrypts | ||||
1734 | * all pages of a process that had at least one page swap | ||||
1735 | * encrypted. Then we might not need to copy all pages | ||||
1736 | * in the cluster, and avoid the memory overheard in | ||||
1737 | * swapping. | ||||
1738 | */ | ||||
1739 | if (uvm_doswapencrypt) | ||||
1740 | encrypt = 1; | ||||
1741 | } | ||||
1742 | |||||
1743 | if (swap_encrypt_initialized || encrypt
| ||||
1744 | /* | ||||
1745 | * we need to know the swap device that we are swapping to/from | ||||
1746 | * to see if the pages need to be marked for decryption or | ||||
1747 | * actually need to be decrypted. | ||||
1748 | * XXX - does this information stay the same over the whole | ||||
1749 | * execution of this function? | ||||
1750 | */ | ||||
1751 | mtx_enter(&uvm_swap_data_lock); | ||||
1752 | sdp = swapdrum_getsdp(startslot); | ||||
1753 | mtx_leave(&uvm_swap_data_lock); | ||||
1754 | } | ||||
1755 | |||||
1756 | /* | ||||
1757 | * Check that we are dma capable for read (write always bounces | ||||
1758 | * through the swapencrypt anyway... | ||||
1759 | */ | ||||
1760 | if (write
| ||||
1761 | bounce = 1; /* bounce through swapencrypt always */ | ||||
1762 | } else { | ||||
1763 | #else | ||||
1764 | { | ||||
1765 | #endif | ||||
1766 | |||||
1767 | for (i = 0; i < npages; i++) { | ||||
1768 | if (VM_PAGE_TO_PHYS(pps[i])((pps[i])->phys_addr) < dma_constraint.ucr_low || | ||||
1769 | VM_PAGE_TO_PHYS(pps[i])((pps[i])->phys_addr) > dma_constraint.ucr_high) { | ||||
1770 | bounce = 1; | ||||
1771 | break; | ||||
1772 | } | ||||
1773 | } | ||||
1774 | } | ||||
1775 | |||||
1776 | if (bounce
| ||||
1777 | int swmapflags, plaflags; | ||||
1778 | |||||
1779 | /* We always need write access. */ | ||||
1780 | swmapflags = UVMPAGER_MAPIN_READ0x02; | ||||
1781 | plaflags = UVM_PLA_NOWAIT0x0002; | ||||
1782 | if (!async) { | ||||
1783 | swmapflags |= UVMPAGER_MAPIN_WAITOK0x01; | ||||
1784 | plaflags = UVM_PLA_WAITOK0x0001; | ||||
1785 | } | ||||
1786 | if (uvm_swap_allocpages(tpps, npages, plaflags)) { | ||||
1787 | pool_put(&bufpool, bp); | ||||
1788 | uvm_pagermapout(kva, npages); | ||||
1789 | return (VM_PAGER_AGAIN5); | ||||
1790 | } | ||||
1791 | |||||
1792 | bouncekva = uvm_pagermapin(tpps, npages, swmapflags); | ||||
1793 | if (bouncekva == 0) { | ||||
1794 | pool_put(&bufpool, bp); | ||||
1795 | uvm_pagermapout(kva, npages); | ||||
1796 | uvm_swap_freepages(tpps, npages); | ||||
1797 | return (VM_PAGER_AGAIN5); | ||||
1798 | } | ||||
1799 | } | ||||
1800 | |||||
1801 | /* encrypt to swap */ | ||||
1802 | if (write
| ||||
1803 | int i, opages; | ||||
1804 | caddr_t src, dst; | ||||
1805 | u_int64_t block; | ||||
1806 | |||||
1807 | src = (caddr_t) kva; | ||||
1808 | dst = (caddr_t) bouncekva; | ||||
1809 | block = startblk; | ||||
1810 | for (i = 0; i < npages; i++) { | ||||
1811 | #ifdef UVM_SWAP_ENCRYPT1 | ||||
1812 | struct swap_key *key; | ||||
1813 | |||||
1814 | if (encrypt) { | ||||
1815 | key = SWD_KEY(sdp, startslot + i)&((sdp)->swd_keys[((startslot + i) - (sdp)->swd_drumoffset ) >> 7]); | ||||
1816 | SWAP_KEY_GET(sdp, key)do { if ((key)->refcount == 0) { swap_key_create(key); } ( key)->refcount++; } while(0);; /* add reference */ | ||||
1817 | |||||
1818 | swap_encrypt(key, src, dst, block, PAGE_SIZE(1 << 12)); | ||||
1819 | block += btodb(PAGE_SIZE)(((1 << 12)) >> 9); | ||||
1820 | } else { | ||||
1821 | #else | ||||
1822 | { | ||||
1823 | #endif /* UVM_SWAP_ENCRYPT */ | ||||
1824 | memcpy(dst, src, PAGE_SIZE)__builtin_memcpy((dst), (src), ((1 << 12))); | ||||
1825 | } | ||||
1826 | /* this just tells async callbacks to free */ | ||||
1827 | atomic_setbits_intx86_atomic_setbits_u32(&tpps[i]->pg_flags, PQ_ENCRYPT0x00400000); | ||||
1828 | src += PAGE_SIZE(1 << 12); | ||||
1829 | dst += PAGE_SIZE(1 << 12); | ||||
1830 | } | ||||
1831 | |||||
1832 | uvm_pagermapout(kva, npages); | ||||
1833 | |||||
1834 | /* dispose of pages we dont use anymore */ | ||||
1835 | opages = npages; | ||||
1836 | uvm_pager_dropcluster(NULL((void *)0), NULL((void *)0), pps, &opages, | ||||
1837 | PGO_PDFREECLUST0x080); | ||||
1838 | |||||
1839 | kva = bouncekva; | ||||
1840 | } | ||||
1841 | |||||
1842 | /* | ||||
1843 | * prevent ASYNC reads. | ||||
1844 | * uvm_swap_io is only called from uvm_swap_get, uvm_swap_get | ||||
1845 | * assumes that all gets are SYNCIO. Just make sure here. | ||||
1846 | * XXXARTUBC - might not be true anymore. | ||||
1847 | */ | ||||
1848 | if (!write
| ||||
1849 | flags &= ~B_ASYNC0x00000004; | ||||
1850 | async = 0; | ||||
1851 | } | ||||
1852 | |||||
1853 | /* | ||||
1854 | * fill in the bp. we currently route our i/o through | ||||
1855 | * /dev/drum's vnode [swapdev_vp]. | ||||
1856 | */ | ||||
1857 | bp->b_flags = B_BUSY0x00000010 | B_NOCACHE0x00001000 | B_RAW0x00004000 | (flags & (B_READ0x00008000|B_ASYNC0x00000004)); | ||||
1858 | bp->b_proc = &proc0; /* XXX */ | ||||
1859 | bp->b_vnbufs.le_next = NOLIST((struct buf *)0x87654321); | ||||
1860 | if (bounce
| ||||
1861 | bp->b_data = (caddr_t)bouncekva; | ||||
1862 | else | ||||
1863 | bp->b_data = (caddr_t)kva; | ||||
1864 | bp->b_bq = NULL((void *)0); | ||||
1865 | bp->b_blkno = startblk; | ||||
1866 | LIST_INIT(&bp->b_dep)do { ((&bp->b_dep)->lh_first) = ((void *)0); } while (0); | ||||
1867 | s = splbio()splraise(0x3); | ||||
1868 | bp->b_vp = NULL((void *)0); | ||||
1869 | buf_replacevnode(bp, swapdev_vp); | ||||
1870 | splx(s)spllower(s); | ||||
1871 | bp->b_bufsize = bp->b_bcount = (long)npages << PAGE_SHIFT12; | ||||
1872 | |||||
1873 | /* | ||||
1874 | * for pageouts we must set "dirtyoff" [NFS client code needs it]. | ||||
1875 | * and we bump v_numoutput (counter of number of active outputs). | ||||
1876 | */ | ||||
1877 | if (write
| ||||
1878 | bp->b_dirtyoff = 0; | ||||
1879 | bp->b_dirtyend = npages << PAGE_SHIFT12; | ||||
1880 | #ifdef UVM_SWAP_ENCRYPT1 | ||||
1881 | /* mark the pages in the drum for decryption */ | ||||
1882 | if (swap_encrypt_initialized) | ||||
1883 | uvm_swap_markdecrypt(sdp, startslot, npages, encrypt); | ||||
| |||||
1884 | #endif | ||||
1885 | s = splbio()splraise(0x3); | ||||
1886 | swapdev_vp->v_numoutput++; | ||||
1887 | splx(s)spllower(s); | ||||
1888 | } | ||||
1889 | |||||
1890 | /* for async ops we must set up the iodone handler. */ | ||||
1891 | if (async) { | ||||
1892 | bp->b_flags |= B_CALL0x00000040 | (pdaemon ? B_PDAEMON0x00200000 : 0); | ||||
1893 | bp->b_iodone = uvm_aio_biodone; | ||||
1894 | } | ||||
1895 | |||||
1896 | /* now we start the I/O, and if async, return. */ | ||||
1897 | VOP_STRATEGY(bp->b_vp, bp); | ||||
1898 | if (async) | ||||
1899 | return (VM_PAGER_PEND3); | ||||
1900 | |||||
1901 | /* must be sync i/o. wait for it to finish */ | ||||
1902 | (void) biowait(bp); | ||||
1903 | result = (bp->b_flags & B_ERROR0x00000400) ? VM_PAGER_ERROR4 : VM_PAGER_OK0; | ||||
1904 | |||||
1905 | /* decrypt swap */ | ||||
1906 | if (!write && !(bp->b_flags & B_ERROR0x00000400)) { | ||||
1907 | int i; | ||||
1908 | caddr_t data = (caddr_t)kva; | ||||
1909 | caddr_t dst = (caddr_t)kva; | ||||
1910 | u_int64_t block = startblk; | ||||
1911 | |||||
1912 | if (bounce) | ||||
1913 | data = (caddr_t)bouncekva; | ||||
1914 | |||||
1915 | for (i = 0; i < npages; i++) { | ||||
1916 | #ifdef UVM_SWAP_ENCRYPT1 | ||||
1917 | struct swap_key *key; | ||||
1918 | |||||
1919 | /* Check if we need to decrypt */ | ||||
1920 | if (swap_encrypt_initialized && | ||||
1921 | uvm_swap_needdecrypt(sdp, startslot + i)) { | ||||
1922 | key = SWD_KEY(sdp, startslot + i)&((sdp)->swd_keys[((startslot + i) - (sdp)->swd_drumoffset ) >> 7]); | ||||
1923 | if (key->refcount == 0) { | ||||
1924 | result = VM_PAGER_ERROR4; | ||||
1925 | break; | ||||
1926 | } | ||||
1927 | swap_decrypt(key, data, dst, block, PAGE_SIZE(1 << 12)); | ||||
1928 | } else if (bounce) { | ||||
1929 | #else | ||||
1930 | if (bounce) { | ||||
1931 | #endif | ||||
1932 | memcpy(dst, data, PAGE_SIZE)__builtin_memcpy((dst), (data), ((1 << 12))); | ||||
1933 | } | ||||
1934 | data += PAGE_SIZE(1 << 12); | ||||
1935 | dst += PAGE_SIZE(1 << 12); | ||||
1936 | block += btodb(PAGE_SIZE)(((1 << 12)) >> 9); | ||||
1937 | } | ||||
1938 | if (bounce) | ||||
1939 | uvm_pagermapout(bouncekva, npages); | ||||
1940 | } | ||||
1941 | /* kill the pager mapping */ | ||||
1942 | uvm_pagermapout(kva, npages); | ||||
1943 | |||||
1944 | /* Not anymore needed, free after encryption/bouncing */ | ||||
1945 | if (!write && bounce) | ||||
1946 | uvm_swap_freepages(tpps, npages); | ||||
1947 | |||||
1948 | /* now dispose of the buf */ | ||||
1949 | s = splbio()splraise(0x3); | ||||
1950 | if (bp->b_vp) | ||||
1951 | brelvp(bp); | ||||
1952 | |||||
1953 | if (write && bp->b_vp) | ||||
1954 | vwakeup(bp->b_vp); | ||||
1955 | pool_put(&bufpool, bp); | ||||
1956 | splx(s)spllower(s); | ||||
1957 | |||||
1958 | /* finally return. */ | ||||
1959 | return (result); | ||||
1960 | } | ||||
1961 | |||||
1962 | void | ||||
1963 | swapmount(void) | ||||
1964 | { | ||||
1965 | struct swapdev *sdp; | ||||
1966 | struct swappri *spp; | ||||
1967 | struct vnode *vp; | ||||
1968 | dev_t swap_dev = swdevt[0].sw_dev; | ||||
1969 | char *nam; | ||||
1970 | char path[MNAMELEN90 + 1]; | ||||
1971 | |||||
1972 | if (swap_dev == NODEV(dev_t)(-1)) | ||||
1973 | return; | ||||
1974 | |||||
1975 | rw_enter_write(&swap_syscall_lock); | ||||
1976 | |||||
1977 | #if defined(NFSCLIENT1) | ||||
1978 | if (swap_dev == NETDEV(dev_t)(-2)) { | ||||
1979 | extern struct nfs_diskless nfs_diskless; | ||||
1980 | |||||
1981 | snprintf(path, sizeof(path), "%s", | ||||
1982 | nfs_diskless.nd_swap.ndm_host); | ||||
1983 | vp = nfs_diskless.sw_vp; | ||||
1984 | goto gotit; | ||||
1985 | } else | ||||
1986 | #endif | ||||
1987 | if (bdevvp(swap_dev, &vp)) { | ||||
1988 | rw_exit_write(&swap_syscall_lock); | ||||
1989 | return; | ||||
1990 | } | ||||
1991 | |||||
1992 | /* Construct a potential path to swap */ | ||||
1993 | if ((nam = findblkname(major(swap_dev)(((unsigned)(swap_dev) >> 8) & 0xff)))) | ||||
1994 | snprintf(path, sizeof(path), "/dev/%s%d%c", nam, | ||||
1995 | DISKUNIT(swap_dev)(((unsigned)((swap_dev) & 0xff) | (((swap_dev) & 0xffff0000 ) >> 8)) / 16), 'a' + DISKPART(swap_dev)(((unsigned)((swap_dev) & 0xff) | (((swap_dev) & 0xffff0000 ) >> 8)) % 16)); | ||||
1996 | else | ||||
1997 | snprintf(path, sizeof(path), "blkdev0x%x", | ||||
1998 | swap_dev); | ||||
1999 | |||||
2000 | #if defined(NFSCLIENT1) | ||||
2001 | gotit: | ||||
2002 | #endif | ||||
2003 | sdp = malloc(sizeof(*sdp), M_VMSWAP92, M_WAITOK0x0001|M_ZERO0x0008); | ||||
2004 | spp = malloc(sizeof(*spp), M_VMSWAP92, M_WAITOK0x0001); | ||||
2005 | |||||
2006 | sdp->swd_flagsswd_se.se_flags = SWF_FAKE0x00000008; | ||||
2007 | sdp->swd_devswd_se.se_dev = swap_dev; | ||||
2008 | |||||
2009 | sdp->swd_pathlen = strlen(path) + 1; | ||||
2010 | sdp->swd_path = malloc(sdp->swd_pathlen, M_VMSWAP92, M_WAITOK0x0001 | M_ZERO0x0008); | ||||
2011 | strlcpy(sdp->swd_path, path, sdp->swd_pathlen); | ||||
2012 | |||||
2013 | sdp->swd_vp = vp; | ||||
2014 | |||||
2015 | mtx_enter(&uvm_swap_data_lock); | ||||
2016 | swaplist_insert(sdp, spp, 0); | ||||
2017 | mtx_leave(&uvm_swap_data_lock); | ||||
2018 | |||||
2019 | if (swap_on(curproc({struct cpu_info *__ci; asm volatile("movq %%gs:%P1,%0" : "=r" (__ci) :"n" (__builtin_offsetof(struct cpu_info, ci_self))); __ci;})->ci_curproc, sdp)) { | ||||
2020 | mtx_enter(&uvm_swap_data_lock); | ||||
2021 | swaplist_find(vp, 1); | ||||
2022 | swaplist_trim(); | ||||
2023 | vput(sdp->swd_vp); | ||||
2024 | mtx_leave(&uvm_swap_data_lock); | ||||
2025 | rw_exit_write(&swap_syscall_lock); | ||||
2026 | free(sdp->swd_path, M_VMSWAP92, sdp->swd_pathlen); | ||||
2027 | free(sdp, M_VMSWAP92, sizeof(*sdp)); | ||||
2028 | return; | ||||
2029 | } | ||||
2030 | rw_exit_write(&swap_syscall_lock); | ||||
2031 | } | ||||
2032 | |||||
2033 | #ifdef HIBERNATE1 | ||||
2034 | int | ||||
2035 | uvm_hibswap(dev_t dev, u_long *sp, u_long *ep) | ||||
2036 | { | ||||
2037 | struct swapdev *sdp, *swd = NULL((void *)0); | ||||
2038 | struct swappri *spp; | ||||
2039 | |||||
2040 | /* no swap devices configured yet? */ | ||||
2041 | if (uvmexp.nswapdev < 1 || dev != swdevt[0].sw_dev) | ||||
2042 | return (1); | ||||
2043 | |||||
2044 | LIST_FOREACH(spp, &swap_priority, spi_swappri)for((spp) = ((&swap_priority)->lh_first); (spp)!= ((void *)0); (spp) = ((spp)->spi_swappri.le_next)) { | ||||
2045 | TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next)for((sdp) = ((&spp->spi_swapdev)->tqh_first); (sdp) != ((void *)0); (sdp) = ((sdp)->swd_next.tqe_next)) { | ||||
2046 | if (sdp->swd_devswd_se.se_dev == dev) | ||||
2047 | swd = sdp; | ||||
2048 | } | ||||
2049 | } | ||||
2050 | |||||
2051 | if (swd == NULL((void *)0) || (swd->swd_flagsswd_se.se_flags & SWF_ENABLE0x00000002) == 0) | ||||
2052 | return (1); | ||||
2053 | |||||
2054 | blist_gapfind(swd->swd_blist, sp, ep); | ||||
2055 | |||||
2056 | if (*ep - *sp == 0) | ||||
2057 | /* no gap found */ | ||||
2058 | return (1); | ||||
2059 | |||||
2060 | /* | ||||
2061 | * blist_gapfind returns the gap as [sp,ep[ , | ||||
2062 | * whereas [sp,ep] is expected from uvm_hibswap(). | ||||
2063 | */ | ||||
2064 | *ep -= 1; | ||||
2065 | |||||
2066 | return (0); | ||||
2067 | } | ||||
2068 | #endif /* HIBERNATE */ | ||||
2069 | |||||
2070 | #ifdef DDB1 | ||||
2071 | void | ||||
2072 | swap_print_all(int (*pr)(const char *, ...)) | ||||
2073 | { | ||||
2074 | struct swappri *spp; | ||||
2075 | struct swapdev *sdp; | ||||
2076 | |||||
2077 | LIST_FOREACH(spp, &swap_priority, spi_swappri)for((spp) = ((&swap_priority)->lh_first); (spp)!= ((void *)0); (spp) = ((spp)->spi_swappri.le_next)) { | ||||
2078 | TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next)for((sdp) = ((&spp->spi_swapdev)->tqh_first); (sdp) != ((void *)0); (sdp) = ((sdp)->swd_next.tqe_next)) { | ||||
2079 | #ifdef HIBERNATE1 | ||||
2080 | u_long bgap = 0, egap = 0; | ||||
2081 | #endif | ||||
2082 | |||||
2083 | pr("swap %p path \"%s\" flags 0x%x\n", sdp, | ||||
2084 | sdp->swd_path, sdp->swd_flagsswd_se.se_flags); | ||||
2085 | |||||
2086 | blist_print(sdp->swd_blist); | ||||
2087 | |||||
2088 | #ifdef HIBERNATE1 | ||||
2089 | if (!uvm_hibswap(sdp->swd_devswd_se.se_dev, &bgap, &egap)) | ||||
2090 | pr("hibernate gap: [0x%lx, 0x%lx] size=%lu\n", | ||||
2091 | bgap, egap, (egap - bgap + 1)); | ||||
2092 | else | ||||
2093 | pr("hibernate gap: not found\n"); | ||||
2094 | #endif | ||||
2095 | } | ||||
2096 | } | ||||
2097 | } | ||||
2098 | #endif /* DDB */ |