Bug Summary

File: kern/vfs_bio.c
Warning: line 749, column 2
Access to field 'v_numoutput' results in a dereference of a null pointer (loaded from field 'b_vp')
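
Editor's note: the path below turns on two facts about bwrite(). The only NULL
test on bp->b_vp (line 693) is used solely to choose a mount point for the
statistics bookkeeping, and line 749 then dereferences bp->b_vp
unconditionally. The stripped-down sketch that follows is an illustration of
that shape, not OpenBSD code; every name in it (demo_vnode, demo_buf,
demo_write) is hypothetical. Running clang --analyze with the core checkers
over it should produce the same kind of null-dereference diagnostic.

/*
 * Editorial sketch of the reported pattern.  A pointer is checked for
 * NULL once, only to select a secondary value, and is later
 * dereferenced with no guard on the same path.
 */
#include <stddef.h>

struct demo_vnode { int v_numoutput; };
struct demo_buf { struct demo_vnode *b_vp; };

int
demo_write(struct demo_buf *bp)
{
	struct demo_vnode *vp = bp->b_vp;
	int have_mp;

	/* The lone NULL check: it only picks a flag value. */
	if (vp != NULL)		/* analyzer: assume vp == NULL here */
		have_mp = 1;
	else
		have_mp = 0;

	/* Unconditional dereference of the same field: flagged. */
	bp->b_vp->v_numoutput++;

	return (have_mp);
}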

Annotated Source Code


clang -cc1 -cc1 -triple amd64-unknown-openbsd7.4 -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name vfs_bio.c -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -mrelocation-model static -mframe-pointer=all -relaxed-aliasing -ffp-contract=on -fno-rounding-math -mconstructor-aliases -ffreestanding -mcmodel=kernel -target-cpu x86-64 -target-feature +retpoline-indirect-calls -target-feature +retpoline-indirect-branches -target-feature -sse2 -target-feature -sse -target-feature -3dnow -target-feature -mmx -target-feature +save-args -target-feature +retpoline-external-thunk -disable-red-zone -no-implicit-float -tune-cpu generic -debugger-tuning=gdb -fcoverage-compilation-dir=/usr/src/sys/arch/amd64/compile/GENERIC.MP/obj -nostdsysteminc -nobuiltininc -resource-dir /usr/local/llvm16/lib/clang/16 -I /usr/src/sys -I /usr/src/sys/arch/amd64/compile/GENERIC.MP/obj -I /usr/src/sys/arch -I /usr/src/sys/dev/pci/drm/include -I /usr/src/sys/dev/pci/drm/include/uapi -I /usr/src/sys/dev/pci/drm/amd/include/asic_reg -I /usr/src/sys/dev/pci/drm/amd/include -I /usr/src/sys/dev/pci/drm/amd/amdgpu -I /usr/src/sys/dev/pci/drm/amd/display -I /usr/src/sys/dev/pci/drm/amd/display/include -I /usr/src/sys/dev/pci/drm/amd/display/dc -I /usr/src/sys/dev/pci/drm/amd/display/amdgpu_dm -I /usr/src/sys/dev/pci/drm/amd/pm/inc -I /usr/src/sys/dev/pci/drm/amd/pm/legacy-dpm -I /usr/src/sys/dev/pci/drm/amd/pm/swsmu -I /usr/src/sys/dev/pci/drm/amd/pm/swsmu/inc -I /usr/src/sys/dev/pci/drm/amd/pm/swsmu/smu11 -I /usr/src/sys/dev/pci/drm/amd/pm/swsmu/smu12 -I /usr/src/sys/dev/pci/drm/amd/pm/swsmu/smu13 -I /usr/src/sys/dev/pci/drm/amd/pm/powerplay/inc -I /usr/src/sys/dev/pci/drm/amd/pm/powerplay/hwmgr -I /usr/src/sys/dev/pci/drm/amd/pm/powerplay/smumgr -I /usr/src/sys/dev/pci/drm/amd/pm/swsmu/inc -I /usr/src/sys/dev/pci/drm/amd/pm/swsmu/inc/pmfw_if -I /usr/src/sys/dev/pci/drm/amd/display/dc/inc -I /usr/src/sys/dev/pci/drm/amd/display/dc/inc/hw -I /usr/src/sys/dev/pci/drm/amd/display/dc/clk_mgr -I /usr/src/sys/dev/pci/drm/amd/display/modules/inc -I /usr/src/sys/dev/pci/drm/amd/display/modules/hdcp -I /usr/src/sys/dev/pci/drm/amd/display/dmub/inc -I /usr/src/sys/dev/pci/drm/i915 -D DDB -D DIAGNOSTIC -D KTRACE -D ACCOUNTING -D KMEMSTATS -D PTRACE -D POOL_DEBUG -D CRYPTO -D SYSVMSG -D SYSVSEM -D SYSVSHM -D UVM_SWAP_ENCRYPT -D FFS -D FFS2 -D FFS_SOFTUPDATES -D UFS_DIRHASH -D QUOTA -D EXT2FS -D MFS -D NFSCLIENT -D NFSSERVER -D CD9660 -D UDF -D MSDOSFS -D FIFO -D FUSE -D SOCKET_SPLICE -D TCP_ECN -D TCP_SIGNATURE -D INET6 -D IPSEC -D PPP_BSDCOMP -D PPP_DEFLATE -D PIPEX -D MROUTING -D MPLS -D BOOT_CONFIG -D USER_PCICONF -D APERTURE -D MTRR -D NTFS -D SUSPEND -D HIBERNATE -D PCIVERBOSE -D USBVERBOSE -D WSDISPLAY_COMPAT_USL -D WSDISPLAY_COMPAT_RAWKBD -D WSDISPLAY_DEFAULTSCREENS=6 -D X86EMU -D ONEWIREVERBOSE -D MULTIPROCESSOR -D MAXUSERS=80 -D _KERNEL -O2 -Wno-pointer-sign -Wno-address-of-packed-member -Wno-constant-conversion -Wno-unused-but-set-variable -Wno-gnu-folding-constant 
-fdebug-compilation-dir=/usr/src/sys/arch/amd64/compile/GENERIC.MP/obj -ferror-limit 19 -fwrapv -D_RET_PROTECTOR -ret-protector -fcf-protection=branch -fgnuc-version=4.2.1 -vectorize-loops -vectorize-slp -fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc -fno-builtin-valloc -fno-builtin-free -fno-builtin-strdup -fno-builtin-strndup -analyzer-output=html -faddrsig -o /home/ben/Projects/scan/2024-01-11-110808-61670-1 -x c /usr/src/sys/kern/vfs_bio.c
1/* $OpenBSD: vfs_bio.c,v 1.212 2023/04/26 15:13:52 beck Exp $ */
2/* $NetBSD: vfs_bio.c,v 1.44 1996/06/11 11:15:36 pk Exp $ */
3
4/*
5 * Copyright (c) 1994 Christopher G. Demetriou
6 * Copyright (c) 1982, 1986, 1989, 1993
7 * The Regents of the University of California. All rights reserved.
8 * (c) UNIX System Laboratories, Inc.
9 * All or some portions of this file are derived from material licensed
10 * to the University of California by American Telephone and Telegraph
11 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
12 * the permission of UNIX System Laboratories, Inc.
13 *
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
16 * are met:
17 * 1. Redistributions of source code must retain the above copyright
18 * notice, this list of conditions and the following disclaimer.
19 * 2. Redistributions in binary form must reproduce the above copyright
20 * notice, this list of conditions and the following disclaimer in the
21 * documentation and/or other materials provided with the distribution.
22 * 3. Neither the name of the University nor the names of its contributors
23 * may be used to endorse or promote products derived from this software
24 * without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
37 *
38 * @(#)vfs_bio.c 8.6 (Berkeley) 1/11/94
39 */
40
41/*
42 * Some references:
43 * Bach: The Design of the UNIX Operating System (Prentice Hall, 1986)
44 * Leffler, et al.: The Design and Implementation of the 4.3BSD
45 * UNIX Operating System (Addison Welley, 1989)
46 */
47
48#include <sys/param.h>
49#include <sys/systm.h>
50#include <sys/proc.h>
51#include <sys/buf.h>
52#include <sys/vnode.h>
53#include <sys/mount.h>
54#include <sys/malloc.h>
55#include <sys/pool.h>
56#include <sys/specdev.h>
57#include <sys/tracepoint.h>
58#include <uvm/uvm_extern.h>
59
60/* XXX Should really be in buf.h, but for uvm_constraint_range.. */
61int buf_realloc_pages(struct buf *, struct uvm_constraint_range *, int);
62
63struct uvm_constraint_range high_constraint;
64int fliphigh;
65
66int nobuffers;
67int needbuffer;
68struct bio_ops bioops;
69
70/* private bufcache functions */
71void bufcache_init(void);
72void bufcache_adjust(void);
73struct buf *bufcache_gethighcleanbuf(void);
74struct buf *bufcache_getdmacleanbuf(void);
75
76/*
77 * Buffer pool for I/O buffers.
78 */
79struct pool bufpool;
80struct bufhead bufhead = LIST_HEAD_INITIALIZER(bufhead){ ((void *)0) };
81void buf_put(struct buf *);
82
83struct buf *bio_doread(struct vnode *, daddr_t, int, int);
84struct buf *buf_get(struct vnode *, daddr_t, size_t);
85void bread_cluster_callback(struct buf *);
86int64_t bufcache_recover_dmapages(int discard, int64_t howmany);
87static struct buf *incore_locked(struct vnode *vp, daddr_t blkno);
88
89struct bcachestats bcstats; /* counters */
90long lodirtypages; /* dirty page count low water mark */
91long hidirtypages; /* dirty page count high water mark */
92long targetpages; /* target number of pages for cache size */
93long buflowpages; /* smallest size cache allowed */
94long bufhighpages; /* largest size cache allowed */
95long bufbackpages; /* minimum number of pages we shrink when asked to */
96
97vsize_t bufkvm;
98
99struct proc *cleanerproc;
100int bd_req; /* Sleep point for cleaner daemon. */
101
102#define NUM_CACHES2 2
103#define DMA_CACHE0 0
104struct bufcache cleancache[NUM_CACHES2];
105struct bufqueue dirtyqueue;
106
107void
108buf_put(struct buf *bp)
109{
110 splassert(IPL_BIO)do { if (splassert_ctl > 0) { splassert_check(0x3, __func__
); } } while (0)
;
111
112#ifdef DIAGNOSTIC1
113 if (bp->b_pobj != NULL((void *)0))
114 KASSERT(bp->b_bufsize > 0)((bp->b_bufsize > 0) ? (void)0 : __assert("diagnostic "
, "/usr/src/sys/kern/vfs_bio.c", 114, "bp->b_bufsize > 0"
))
;
115 if (ISSET(bp->b_flags, B_DELWRI)((bp->b_flags) & (0x00000080)))
116 panic("buf_put: releasing dirty buffer");
117 if (bp->b_freelist.tqe_next != NOLIST((struct buf *)0x87654321) &&
118 bp->b_freelist.tqe_next != (void *)-1)
119 panic("buf_put: still on the free list");
120 if (bp->b_vnbufs.le_next != NOLIST((struct buf *)0x87654321) &&
121 bp->b_vnbufs.le_next != (void *)-1)
122 panic("buf_put: still on the vnode list");
123 if (!LIST_EMPTY(&bp->b_dep)(((&bp->b_dep)->lh_first) == ((void *)0)))
124 panic("buf_put: b_dep is not empty");
125#endif
126
127 LIST_REMOVE(bp, b_list)do { if ((bp)->b_list.le_next != ((void *)0)) (bp)->b_list
.le_next->b_list.le_prev = (bp)->b_list.le_prev; *(bp)->
b_list.le_prev = (bp)->b_list.le_next; ((bp)->b_list.le_prev
) = ((void *)-1); ((bp)->b_list.le_next) = ((void *)-1); }
while (0)
;
128 bcstats.numbufs--;
129
130 if (buf_dealloc_mem(bp) != 0)
131 return;
132 pool_put(&bufpool, bp);
133}
134
135/*
136 * Initialize buffers and hash links for buffers.
137 */
138void
139bufinit(void)
140{
141 u_int64_t dmapages;
142 u_int64_t highpages;
143
144 dmapages = uvm_pagecount(&dma_constraint);
145 /* take away a guess at how much of this the kernel will consume */
146 dmapages -= (atop(physmem)((physmem) >> 12) - atop(uvmexp.free)((uvmexp.free) >> 12));
147
148 /* See if we have memory above the dma accessible region. */
149 high_constraint.ucr_low = dma_constraint.ucr_high;
150 high_constraint.ucr_high = no_constraint.ucr_high;
151 if (high_constraint.ucr_low != high_constraint.ucr_high)
152 high_constraint.ucr_low++;
153 highpages = uvm_pagecount(&high_constraint);
154
155 /*
156 * Do we have any significant amount of high memory above
157 * the DMA region? if so enable moving buffers there, if not,
158 * don't bother.
159 */
160 if (highpages > dmapages / 4)
161 fliphigh = 1;
162 else
163 fliphigh = 0;
164
165 /*
166 * If MD code doesn't say otherwise, use up to 10% of DMA'able
167 * memory for buffers.
168 */
169 if (bufcachepercent == 0)
170 bufcachepercent = 10;
171
172 /*
173 * XXX these values and their same use in kern_sysctl
174 * need to move into buf.h
175 */
176 KASSERT(bufcachepercent <= 90)((bufcachepercent <= 90) ? (void)0 : __assert("diagnostic "
, "/usr/src/sys/kern/vfs_bio.c", 176, "bufcachepercent <= 90"
))
;
177 KASSERT(bufcachepercent >= 5)((bufcachepercent >= 5) ? (void)0 : __assert("diagnostic "
, "/usr/src/sys/kern/vfs_bio.c", 177, "bufcachepercent >= 5"
))
;
178 if (bufpages == 0)
179 bufpages = dmapages * bufcachepercent / 100;
180 if (bufpages < BCACHE_MIN((4 * (64 * 1024) / (1 << 12)) * 2))
181 bufpages = BCACHE_MIN((4 * (64 * 1024) / (1 << 12)) * 2);
182 KASSERT(bufpages < dmapages)((bufpages < dmapages) ? (void)0 : __assert("diagnostic ",
"/usr/src/sys/kern/vfs_bio.c", 182, "bufpages < dmapages"
))
;
183
184 bufhighpages = bufpages;
185
186 /*
187 * Set the base backoff level for the buffer cache. We will
188 * not allow uvm to steal back more than this number of pages.
189 */
190 buflowpages = dmapages * 5 / 100;
191 if (buflowpages < BCACHE_MIN((4 * (64 * 1024) / (1 << 12)) * 2))
192 buflowpages = BCACHE_MIN((4 * (64 * 1024) / (1 << 12)) * 2);
193
194 /*
195 * set bufbackpages to 100 pages, or 10 percent of the low water mark
196 * if we don't have that many pages.
197 */
198
199 bufbackpages = buflowpages * 10 / 100;
200 if (bufbackpages > 100)
201 bufbackpages = 100;
202
203 /*
204 * If the MD code does not say otherwise, reserve 10% of kva
205 * space for mapping buffers.
206 */
207 if (bufkvm == 0)
208 bufkvm = VM_KERNEL_SPACE_SIZE(0xffff800100000000 - 0xffff800000000000) / 10;
209
210 /*
211 * Don't use more than twice the amount of bufpages for mappings.
212 * It's twice since we map things sparsely.
213 */
214 if (bufkvm > bufpages * PAGE_SIZE(1 << 12))
215 bufkvm = bufpages * PAGE_SIZE(1 << 12);
216 /*
217 * Round bufkvm to MAXPHYS because we allocate chunks of va space
218 * in MAXPHYS chunks.
219 */
220 bufkvm &= ~(MAXPHYS(64 * 1024) - 1);
221
222 pool_init(&bufpool, sizeof(struct buf), 0, IPL_BIO0x3, 0, "bufpl", NULL((void *)0));
223
224 bufcache_init();
225
226 /*
227 * hmm - bufkvm is an argument because it's static, while
228 * bufpages is global because it can change while running.
229 */
230 buf_mem_init(bufkvm);
231
232 /*
233 * Set the dirty page high water mark to be less than the low
234 * water mark for pages in the buffer cache. This ensures we
235 * can always back off by throwing away clean pages, and give
236 * ourselves a chance to write out the dirty pages eventually.
237 */
238 hidirtypages = (buflowpages / 4) * 3;
239 lodirtypages = buflowpages / 2;
240
241 /*
242 * We are allowed to use up to the reserve.
243 */
244 targetpages = bufpages - RESERVE_PAGES(4 * (64 * 1024) / (1 << 12));
245}
246
247/*
248 * Change cachepct
249 */
250void
251bufadjust(int newbufpages)
252{
253 int s;
254 int64_t npages;
255
256 if (newbufpages < buflowpages)
257 newbufpages = buflowpages;
258
259 s = splbio()splraise(0x3);
260 bufpages = newbufpages;
261
262 /*
263 * We are allowed to use up to the reserve
264 */
265 targetpages = bufpages - RESERVE_PAGES(4 * (64 * 1024) / (1 << 12));
266
267 npages = bcstats.dmapages - targetpages;
268
269 /*
270 * Shrinking the cache happens here only if someone has manually
271 * adjusted bufcachepercent - or the pagedaemon has told us
272 * to give back memory *now* - so we give it all back.
273 */
274 if (bcstats.dmapages > targetpages)
275 (void) bufcache_recover_dmapages(0, bcstats.dmapages - targetpages);
276 bufcache_adjust();
277
278 /*
279 * Wake up the cleaner if we have lots of dirty pages,
280 * or if we are getting low on buffer cache kva.
281 */
282 if ((UNCLEAN_PAGES(bcstats.numbufpages - bcstats.numcleanpages) >= hidirtypages) ||
283 bcstats.kvaslots_avail <= 2 * RESERVE_SLOTS4)
284 wakeup(&bd_req);
285
286 splx(s)spllower(s);
287}
288
289/*
290 * Make the buffer cache back off from cachepct.
291 */
292int
293bufbackoff(struct uvm_constraint_range *range, long size)
294{
295 /*
296 * Back off "size" buffer cache pages. Called by the page
297 * daemon to consume buffer cache pages rather than scanning.
298 *
299 * It returns 0 to the pagedaemon to indicate that it has
300 * succeeded in freeing enough pages. It returns -1 to
301 * indicate that it could not and the pagedaemon should take
302 * other measures.
303 *
304 */
305 long pdelta, oldbufpages;
306
307 /*
308 * If we will accept high memory for this backoff
309 * try to steal it from the high memory buffer cache.
310 */
311 if (range != NULL((void *)0) && range->ucr_high > dma_constraint.ucr_high) {
312 struct buf *bp;
313 int64_t start = bcstats.numbufpages, recovered = 0;
314 int s = splbio()splraise(0x3);
315
316 while ((recovered < size) &&
317 (bp = bufcache_gethighcleanbuf())) {
318 bufcache_take(bp);
319 if (bp->b_vp) {
320 RBT_REMOVE(buf_rb_bufs,buf_rb_bufs_RBT_REMOVE(&bp->b_vp->v_bufs_tree, bp)
321 &bp->b_vp->v_bufs_tree, bp)buf_rb_bufs_RBT_REMOVE(&bp->b_vp->v_bufs_tree, bp);
322 brelvp(bp);
323 }
324 buf_put(bp);
325 recovered = start - bcstats.numbufpages;
326 }
327 bufcache_adjust();
328 splx(s)spllower(s);
329
330 /* If we got enough, return success */
331 if (recovered >= size)
332 return 0;
333
334 /*
335 * If we needed only memory above DMA,
336 * return failure
337 */
338 if (range->ucr_low > dma_constraint.ucr_high)
339 return -1;
340
341 /* Otherwise get the rest from DMA */
342 size -= recovered;
343 }
344
345 /*
346 * XXX Otherwise do the dma memory cache dance. this needs
347 * refactoring later to get rid of 'bufpages'
348 */
349
350 /*
351 * Back off by at least bufbackpages. If the page daemon gave us
352 * a larger size, back off by that much.
353 */
354 pdelta = (size > bufbackpages) ? size : bufbackpages;
355
356 if (bufpages <= buflowpages)
357 return(-1);
358 if (bufpages - pdelta < buflowpages)
359 pdelta = bufpages - buflowpages;
360 oldbufpages = bufpages;
361 bufadjust(bufpages - pdelta);
362 if (oldbufpages - bufpages < size)
363 return (-1); /* we did not free what we were asked */
364 else
365 return(0);
366}
367
368
369/*
370 * Opportunistically flip a buffer into high memory. Will move the buffer
371 * if memory is available without sleeping, and return 0, otherwise will
372 * fail and return -1 with the buffer unchanged.
373 */
374
375int
376buf_flip_high(struct buf *bp)
377{
378 int ret = -1;
379
380 KASSERT(ISSET(bp->b_flags, B_BC))((((bp->b_flags) & (0x02000000))) ? (void)0 : __assert
("diagnostic ", "/usr/src/sys/kern/vfs_bio.c", 380, "ISSET(bp->b_flags, B_BC)"
))
;
381 KASSERT(ISSET(bp->b_flags, B_DMA))((((bp->b_flags) & (0x04000000))) ? (void)0 : __assert
("diagnostic ", "/usr/src/sys/kern/vfs_bio.c", 381, "ISSET(bp->b_flags, B_DMA)"
))
;
382 KASSERT(bp->cache == DMA_CACHE)((bp->cache == 0) ? (void)0 : __assert("diagnostic ", "/usr/src/sys/kern/vfs_bio.c"
, 382, "bp->cache == DMA_CACHE"))
;
383 KASSERT(fliphigh)((fliphigh) ? (void)0 : __assert("diagnostic ", "/usr/src/sys/kern/vfs_bio.c"
, 383, "fliphigh"))
;
384
385 splassert(IPL_BIO)do { if (splassert_ctl > 0) { splassert_check(0x3, __func__
); } } while (0)
;
386
387 /* Attempt to move the buffer to high memory if we can */
388 if (buf_realloc_pages(bp, &high_constraint, UVM_PLA_NOWAIT0x0002) == 0) {
389 KASSERT(!ISSET(bp->b_flags, B_DMA))((!((bp->b_flags) & (0x04000000))) ? (void)0 : __assert
("diagnostic ", "/usr/src/sys/kern/vfs_bio.c", 389, "!ISSET(bp->b_flags, B_DMA)"
))
;
390 bcstats.highflips++;
391 ret = 0;
392 } else
393 bcstats.highflops++;
394
395 return ret;
396}
397
398/*
399 * Flip a buffer to dma reachable memory, when we need it there for
400 * I/O. This can sleep since it will wait for memory allocation in the
401 * DMA reachable area since we have to have the buffer there to proceed.
402 */
403void
404buf_flip_dma(struct buf *bp)
405{
406 KASSERT(ISSET(bp->b_flags, B_BC))((((bp->b_flags) & (0x02000000))) ? (void)0 : __assert
("diagnostic ", "/usr/src/sys/kern/vfs_bio.c", 406, "ISSET(bp->b_flags, B_BC)"
))
;
407 KASSERT(ISSET(bp->b_flags, B_BUSY))((((bp->b_flags) & (0x00000010))) ? (void)0 : __assert
("diagnostic ", "/usr/src/sys/kern/vfs_bio.c", 407, "ISSET(bp->b_flags, B_BUSY)"
))
;
408 KASSERT(bp->cache < NUM_CACHES)((bp->cache < 2) ? (void)0 : __assert("diagnostic ", "/usr/src/sys/kern/vfs_bio.c"
, 408, "bp->cache < NUM_CACHES"))
;
409
410 splassert(IPL_BIO)do { if (splassert_ctl > 0) { splassert_check(0x3, __func__
); } } while (0)
;
411
412 if (!ISSET(bp->b_flags, B_DMA)((bp->b_flags) & (0x04000000))) {
413
414 /* move buf to dma reachable memory */
415 (void) buf_realloc_pages(bp, &dma_constraint, UVM_PLA_WAITOK0x0001);
416 KASSERT(ISSET(bp->b_flags, B_DMA))((((bp->b_flags) & (0x04000000))) ? (void)0 : __assert
("diagnostic ", "/usr/src/sys/kern/vfs_bio.c", 416, "ISSET(bp->b_flags, B_DMA)"
))
;
417 bcstats.dmaflips++;
418 }
419
420 if (bp->cache > DMA_CACHE0) {
421 CLR(bp->b_flags, B_COLD)((bp->b_flags) &= ~(0x01000000));
422 CLR(bp->b_flags, B_WARM)((bp->b_flags) &= ~(0x00800000));
423 bp->cache = DMA_CACHE0;
424 }
425}
426
427struct buf *
428bio_doread(struct vnode *vp, daddr_t blkno, int size, int async)
429{
430 struct buf *bp;
431 struct mount *mp;
432
433 bp = getblk(vp, blkno, size, 0, INFSLP0xffffffffffffffffULL);
434
435 /*
436 * If buffer does not have valid data, start a read.
437 * Note that if buffer is B_INVAL, getblk() won't return it.
438 * Therefore, it's valid if its I/O has completed or been delayed.
439 */
440 if (!ISSET(bp->b_flags, (B_DONE | B_DELWRI))((bp->b_flags) & ((0x00000100 | 0x00000080)))) {
441 SET(bp->b_flags, B_READ | async)((bp->b_flags) |= (0x00008000 | async));
442 bcstats.pendingreads++;
443 bcstats.numreads++;
444 VOP_STRATEGY(bp->b_vp, bp);
445 /* Pay for the read. */
446 curproc({struct cpu_info *__ci; asm volatile("movq %%gs:%P1,%0" : "=r"
(__ci) :"n" (__builtin_offsetof(struct cpu_info, ci_self)));
__ci;})->ci_curproc
->p_ru.ru_inblock++; /* XXX */
447 } else if (async) {
448 brelse(bp);
449 }
450
451 mp = vp->v_type == VBLK ? vp->v_specmountpointv_un.vu_specinfo->si_mountpoint : vp->v_mount;
452
453 /*
454 * Collect statistics on synchronous and asynchronous reads.
455 * Reads from block devices are charged to their associated
456 * filesystem (if any).
457 */
458 if (mp != NULL((void *)0)) {
459 if (async == 0)
460 mp->mnt_stat.f_syncreads++;
461 else
462 mp->mnt_stat.f_asyncreads++;
463 }
464
465 return (bp);
466}
467
468/*
469 * Read a disk block.
470 * This algorithm described in Bach (p.54).
471 */
472int
473bread(struct vnode *vp, daddr_t blkno, int size, struct buf **bpp)
474{
475 struct buf *bp;
476
477 /* Get buffer for block. */
478 bp = *bpp = bio_doread(vp, blkno, size, 0);
479
480 /* Wait for the read to complete, and return result. */
481 return (biowait(bp));
482}
483
484/*
485 * Read-ahead multiple disk blocks. The first is sync, the rest async.
486 * Trivial modification to the breada algorithm presented in Bach (p.55).
487 */
488int
489breadn(struct vnode *vp, daddr_t blkno, int size, daddr_t rablks[],
490 int rasizes[], int nrablks, struct buf **bpp)
491{
492 struct buf *bp;
493 int i;
494
495 bp = *bpp = bio_doread(vp, blkno, size, 0);
496
497 /*
498 * For each of the read-ahead blocks, start a read, if necessary.
499 */
500 for (i = 0; i < nrablks; i++) {
501 /* If it's in the cache, just go on to next one. */
502 if (incore(vp, rablks[i]))
503 continue;
504
505 /* Get a buffer for the read-ahead block */
506 (void) bio_doread(vp, rablks[i], rasizes[i], B_ASYNC0x00000004);
507 }
508
509 /* Otherwise, we had to start a read for it; wait until it's valid. */
510 return (biowait(bp));
511}
512
513/*
514 * Called from interrupt context.
515 */
516void
517bread_cluster_callback(struct buf *bp)
518{
519 struct buf **xbpp = bp->b_saveaddr;
520 int i;
521
522 if (xbpp[1] != NULL((void *)0)) {
523 size_t newsize = xbpp[1]->b_bufsize;
524
525 /*
526 * Shrink this buffer's mapping to only cover its part of
527 * the total I/O.
528 */
529 buf_fix_mapping(bp, newsize);
530 bp->b_bcount = newsize;
531 }
532
533 /* Invalidate read-ahead buffers if read short */
534 if (bp->b_resid > 0) {
535 for (i = 1; xbpp[i] != NULL((void *)0); i++)
536 continue;
537 for (i = i - 1; i != 0; i--) {
538 if (xbpp[i]->b_bufsize <= bp->b_resid) {
539 bp->b_resid -= xbpp[i]->b_bufsize;
540 SET(xbpp[i]->b_flags, B_INVAL)((xbpp[i]->b_flags) |= (0x00000800));
541 } else if (bp->b_resid > 0) {
542 bp->b_resid = 0;
543 SET(xbpp[i]->b_flags, B_INVAL)((xbpp[i]->b_flags) |= (0x00000800));
544 } else
545 break;
546 }
547 }
548
549 for (i = 1; xbpp[i] != NULL((void *)0); i++) {
550 if (ISSET(bp->b_flags, B_ERROR)((bp->b_flags) & (0x00000400)))
551 SET(xbpp[i]->b_flags, B_INVAL | B_ERROR)((xbpp[i]->b_flags) |= (0x00000800 | 0x00000400));
552 /*
553 * Move the pages from the master buffer's uvm object
554 * into the individual buffer's uvm objects.
555 */
556 struct uvm_object *newobj = &xbpp[i]->b_uobj;
557 struct uvm_object *oldobj = &bp->b_uobj;
558 int page;
559
560 uvm_obj_init(newobj, &bufcache_pager, 1);
561 for (page = 0; page < atop(xbpp[i]->b_bufsize)((xbpp[i]->b_bufsize) >> 12); page++) {
562 struct vm_page *pg = uvm_pagelookup(oldobj,
563 xbpp[i]->b_poffs + ptoa(page)((paddr_t)(page) << 12));
564 KASSERT(pg != NULL)((pg != ((void *)0)) ? (void)0 : __assert("diagnostic ", "/usr/src/sys/kern/vfs_bio.c"
, 564, "pg != NULL"))
;
565 KASSERT(pg->wire_count == 1)((pg->wire_count == 1) ? (void)0 : __assert("diagnostic ",
"/usr/src/sys/kern/vfs_bio.c", 565, "pg->wire_count == 1"
))
;
566 uvm_pagerealloc(pg, newobj, xbpp[i]->b_poffs + ptoa(page)((paddr_t)(page) << 12));
567 }
568 xbpp[i]->b_pobj = newobj;
569
570 biodone(xbpp[i]);
571 }
572
573 free(xbpp, M_TEMP127, (i + 1) * sizeof(*xbpp));
574
575 if (ISSET(bp->b_flags, B_ASYNC)((bp->b_flags) & (0x00000004))) {
576 brelse(bp);
577 } else {
578 CLR(bp->b_flags, B_WANTED)((bp->b_flags) &= ~(0x00010000));
579 wakeup(bp);
580 }
581}
582
583/*
584 * Read-ahead multiple disk blocks, but make sure only one (big) I/O
585 * request is sent to the disk.
586 * XXX This should probably be dropped and breadn should instead be optimized
587 * XXX to do fewer I/O requests.
588 */
589int
590bread_cluster(struct vnode *vp, daddr_t blkno, int size, struct buf **rbpp)
591{
592 struct buf *bp, **xbpp;
593 int howmany, maxra, i, inc;
594 daddr_t sblkno;
595
596 *rbpp = bio_doread(vp, blkno, size, 0);
597
598 /*
599 * If the buffer is in the cache skip any I/O operation.
600 */
601 if (ISSET((*rbpp)->b_flags, B_CACHE)(((*rbpp)->b_flags) & (0x00000020)))
602 goto out;
603
604 if (size != round_page(size)(((size) + ((1 << 12) - 1)) & ~((1 << 12) - 1
))
)
605 goto out;
606
607 if (VOP_BMAP(vp, blkno + 1, NULL((void *)0), &sblkno, &maxra))
608 goto out;
609
610 maxra++;
611 if (sblkno == -1 || maxra < 2)
612 goto out;
613
614 howmany = MAXPHYS(64 * 1024) / size;
615 if (howmany > maxra)
616 howmany = maxra;
617
618 xbpp = mallocarray(howmany + 1, sizeof(*xbpp), M_TEMP127, M_NOWAIT0x0002);
619 if (xbpp == NULL((void *)0))
620 goto out;
621
622 for (i = howmany - 1; i >= 0; i--) {
623 size_t sz;
624
625 /*
626 * First buffer allocates big enough size to cover what
627 * all the other buffers need.
628 */
629 sz = i == 0 ? howmany * size : 0;
630
631 xbpp[i] = buf_get(vp, blkno + i + 1, sz);
632 if (xbpp[i] == NULL((void *)0)) {
633 for (++i; i < howmany; i++) {
634 SET(xbpp[i]->b_flags, B_INVAL)((xbpp[i]->b_flags) |= (0x00000800));
635 brelse(xbpp[i]);
636 }
637 free(xbpp, M_TEMP127, (howmany + 1) * sizeof(*xbpp));
638 goto out;
639 }
640 }
641
642 bp = xbpp[0];
643
644 xbpp[howmany] = NULL((void *)0);
645
646 inc = btodb(size)((size) >> 9);
647
648 for (i = 1; i < howmany; i++) {
649 bcstats.pendingreads++;
650 bcstats.numreads++;
651 /*
652 * We set B_DMA here because bp above will be B_DMA,
653 * and we are playing buffer slice-n-dice games from
654 * the memory allocated in bp.
655 */
656 SET(xbpp[i]->b_flags, B_DMA | B_READ | B_ASYNC)((xbpp[i]->b_flags) |= (0x04000000 | 0x00008000 | 0x00000004
))
;
657 xbpp[i]->b_blkno = sblkno + (i * inc);
658 xbpp[i]->b_bufsize = xbpp[i]->b_bcount = size;
659 xbpp[i]->b_data = NULL((void *)0);
660 xbpp[i]->b_pobj = bp->b_pobj;
661 xbpp[i]->b_poffs = bp->b_poffs + (i * size);
662 }
663
664 KASSERT(bp->b_lblkno == blkno + 1)((bp->b_lblkno == blkno + 1) ? (void)0 : __assert("diagnostic "
, "/usr/src/sys/kern/vfs_bio.c", 664, "bp->b_lblkno == blkno + 1"
))
;
665 KASSERT(bp->b_vp == vp)((bp->b_vp == vp) ? (void)0 : __assert("diagnostic ", "/usr/src/sys/kern/vfs_bio.c"
, 665, "bp->b_vp == vp"))
;
666
667 bp->b_blkno = sblkno;
668 SET(bp->b_flags, B_READ | B_ASYNC | B_CALL)((bp->b_flags) |= (0x00008000 | 0x00000004 | 0x00000040));
669
670 bp->b_saveaddr = (void *)xbpp;
671 bp->b_iodone = bread_cluster_callback;
672
673 bcstats.pendingreads++;
674 bcstats.numreads++;
675 VOP_STRATEGY(bp->b_vp, bp);
676 curproc({struct cpu_info *__ci; asm volatile("movq %%gs:%P1,%0" : "=r"
(__ci) :"n" (__builtin_offsetof(struct cpu_info, ci_self)));
__ci;})->ci_curproc
->p_ru.ru_inblock++;
677
678out:
679 return (biowait(*rbpp));
680}
681
682/*
683 * Block write. Described in Bach (p.56)
684 */
685int
686bwrite(struct buf *bp)
687{
688 int rv, async, wasdelayed, s;
689 struct vnode *vp;
690 struct mount *mp;
691
692 vp = bp->b_vp;
693 if (vp != NULL)

    [1] Assuming 'vp' is equal to NULL
    [2] Taking false branch

694 mp = vp->v_type == VBLK? vp->v_specmountpoint : vp->v_mount;
695 else
696 mp = NULL;
697
698 /*
699 * Remember buffer type, to switch on it later. If the write was
700 * synchronous, but the file system was mounted with MNT_ASYNC,
701 * convert it to a delayed write.
702 * XXX note that this relies on delayed tape writes being converted
703 * to async, not sync writes (which is safe, but ugly).
704 */
705 async = ISSET(bp->b_flags, B_ASYNC);
706 if (!async && mp && ISSET(mp->mnt_flag, MNT_ASYNC)) {

    [3] Assuming 'async' is not equal to 0

707 /*
708 * Don't convert writes from VND on async filesystems
709 * that already have delayed writes in the upper layer.
710 */
711 if (!ISSET(bp->b_flags, B_NOCACHE)) {
712 bdwrite(bp);
713 return (0);
714 }
715 }
716
717 /*
718 * Collect statistics on synchronous and asynchronous writes.
719 * Writes to block devices are charged to their associated
720 * filesystem (if any).
721 */
722 if (mp != NULL) {

    [3.1] 'mp' is equal to NULL
    [4] Taking false branch

723 if (async)
724 mp->mnt_stat.f_asyncwrites++;
725 else
726 mp->mnt_stat.f_syncwrites++;
727 }
728 bcstats.pendingwrites++;
729 bcstats.numwrites++;
730
731 wasdelayed = ISSET(bp->b_flags, B_DELWRI);
732 CLR(bp->b_flags, (B_READ | B_DONE | B_ERROR | B_DELWRI));
733
734 s = splbio();
735
736 /*
737 * If not synchronous, pay for the I/O operation and make
738 * sure the buf is on the correct vnode queue. We have
739 * to do this now, because if we don't, the vnode may not
740 * be properly notified that its I/O has completed.
741 */
742 if (wasdelayed) {

    [5] Assuming 'wasdelayed' is 0
    [6] Taking false branch

743 reassignbuf(bp);
744 } else
745 curproc->p_ru.ru_oublock++;
746
747
748 /* Initiate disk write. Make sure the appropriate party is charged. */
749 bp->b_vp->v_numoutput++;

    [7] Access to field 'v_numoutput' results in a dereference of a null pointer (loaded from field 'b_vp')

750 buf_flip_dma(bp);
751 SET(bp->b_flags, B_WRITEINPROG);
752 splx(s);
753 VOP_STRATEGY(bp->b_vp, bp);
754
755 /*
756 * If the queue is above the high water mark, wait till
757 * the number of outstanding write bufs drops below the low
758 * water mark.
759 */
760 if (bp->b_bq)
761 bufq_wait(bp->b_bq);
762
763 if (async)
764 return (0);
765
766 /*
767 * If I/O was synchronous, wait for it to complete.
768 */
769 rv = biowait(bp);
770
771 /* Release the buffer. */
772 brelse(bp);
773
774 return (rv);
775}
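
Editorial aside (not part of the listing): the dereference at line 749 is
reachable in the analyzer's model only because nothing between line 693 and
line 749 rules out bp->b_vp == NULL; in practice bwrite() is handed buffers
with an attached vnode. The sketch below shows one generic way to make such an
invariant explicit so a path-sensitive checker prunes the NULL branch. It is
illustrative only, uses hypothetical names, substitutes a plain assert() for
the kernel's KASSERT(), and is not the change adopted in vfs_bio.c.

/*
 * Editorial sketch: assert the invariant before the dereference.  The
 * assertion's failure path does not return, so the vp == NULL state is
 * dropped and the dereference below is no longer reported.
 */
#include <assert.h>
#include <stddef.h>

struct demo_vnode { int v_numoutput; };
struct demo_buf { struct demo_vnode *b_vp; };

int
demo_write_checked(struct demo_buf *bp)
{
	struct demo_vnode *vp = bp->b_vp;

	assert(vp != NULL);	/* callers must attach a vnode first */

	vp->v_numoutput++;	/* dereference goes through the checked local */
	return (0);
}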
776
777
778/*
779 * Delayed write.
780 *
781 * The buffer is marked dirty, but is not queued for I/O.
782 * This routine should be used when the buffer is expected
783 * to be modified again soon, typically a small write that
784 * partially fills a buffer.
785 *
786 * NB: magnetic tapes cannot be delayed; they must be
787 * written in the order that the writes are requested.
788 *
789 * Described in Leffler, et al. (pp. 208-213).
790 */
791void
792bdwrite(struct buf *bp)
793{
794 int s;
795
796 /*
797 * If the block hasn't been seen before:
798 * (1) Mark it as having been seen,
799 * (2) Charge for the write.
800 * (3) Make sure it's on its vnode's correct block list,
801 * (4) If a buffer is rewritten, move it to end of dirty list
802 */
803 if (!ISSET(bp->b_flags, B_DELWRI)((bp->b_flags) & (0x00000080))) {
804 SET(bp->b_flags, B_DELWRI)((bp->b_flags) |= (0x00000080));
805 s = splbio()splraise(0x3);
806 buf_flip_dma(bp);
807 reassignbuf(bp);
808 splx(s)spllower(s);
809 curproc({struct cpu_info *__ci; asm volatile("movq %%gs:%P1,%0" : "=r"
(__ci) :"n" (__builtin_offsetof(struct cpu_info, ci_self)));
__ci;})->ci_curproc
->p_ru.ru_oublock++; /* XXX */
810 }
811
812 /* The "write" is done, so mark and release the buffer. */
813 CLR(bp->b_flags, B_NEEDCOMMIT)((bp->b_flags) &= ~(0x00000002));
814 CLR(bp->b_flags, B_NOCACHE)((bp->b_flags) &= ~(0x00001000)); /* Must cache delayed writes */
815 SET(bp->b_flags, B_DONE)((bp->b_flags) |= (0x00000100));
816 brelse(bp);
817}
818
819/*
820 * Asynchronous block write; just an asynchronous bwrite().
821 */
822void
823bawrite(struct buf *bp)
824{
825
826 SET(bp->b_flags, B_ASYNC)((bp->b_flags) |= (0x00000004));
827 VOP_BWRITE(bp);
828}
829
830/*
831 * Must be called at splbio()
832 */
833void
834buf_dirty(struct buf *bp)
835{
836 splassert(IPL_BIO)do { if (splassert_ctl > 0) { splassert_check(0x3, __func__
); } } while (0)
;
837
838#ifdef DIAGNOSTIC1
839 if (!ISSET(bp->b_flags, B_BUSY)((bp->b_flags) & (0x00000010)))
840 panic("Trying to dirty buffer on freelist!");
841#endif
842
843 if (ISSET(bp->b_flags, B_DELWRI)((bp->b_flags) & (0x00000080)) == 0) {
844 SET(bp->b_flags, B_DELWRI)((bp->b_flags) |= (0x00000080));
845 buf_flip_dma(bp);
846 reassignbuf(bp);
847 }
848}
849
850/*
851 * Must be called at splbio()
852 */
853void
854buf_undirty(struct buf *bp)
855{
856 splassert(IPL_BIO)do { if (splassert_ctl > 0) { splassert_check(0x3, __func__
); } } while (0)
;
857
858#ifdef DIAGNOSTIC1
859 if (!ISSET(bp->b_flags, B_BUSY)((bp->b_flags) & (0x00000010)))
860 panic("Trying to undirty buffer on freelist!");
861#endif
862 if (ISSET(bp->b_flags, B_DELWRI)((bp->b_flags) & (0x00000080))) {
863 CLR(bp->b_flags, B_DELWRI)((bp->b_flags) &= ~(0x00000080));
864 reassignbuf(bp);
865 }
866}
867
868/*
869 * Release a buffer on to the free lists.
870 * Described in Bach (p. 46).
871 */
872void
873brelse(struct buf *bp)
874{
875 int s;
876
877 s = splbio()splraise(0x3);
878
879 if (bp->b_data != NULL((void *)0))
880 KASSERT(bp->b_bufsize > 0)((bp->b_bufsize > 0) ? (void)0 : __assert("diagnostic "
, "/usr/src/sys/kern/vfs_bio.c", 880, "bp->b_bufsize > 0"
))
;
881
882 /*
883 * softdep is basically incompatible with not caching buffers
884 * that have dependencies, so this buffer must be cached
885 */
886 if (LIST_FIRST(&bp->b_dep)((&bp->b_dep)->lh_first) != NULL((void *)0))
887 CLR(bp->b_flags, B_NOCACHE)((bp->b_flags) &= ~(0x00001000));
888
889 /*
890 * Determine which queue the buffer should be on, then put it there.
891 */
892
893 /* If it's not cacheable, or an error, mark it invalid. */
894 if (ISSET(bp->b_flags, (B_NOCACHE|B_ERROR))((bp->b_flags) & ((0x00001000|0x00000400))))
895 SET(bp->b_flags, B_INVAL)((bp->b_flags) |= (0x00000800));
896 /* If it's a write error, also mark the vnode as damaged. */
897 if (ISSET(bp->b_flags, B_ERROR)((bp->b_flags) & (0x00000400)) && !ISSET(bp->b_flags, B_READ)((bp->b_flags) & (0x00008000))) {
898 if (bp->b_vp && bp->b_vp->v_type == VREG)
899 SET(bp->b_vp->v_bioflag, VBIOERROR)((bp->b_vp->v_bioflag) |= (0x0008));
900 }
901
902 if (ISSET(bp->b_flags, B_INVAL)((bp->b_flags) & (0x00000800))) {
903 /*
904 * If the buffer is invalid, free it now rather than leaving
905 * it in a queue and wasting memory.
906 */
907 if (LIST_FIRST(&bp->b_dep)((&bp->b_dep)->lh_first) != NULL((void *)0))
908 buf_deallocate(bp);
909
910 if (ISSET(bp->b_flags, B_DELWRI)((bp->b_flags) & (0x00000080))) {
911 CLR(bp->b_flags, B_DELWRI)((bp->b_flags) &= ~(0x00000080));
912 }
913
914 if (bp->b_vp) {
915 RBT_REMOVE(buf_rb_bufs, &bp->b_vp->v_bufs_tree, bp)buf_rb_bufs_RBT_REMOVE(&bp->b_vp->v_bufs_tree, bp);
916 brelvp(bp);
917 }
918 bp->b_vp = NULL((void *)0);
919
920 /*
921 * Wake up any processes waiting for _this_ buffer to
922 * become free. They are not allowed to grab it
923 * since it will be freed. But the only sleeper is
924 * getblk and it will restart the operation after
925 * sleep.
926 */
927 if (ISSET(bp->b_flags, B_WANTED)((bp->b_flags) & (0x00010000))) {
928 CLR(bp->b_flags, B_WANTED)((bp->b_flags) &= ~(0x00010000));
929 wakeup(bp);
930 }
931 buf_put(bp);
932 } else {
933 /*
934 * It has valid data. Put it on the end of the appropriate
935 * queue, so that it'll stick around for as long as possible.
936 */
937 bufcache_release(bp);
938
939 /* Unlock the buffer. */
940 CLR(bp->b_flags, (B_AGE | B_ASYNC | B_NOCACHE | B_DEFERRED))((bp->b_flags) &= ~((0x00000001 | 0x00000004 | 0x00001000
| 0x00080000)))
;
941 buf_release(bp);
942
943 /* Wake up any processes waiting for _this_ buffer to
944 * become free. */
945 if (ISSET(bp->b_flags, B_WANTED)((bp->b_flags) & (0x00010000))) {
946 CLR(bp->b_flags, B_WANTED)((bp->b_flags) &= ~(0x00010000));
947 wakeup(bp);
948 }
949
950 if (bcstats.dmapages > targetpages)
951 (void) bufcache_recover_dmapages(0,
952 bcstats.dmapages - targetpages);
953 bufcache_adjust();
954 }
955
956 /* Wake up syncer and cleaner processes waiting for buffers. */
957 if (nobuffers) {
958 nobuffers = 0;
959 wakeup(&nobuffers);
960 }
961
962 /* Wake up any processes waiting for any buffer to become free. */
963 if (needbuffer && bcstats.dmapages < targetpages &&
964 bcstats.kvaslots_avail > RESERVE_SLOTS4) {
965 needbuffer = 0;
966 wakeup(&needbuffer);
967 }
968
969 splx(s)spllower(s);
970}
971
972/*
973 * Determine if a block is in the cache. Just look on what would be its hash
974 * chain. If it's there, return a pointer to it, unless it's marked invalid.
975 */
976static struct buf *
977incore_locked(struct vnode *vp, daddr_t blkno)
978{
979 struct buf *bp;
980 struct buf b;
981
982 splassert(IPL_BIO)do { if (splassert_ctl > 0) { splassert_check(0x3, __func__
); } } while (0)
;
983
984 /* Search buf lookup tree */
985 b.b_lblkno = blkno;
986 bp = RBT_FIND(buf_rb_bufs, &vp->v_bufs_tree, &b)buf_rb_bufs_RBT_FIND(&vp->v_bufs_tree, &b);
987 if (bp != NULL((void *)0) && ISSET(bp->b_flags, B_INVAL)((bp->b_flags) & (0x00000800)))
988 bp = NULL((void *)0);
989
990 return (bp);
991}
992struct buf *
993incore(struct vnode *vp, daddr_t blkno)
994{
995 struct buf *bp;
996 int s;
997
998 s = splbio()splraise(0x3);
999 bp = incore_locked(vp, blkno);
1000 splx(s)spllower(s);
1001
1002 return (bp);
1003}
1004
1005/*
1006 * Get a block of requested size that is associated with
1007 * a given vnode and block offset. If it is found in the
1008 * block cache, mark it as having been found, make it busy
1009 * and return it. Otherwise, return an empty block of the
1010 * correct size. It is up to the caller to ensure that the
1011 * cached blocks be of the correct size.
1012 */
1013struct buf *
1014getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag,
1015 uint64_t slptimeo)
1016{
1017 struct buf *bp;
1018 struct buf b;
1019 int s, error;
1020
1021 /*
1022 * XXX
1023 * The following is an inlined version of 'incore()', but with
1024 * the 'invalid' test moved to after the 'busy' test. It's
1025 * necessary because there are some cases in which the NFS
1026 * code sets B_INVAL prior to writing data to the server, but
1027 * in which the buffers actually contain valid data. In this
1028 * case, we can't allow the system to allocate a new buffer for
1029 * the block until the write is finished.
1030 */
1031start:
1032 s = splbio()splraise(0x3);
1033 b.b_lblkno = blkno;
1034 bp = RBT_FIND(buf_rb_bufs, &vp->v_bufs_tree, &b)buf_rb_bufs_RBT_FIND(&vp->v_bufs_tree, &b);
1035 if (bp != NULL((void *)0)) {
1036 if (ISSET(bp->b_flags, B_BUSY)((bp->b_flags) & (0x00000010))) {
1037 SET(bp->b_flags, B_WANTED)((bp->b_flags) |= (0x00010000));
1038 error = tsleep_nsec(bp, slpflag | (PRIBIO16 + 1),
1039 "getblk", slptimeo);
1040 splx(s)spllower(s);
1041 if (error)
1042 return (NULL((void *)0));
1043 goto start;
1044 }
1045
1046 if (!ISSET(bp->b_flags, B_INVAL)((bp->b_flags) & (0x00000800))) {
1047 bcstats.cachehits++;
1048 SET(bp->b_flags, B_CACHE)((bp->b_flags) |= (0x00000020));
1049 bufcache_take(bp);
1050 buf_acquire(bp);
1051 splx(s)spllower(s);
1052 return (bp);
1053 }
1054 }
1055 splx(s)spllower(s);
1056
1057 if ((bp = buf_get(vp, blkno, size)) == NULL((void *)0))
1058 goto start;
1059
1060 return (bp);
1061}
1062
1063/*
1064 * Get an empty, disassociated buffer of given size.
1065 */
1066struct buf *
1067geteblk(size_t size)
1068{
1069 struct buf *bp;
1070
1071 while ((bp = buf_get(NULL((void *)0), 0, size)) == NULL((void *)0))
1072 continue;
1073
1074 return (bp);
1075}
1076
1077/*
1078 * Allocate a buffer.
1079 * If vp is given, put it into the buffer cache for that vnode.
1080 * If size != 0, allocate memory and call buf_map().
1081 * If there is already a buffer for the given vnode/blkno, return NULL.
1082 */
1083struct buf *
1084buf_get(struct vnode *vp, daddr_t blkno, size_t size)
1085{
1086 struct buf *bp;
1087 int poolwait = size == 0 ? PR_NOWAIT0x0002 : PR_WAITOK0x0001;
1088 int npages;
1089 int s;
1090
1091 s = splbio()splraise(0x3);
1092 if (size) {
1093 /*
1094 * Wake up the cleaner if we have lots of dirty pages,
1095 * or if we are getting low on buffer cache kva.
1096 */
1097 if (UNCLEAN_PAGES(bcstats.numbufpages - bcstats.numcleanpages) >= hidirtypages ||
1098 bcstats.kvaslots_avail <= 2 * RESERVE_SLOTS4)
1099 wakeup(&bd_req);
1100
1101 npages = atop(round_page(size))(((((size) + ((1 << 12) - 1)) & ~((1 << 12) -
1))) >> 12)
;
1102
1103 /*
1104 * if our cache has been previously shrunk,
1105 * allow it to grow again with use up to
1106 * bufhighpages (cachepercent)
1107 */
1108 if (bufpages < bufhighpages)
1109 bufadjust(bufhighpages);
1110
1111 /*
1112 * If we would go over the page target with our
1113 * new allocation, free enough buffers first
1114 * to stay at the target with our new allocation.
1115 */
1116 if (bcstats.dmapages + npages > targetpages) {
1117 (void) bufcache_recover_dmapages(0, npages);
1118 bufcache_adjust();
1119 }
1120
1121 /*
1122 * If we get here, we tried to free the world down
1123 * above, and couldn't get down - Wake the cleaner
1124 * and wait for it to push some buffers out.
1125 */
1126 if ((bcstats.dmapages + npages > targetpages ||
1127 bcstats.kvaslots_avail <= RESERVE_SLOTS4) &&
1128 curproc({struct cpu_info *__ci; asm volatile("movq %%gs:%P1,%0" : "=r"
(__ci) :"n" (__builtin_offsetof(struct cpu_info, ci_self)));
__ci;})->ci_curproc
!= syncerproc && curproc({struct cpu_info *__ci; asm volatile("movq %%gs:%P1,%0" : "=r"
(__ci) :"n" (__builtin_offsetof(struct cpu_info, ci_self)));
__ci;})->ci_curproc
!= cleanerproc) {
1129 wakeup(&bd_req);
1130 needbuffer++;
1131 tsleep_nsec(&needbuffer, PRIBIO16, "needbuffer", INFSLP0xffffffffffffffffULL);
1132 splx(s)spllower(s);
1133 return (NULL((void *)0));
1134 }
1135 if (bcstats.dmapages + npages > bufpages) {
1136 /* cleaner or syncer */
1137 nobuffers = 1;
1138 tsleep_nsec(&nobuffers, PRIBIO16, "nobuffers", INFSLP0xffffffffffffffffULL);
1139 splx(s)spllower(s);
1140 return (NULL((void *)0));
1141 }
1142 }
1143
1144 bp = pool_get(&bufpool, poolwait|PR_ZERO0x0008);
1145
1146 if (bp == NULL((void *)0)) {
1147 splx(s)spllower(s);
1148 return (NULL((void *)0));
1149 }
1150
1151 bp->b_freelist.tqe_next = NOLIST((struct buf *)0x87654321);
1152 bp->b_dev = NODEV(dev_t)(-1);
1153 LIST_INIT(&bp->b_dep)do { ((&bp->b_dep)->lh_first) = ((void *)0); } while
(0)
;
1154 bp->b_bcount = size;
1155
1156 buf_acquire_nomap(bp);
1157
1158 if (vp != NULL((void *)0)) {
1159 /*
1160 * We insert the buffer into the hash with B_BUSY set
1161 * while we allocate pages for it. This way any getblk
1162 * that happens while we allocate pages will wait for
1163 * this buffer instead of starting its own buf_get.
1164 *
1165 * But first, we check if someone beat us to it.
1166 */
1167 if (incore_locked(vp, blkno)) {
1168 pool_put(&bufpool, bp);
1169 splx(s)spllower(s);
1170 return (NULL((void *)0));
1171 }
1172
1173 bp->b_blkno = bp->b_lblkno = blkno;
1174 bgetvp(vp, bp);
1175 if (RBT_INSERT(buf_rb_bufs, &vp->v_bufs_tree, bp)buf_rb_bufs_RBT_INSERT(&vp->v_bufs_tree, bp))
1176 panic("buf_get: dup lblk vp %p bp %p", vp, bp);
1177 } else {
1178 bp->b_vnbufs.le_next = NOLIST((struct buf *)0x87654321);
1179 SET(bp->b_flags, B_INVAL)((bp->b_flags) |= (0x00000800));
1180 bp->b_vp = NULL((void *)0);
1181 }
1182
1183 LIST_INSERT_HEAD(&bufhead, bp, b_list)do { if (((bp)->b_list.le_next = (&bufhead)->lh_first
) != ((void *)0)) (&bufhead)->lh_first->b_list.le_prev
= &(bp)->b_list.le_next; (&bufhead)->lh_first =
(bp); (bp)->b_list.le_prev = &(&bufhead)->lh_first
; } while (0)
;
1184 bcstats.numbufs++;
1185
1186 if (size) {
1187 buf_alloc_pages(bp, round_page(size)(((size) + ((1 << 12) - 1)) & ~((1 << 12) - 1
))
);
1188 KASSERT(ISSET(bp->b_flags, B_DMA))((((bp->b_flags) & (0x04000000))) ? (void)0 : __assert
("diagnostic ", "/usr/src/sys/kern/vfs_bio.c", 1188, "ISSET(bp->b_flags, B_DMA)"
))
;
1189 buf_map(bp);
1190 }
1191
1192 SET(bp->b_flags, B_BC)((bp->b_flags) |= (0x02000000));
1193 splx(s)spllower(s);
1194
1195 return (bp);
1196}
1197
1198/*
1199 * Buffer cleaning daemon.
1200 */
1201void
1202buf_daemon(void *arg)
1203{
1204 struct buf *bp = NULL((void *)0);
1205 int s, pushed = 0;
1206
1207 s = splbio()splraise(0x3);
1208 for (;;) {
1209 if (bp == NULL((void *)0) || (pushed >= 16 &&
1210 UNCLEAN_PAGES(bcstats.numbufpages - bcstats.numcleanpages) < hidirtypages &&
1211 bcstats.kvaslots_avail > 2 * RESERVE_SLOTS4)){
1212 pushed = 0;
1213 /*
1214 * Wake up anyone who was waiting for buffers
1215 * to be released.
1216 */
1217 if (needbuffer) {
1218 needbuffer = 0;
1219 wakeup(&needbuffer);
1220 }
1221 tsleep_nsec(&bd_req, PRIBIO16 - 7, "cleaner", INFSLP0xffffffffffffffffULL);
1222 }
1223
1224 while ((bp = bufcache_getdirtybuf())) {
1225 TRACEPOINT(vfs, cleaner, bp->b_flags, pushed,do { extern struct dt_probe (dt_static_vfs_cleaner); struct dt_probe
*dtp = &(dt_static_vfs_cleaner); if (__builtin_expect(((
dt_tracing) != 0), 0) && __builtin_expect(((dtp->dtp_recording
) != 0), 0)) { struct dt_provider *dtpv = dtp->dtp_prov; dtpv
->dtpv_enter(dtpv, dtp, bp->b_flags, pushed, lodirtypages
, hidirtypages); } } while (0)
1226 lodirtypages, hidirtypages)do { extern struct dt_probe (dt_static_vfs_cleaner); struct dt_probe
*dtp = &(dt_static_vfs_cleaner); if (__builtin_expect(((
dt_tracing) != 0), 0) && __builtin_expect(((dtp->dtp_recording
) != 0), 0)) { struct dt_provider *dtpv = dtp->dtp_prov; dtpv
->dtpv_enter(dtpv, dtp, bp->b_flags, pushed, lodirtypages
, hidirtypages); } } while (0)
;
1227
1228 if (UNCLEAN_PAGES(bcstats.numbufpages - bcstats.numcleanpages) < lodirtypages &&
1229 bcstats.kvaslots_avail > 2 * RESERVE_SLOTS4 &&
1230 pushed >= 16)
1231 break;
1232
1233 bufcache_take(bp);
1234 buf_acquire(bp);
1235 splx(s)spllower(s);
1236
1237 if (ISSET(bp->b_flags, B_INVAL)((bp->b_flags) & (0x00000800))) {
1238 brelse(bp);
1239 s = splbio()splraise(0x3);
1240 continue;
1241 }
1242#ifdef DIAGNOSTIC1
1243 if (!ISSET(bp->b_flags, B_DELWRI)((bp->b_flags) & (0x00000080)))
1244 panic("Clean buffer on dirty queue");
1245#endif
1246 if (LIST_FIRST(&bp->b_dep)((&bp->b_dep)->lh_first) != NULL((void *)0) &&
1247 !ISSET(bp->b_flags, B_DEFERRED)((bp->b_flags) & (0x00080000)) &&
1248 buf_countdeps(bp, 0, 0)) {
1249 SET(bp->b_flags, B_DEFERRED)((bp->b_flags) |= (0x00080000));
1250 s = splbio()splraise(0x3);
1251 bufcache_release(bp);
1252 buf_release(bp);
1253 continue;
1254 }
1255
1256 bawrite(bp);
1257 pushed++;
1258
1259 sched_pause(yield)do { if (({struct cpu_info *__ci; asm volatile("movq %%gs:%P1,%0"
: "=r" (__ci) :"n" (__builtin_offsetof(struct cpu_info, ci_self
))); __ci;})->ci_schedstate.spc_schedflags & 0x0002) yield
(); } while (0)
;
1260
1261 s = splbio()splraise(0x3);
1262 }
1263 }
1264}
1265
1266/*
1267 * Wait for operations on the buffer to complete.
1268 * When they do, extract and return the I/O's error value.
1269 */
1270int
1271biowait(struct buf *bp)
1272{
1273 int s;
1274
1275 KASSERT(!(bp->b_flags & B_ASYNC))((!(bp->b_flags & 0x00000004)) ? (void)0 : __assert("diagnostic "
, "/usr/src/sys/kern/vfs_bio.c", 1275, "!(bp->b_flags & B_ASYNC)"
))
;
1276
1277 s = splbio()splraise(0x3);
1278 while (!ISSET(bp->b_flags, B_DONE)((bp->b_flags) & (0x00000100)))
1279 tsleep_nsec(bp, PRIBIO16 + 1, "biowait", INFSLP0xffffffffffffffffULL);
1280 splx(s)spllower(s);
1281
1282 /* check for interruption of I/O (e.g. via NFS), then errors. */
1283 if (ISSET(bp->b_flags, B_EINTR)((bp->b_flags) & (0x00000200))) {
1284 CLR(bp->b_flags, B_EINTR)((bp->b_flags) &= ~(0x00000200));
1285 return (EINTR4);
1286 }
1287
1288 if (ISSET(bp->b_flags, B_ERROR)((bp->b_flags) & (0x00000400)))
1289 return (bp->b_error ? bp->b_error : EIO5);
1290 else
1291 return (0);
1292}
1293
1294/*
1295 * Mark I/O complete on a buffer.
1296 *
1297 * If a callback has been requested, e.g. the pageout
1298 * daemon, do so. Otherwise, awaken waiting processes.
1299 *
1300 * [ Leffler, et al., says on p.247:
1301 * "This routine wakes up the blocked process, frees the buffer
1302 * for an asynchronous write, or, for a request by the pagedaemon
1303 * process, invokes a procedure specified in the buffer structure" ]
1304 *
1305 * In real life, the pagedaemon (or other system processes) wants
1306 * to do async stuff to, and doesn't want the buffer brelse()'d.
1307 * (for swap pager, that puts swap buffers on the free lists (!!!),
1308 * for the vn device, that puts malloc'd buffers on the free lists!)
1309 *
1310 * Must be called at splbio().
1311 */
1312void
1313biodone(struct buf *bp)
1314{
1315 splassert(IPL_BIO)do { if (splassert_ctl > 0) { splassert_check(0x3, __func__
); } } while (0)
;
1316
1317 if (ISSET(bp->b_flags, B_DONE)((bp->b_flags) & (0x00000100)))
1318 panic("biodone already");
1319 SET(bp->b_flags, B_DONE)((bp->b_flags) |= (0x00000100)); /* note that it's done */
1320
1321 if (bp->b_bq)
1322 bufq_done(bp->b_bq, bp);
1323
1324 if (LIST_FIRST(&bp->b_dep)((&bp->b_dep)->lh_first) != NULL((void *)0))
1325 buf_complete(bp);
1326
1327 if (!ISSET(bp->b_flags, B_READ)((bp->b_flags) & (0x00008000))) {
1328 CLR(bp->b_flags, B_WRITEINPROG)((bp->b_flags) &= ~(0x00020000));
1329 vwakeup(bp->b_vp);
1330 }
1331 if (bcstats.numbufs &&
1332 (!(ISSET(bp->b_flags, B_RAW)((bp->b_flags) & (0x00004000)) || ISSET(bp->b_flags, B_PHYS)((bp->b_flags) & (0x00002000))))) {
1333 if (!ISSET(bp->b_flags, B_READ)((bp->b_flags) & (0x00008000))) {
1334 bcstats.pendingwrites--;
1335 } else
1336 bcstats.pendingreads--;
1337 }
1338 if (ISSET(bp->b_flags, B_CALL)((bp->b_flags) & (0x00000040))) { /* if necessary, call out */
1339 CLR(bp->b_flags, B_CALL)((bp->b_flags) &= ~(0x00000040)); /* but note callout done */
1340 (*bp->b_iodone)(bp);
1341 } else {
1342 if (ISSET(bp->b_flags, B_ASYNC)((bp->b_flags) & (0x00000004))) {/* if async, release it */
1343 brelse(bp);
1344 } else { /* or just wakeup the buffer */
1345 CLR(bp->b_flags, B_WANTED)((bp->b_flags) &= ~(0x00010000));
1346 wakeup(bp);
1347 }
1348 }
1349}
1350
1351#ifdef DDB1
1352void bcstats_print(int (*)(const char *, ...)
1353 __attribute__((__format__(__kprintf__,1,2))));
1354/*
1355 * bcstats_print: ddb hook to print interesting buffer cache counters
1356 */
1357void
1358bcstats_print(
1359 int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))))
1360{
1361 (*pr)("Current Buffer Cache status:\n");
1362 (*pr)("numbufs %lld busymapped %lld, delwri %lld\n",
1363 bcstats.numbufs, bcstats.busymapped, bcstats.delwribufs);
1364 (*pr)("kvaslots %lld avail kva slots %lld\n",
1365 bcstats.kvaslots, bcstats.kvaslots_avail);
1366 (*pr)("bufpages %lld, dmapages %lld, dirtypages %lld\n",
1367 bcstats.numbufpages, bcstats.dmapages, bcstats.numdirtypages);
1368 (*pr)("pendingreads %lld, pendingwrites %lld\n",
1369 bcstats.pendingreads, bcstats.pendingwrites);
1370 (*pr)("highflips %lld, highflops %lld, dmaflips %lld\n",
1371 bcstats.highflips, bcstats.highflops, bcstats.dmaflips);
1372}
1373#endif
1374
1375void
1376buf_adjcnt(struct buf *bp, long ncount)
1377{
1378 KASSERT(ncount <= bp->b_bufsize)((ncount <= bp->b_bufsize) ? (void)0 : __assert("diagnostic "
, "/usr/src/sys/kern/vfs_bio.c", 1378, "ncount <= bp->b_bufsize"
))
;
1379 bp->b_bcount = ncount;
1380}
1381
1382/* bufcache freelist code below */
1383/*
1384 * Copyright (c) 2014 Ted Unangst <tedu@openbsd.org>
1385 *
1386 * Permission to use, copy, modify, and distribute this software for any
1387 * purpose with or without fee is hereby granted, provided that the above
1388 * copyright notice and this permission notice appear in all copies.
1389 *
1390 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
1391 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
1392 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
1393 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
1394 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
1395 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
1396 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
1397 */
1398
1399/*
1400 * The code below implements a variant of the 2Q buffer cache algorithm by
1401 * Johnson and Shasha.
1402 *
1403 * General Outline
1404 * We divide the buffer cache into three working sets: current, previous,
1405 * and long term. Each list is itself LRU and buffers get promoted and moved
1406 * around between them. A buffer starts its life in the current working set.
1407 * As time passes and newer buffers push it out, it will turn into the previous
1408 * working set and is subject to recycling. But if it's accessed again from
1409 * the previous working set, that's an indication that it's actually in the
1410 * long term working set, so we promote it there. The separation of current
1411 * and previous working sets prevents us from promoting a buffer that's only
1412 * temporarily hot to the long term cache.
1413 *
1414 * The objective is to provide scan resistance by making the long term
1415 * working set ineligible for immediate recycling, even as the current
1416 * working set is rapidly turned over.
1417 *
1418 * Implementation
1419 * The code below identifies the current, previous, and long term sets as
1420 * hotqueue, coldqueue, and warmqueue. The hot and warm queues are capped at
1421 * 1/3 of the total clean pages, after which point they start pushing their
1422 * oldest buffers into coldqueue.
1423 * A buf always starts out with neither WARM or COLD flags set (implying HOT).
1424 * When released, it will be returned to the tail of the hotqueue list.
1425 * When the hotqueue gets too large, the oldest hot buf will be moved to the
1426 * coldqueue, with the B_COLD flag set. When a cold buf is released, we set
1427 * the B_WARM flag and put it onto the warmqueue. Warm bufs are also
1428 * directly returned to the end of the warmqueue. As with the hotqueue, when
1429 * the warmqueue grows too large, B_WARM bufs are moved onto the coldqueue.
1430 *
1431 * Note that this design does still support large working sets, greater
1432 * than the cap of hotqueue or warmqueue would imply. The coldqueue is still
1433 * cached and has no maximum length. The hot and warm queues form a Y feeding
1434 * into the coldqueue. Moving bufs between queues is constant time, so this
1435 * design decays to one long warm->cold queue.
1436 *
1437 * In the 2Q paper, hotqueue and coldqueue are A1in and A1out. The warmqueue
1438 * is Am. We always cache pages, as opposed to pointers to pages for A1.
1439 *
1440 * This implementation adds support for multiple 2q caches.
1441 *
1442 * If we have more than one 2q cache, as bufs fall off the cold queue
1443 * for recycling, bufs that have been warm before (which retain the
1444 * B_WARM flag in addition to B_COLD) can be put into the hot queue of
1445 * a second level 2Q cache. buffers which are only B_COLD are
1446 * recycled. Bufs falling off the last cache's cold queue are always
1447 * recycled.
1448 *
1449 */
1450
1451/*
1452 * this function is called when a hot or warm queue may have exceeded its
1453 * size limit. it will move a buf to the coldqueue.
1454 */
1455int chillbufs(struct
1456 bufcache *cache, struct bufqueue *queue, int64_t *queuepages);
1457
1458void
1459bufcache_init(void)
1460{
1461 int i;
1462
1463 for (i = 0; i < NUM_CACHES; i++) {
1464 TAILQ_INIT(&cleancache[i].hotqueue);
1465 TAILQ_INIT(&cleancache[i].coldqueue);
1466 TAILQ_INIT(&cleancache[i].warmqueue);
1467 }
1468 TAILQ_INIT(&dirtyqueue);
1469}
1470
1471/*
1472 * if the buffer caches have shrunk, we may need to rebalance our queues.
1473 */
1474void
1475bufcache_adjust(void)
1476{
1477 int i;
1478
1479 for (i = 0; i < NUM_CACHES; i++) {
1480 while (chillbufs(&cleancache[i], &cleancache[i].warmqueue,
1481 &cleancache[i].warmbufpages) ||
1482 chillbufs(&cleancache[i], &cleancache[i].hotqueue,
1483 &cleancache[i].hotbufpages))
1484 continue;
1485 }
1486}
1487
1488/*
1489 * Get a clean buffer from the cache. If "discard" is set, do not promote
1490 * previously warm buffers as normal, because we are tossing everything
1491 * away, such as during hibernation.
1492 */
1493struct buf *
1494bufcache_getcleanbuf(int cachenum, int discard)
1495{
1496 struct buf *bp = NULL;
1497 struct bufcache *cache = &cleancache[cachenum];
1498 struct bufqueue * queue;
1499
1500 splassert(IPL_BIO);
1501
1502 /* try cold queue */
1503 while ((bp = TAILQ_FIRST(&cache->coldqueue)) ||
1504 (bp = TAILQ_FIRST(&cache->warmqueue)) ||
1505 (bp = TAILQ_FIRST(&cache->hotqueue))) {
1506 int64_t pages = atop(bp->b_bufsize);
1507 struct bufcache *newcache;
1508
1509 if (discard || cachenum >= NUM_CACHES - 1) {
1510 /* Victim selected, give it up */
1511 return bp;
1512 }
1513 KASSERT(bp->cache == cachenum);
1514
1515 /*
1516 * If this buffer was warm before, move it to
1517 * the hot queue in the next cache
1518 */
1519
1520 if (fliphigh) {
1521 /*
1522 * If we are in the DMA cache, try to flip the
1523 * buffer up high to move it on to the other
1524 * caches. If we can't move the buffer to high
1525 * memory without sleeping, we give it up and
1526 * return it rather than fight for more memory
1527 * against non-buffer-cache competitors.
1528 */
1529 SET(bp->b_flags, B_BUSY);
1530 if (bp->cache == 0 && buf_flip_high(bp) == -1) {
1531 CLR(bp->b_flags, B_BUSY);
1532 return bp;
1533 }
1534 CLR(bp->b_flags, B_BUSY);
1535 }
1536
1537 /* Move the buffer to the hot queue in the next cache */
1538 if (ISSET(bp->b_flags, B_COLD)) {
1539 queue = &cache->coldqueue;
1540 } else if (ISSET(bp->b_flags, B_WARM)) {
1541 queue = &cache->warmqueue;
1542 cache->warmbufpages -= pages;
1543 } else {
1544 queue = &cache->hotqueue;
1545 cache->hotbufpages -= pages;
1546 }
1547 TAILQ_REMOVE(queue, bp, b_freelist);
1548 cache->cachepages -= pages;
1549 CLR(bp->b_flags, B_WARM);
1550 CLR(bp->b_flags, B_COLD);
1551 bp->cache++;
1552 newcache = &cleancache[bp->cache];
1553 newcache->cachepages += pages;
1554 newcache->hotbufpages += pages;
1555 chillbufs(newcache, &newcache->hotqueue,
1556 &newcache->hotbufpages);
1557 TAILQ_INSERT_TAIL(&newcache->hotqueue, bp, b_freelist);
1558 }
1559 return bp;
1560}
1561
1562
1563void
1564discard_buffer(struct buf *bp)
1565{
1566 splassert(IPL_BIO);
1567
1568 bufcache_take(bp);
1569 if (bp->b_vp) {
1570 RBT_REMOVE(buf_rb_bufs,
1571 &bp->b_vp->v_bufs_tree, bp);
1572 brelvp(bp);
1573 }
1574 buf_put(bp);
1575}
1576
1577int64_t
1578bufcache_recover_dmapages(int discard, int64_t howmany)
1579{
1580 struct buf *bp = NULL;
1581 struct bufcache *cache = &cleancache[DMA_CACHE];
1582 struct bufqueue * queue;
1583 int64_t recovered = 0;
1584
1585 splassert(IPL_BIO);
1586
1587 while ((recovered < howmany) &&
1588 ((bp = TAILQ_FIRST(&cache->coldqueue)) ||
1589 (bp = TAILQ_FIRST(&cache->warmqueue)) ||
1590 (bp = TAILQ_FIRST(&cache->hotqueue)))) {
1591 int64_t pages = atop(bp->b_bufsize);
1592 struct bufcache *newcache;
1593
1594 if (discard || DMA_CACHE >= NUM_CACHES - 1) {
1595 discard_buffer(bp);
1596 continue;
1597 }
1598 KASSERT(bp->cache == DMA_CACHE);
1599
1600 /*
1601 * If this buffer was warm before, move it to
1602 * the hot queue in the next cache
1603 */
1604
1605 /*
1606 * One way or another, the pages for this
1607 * buffer are leaving DMA memory
1608 */
1609 recovered += pages;
1610
1611 if (!fliphigh) {
1612 discard_buffer(bp);
1613 continue;
1614 }
1615
1616 /*
1617 * If we are in the DMA cache, try to flip the
1618 * buffer up high to move it on to the other
1619 * caches. If we can't move the buffer to high
1620 * memory without sleeping, we give it up
1621 * now rather than fight for more memory
1622 * against non-buffer-cache competitors.
1623 */
1624 SET(bp->b_flags, B_BUSY);
1625 if (bp->cache == 0 && buf_flip_high(bp) == -1) {
1626 CLR(bp->b_flags, B_BUSY);
1627 discard_buffer(bp);
1628 continue;
1629 }
1630 CLR(bp->b_flags, B_BUSY);
1631
1632 /*
1633 * Move the buffer to the hot queue in the next cache
1634 */
1635 if (ISSET(bp->b_flags, B_COLD)) {
1636 queue = &cache->coldqueue;
1637 } else if (ISSET(bp->b_flags, B_WARM)) {
1638 queue = &cache->warmqueue;
1639 cache->warmbufpages -= pages;
1640 } else {
1641 queue = &cache->hotqueue;
1642 cache->hotbufpages -= pages;
1643 }
1644 TAILQ_REMOVE(queue, bp, b_freelist);
1645 cache->cachepages -= pages;
1646 CLR(bp->b_flags, B_WARM);
1647 CLR(bp->b_flags, B_COLD);
1648 bp->cache++;
1649 newcache = &cleancache[bp->cache];
1650 newcache->cachepages += pages;
1651 newcache->hotbufpages += pages;
1652 chillbufs(newcache, &newcache->hotqueue,
1653 &newcache->hotbufpages);
1654 TAILQ_INSERT_TAIL(&newcache->hotqueue, bp, b_freelist);
1655 }
1656 return recovered;
1657}
1658
1659struct buf *
1660bufcache_getcleanbuf_range(int start, int end, int discard)
1661{
1662 int i, j = start, q = end;
1663 struct buf *bp = NULL;
1664
1665 /*
1666 * XXX in theory we could promote warm buffers into a previous queue,
1667 * so in the pathological case where we go through all the caches
1668 * without getting a buffer, we have to start at the beginning again.
1669 */
1670 while (j <= q) {
1671 for (i = q; i >= j; i--)
1672 if ((bp = bufcache_getcleanbuf(i, discard)))
1673 return (bp);
1674 j++;
1675 }
1676 return bp;
1677}
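
The XXX comment above describes the retry behaviour; the loop itself scans from the highest cache index down to j, then narrows the range from below. A small standalone sketch of just that iteration order, assuming three caches (start = 0, end = 2) purely for illustration:

#include <stdio.h>

/*
 * Illustration only: prints the order in which bufcache_getcleanbuf_range()
 * would try the caches if every bufcache_getcleanbuf() call failed,
 * for a hypothetical start = 0 and end = 2.
 */
int
main(void)
{
	int start = 0, end = 2;
	int i, j = start, q = end;

	while (j <= q) {
		for (i = q; i >= j; i--)
			printf("try cache %d\n", i);
		j++;
	}
	/* Output order: 2 1 0, then 2 1, then 2. */
	return 0;
}
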
1678
1679struct buf *
1680bufcache_gethighcleanbuf(void)
1681{
1682 if (!fliphigh)
1683 return NULL;
1684 return bufcache_getcleanbuf_range(DMA_CACHE + 1, NUM_CACHES - 1, 0);
1685}
1686
1687
1688struct buf *
1689bufcache_getdmacleanbuf(void)
1690{
1691 if (fliphigh)
1692 return bufcache_getcleanbuf_range(DMA_CACHE, DMA_CACHE, 0);
1693 return bufcache_getcleanbuf_range(DMA_CACHE, NUM_CACHES - 1, 0);
1694}
1695
1696
1697struct buf *
1698bufcache_getdirtybuf(void)
1699{
1700 return TAILQ_FIRST(&dirtyqueue);
1701}
1702
1703void
1704bufcache_take(struct buf *bp)
1705{
1706 struct bufqueue *queue;
1707 int64_t pages;
1708
1709 splassert(IPL_BIO);
1710 KASSERT(ISSET(bp->b_flags, B_BC));
1711 KASSERT(bp->cache >= DMA_CACHE);
1712 KASSERT((bp->cache < NUM_CACHES));
1713
1714 pages = atop(bp->b_bufsize);
1715
1716 TRACEPOINT(vfs, bufcache_take, bp->b_flags, bp->cache, pages);
1717
1718 struct bufcache *cache = &cleancache[bp->cache];
1719 if (!ISSET(bp->b_flags, B_DELWRI)) {
1720 if (ISSET(bp->b_flags, B_COLD)) {
1721 queue = &cache->coldqueue;
1722 } else if (ISSET(bp->b_flags, B_WARM)) {
1723 queue = &cache->warmqueue;
1724 cache->warmbufpages -= pages;
1725 } else {
1726 queue = &cache->hotqueue;
1727 cache->hotbufpages -= pages;
1728 }
1729 bcstats.numcleanpages -= pages;
1730 cache->cachepages -= pages;
1731 } else {
1732 queue = &dirtyqueue;
1733 bcstats.numdirtypages -= pages;
1734 bcstats.delwribufs--;
1735 }
1736 TAILQ_REMOVE(queue, bp, b_freelist);
1737}
1738
1739/* Move a buf from a hot or warm queue to the cold queue in a cache */
1740int
1741chillbufs(struct bufcache *cache, struct bufqueue *queue, int64_t *queuepages)
1742{
1743 struct buf *bp;
1744 int64_t limit, pages;
1745
1746 /*
1747 * We limit the hot queue to be small, with a max of 4096 pages.
1748 * We limit the warm queue to half the cache size.
1749 *
1750 * We impose a minimum size of 96 to prevent too much "wobbling".
1751 */
1752 if (queue == &cache->hotqueue)
1753 limit = min(cache->cachepages / 20, 4096);
1754 else if (queue == &cache->warmqueue)
1755 limit = (cache->cachepages / 2);
1756 else
1757 panic("chillbufs: invalid queue");
1758
1759 if (*queuepages > 96 && *queuepages > limit) {
1760 bp = TAILQ_FIRST(queue);
1761 if (!bp)
1762 panic("inconsistent bufpage counts");
1763 pages = atop(bp->b_bufsize);
1764 *queuepages -= pages;
1765 TAILQ_REMOVE(queue, bp, b_freelist);
1766 /* we do not clear B_WARM */
1767 SET(bp->b_flags, B_COLD);
1768 TAILQ_INSERT_TAIL(&cache->coldqueue, bp, b_freelist);
1769 return 1;
1770 }
1771 return 0;
1772}
1773
1774void
1775bufcache_release(struct buf *bp)
1776{
1777 struct bufqueue *queue;
1778 int64_t pages;
1779 struct bufcache *cache = &cleancache[bp->cache];
1780
1781 KASSERT(ISSET(bp->b_flags, B_BC));
1782 pages = atop(bp->b_bufsize);
1783
1784 TRACEPOINT(vfs, bufcache_rel, bp->b_flags, bp->cache, pages);
1785
1786 if (fliphigh) {
1787 if (ISSET(bp->b_flags, B_DMA) && bp->cache > 0)
1788 panic("B_DMA buffer release from cache %d",
1789 bp->cache);
1790 else if ((!ISSET(bp->b_flags, B_DMA)) && bp->cache == 0)
1791 panic("Non B_DMA buffer release from cache %d",
1792 bp->cache);
1793 }
1794
1795 if (!ISSET(bp->b_flags, B_DELWRI)) {
1796 int64_t *queuepages;
1797 if (ISSET(bp->b_flags, B_WARM | B_COLD)) {
1798 SET(bp->b_flags, B_WARM);
1799 CLR(bp->b_flags, B_COLD);
1800 queue = &cache->warmqueue;
1801 queuepages = &cache->warmbufpages;
1802 } else {
1803 queue = &cache->hotqueue;
1804 queuepages = &cache->hotbufpages;
1805 }
1806 *queuepages += pages;
1807 bcstats.numcleanpages += pages;
1808 cache->cachepages += pages;
1809 chillbufs(cache, queue, queuepages);
1810 } else {
1811 queue = &dirtyqueue;
1812 bcstats.numdirtypages += pages;
1813 bcstats.delwribufs++;
1814 }
1815 TAILQ_INSERT_TAIL(queue, bp, b_freelist);
1816}
1817
1818#ifdef HIBERNATE
1819/*
1820 * Nuke the buffer cache from orbit when hibernating. We do not want to save
1821 * any clean cache pages to swap and read them back. The original disk files
1822 * are just as good.
1823 */
1824void
1825hibernate_suspend_bufcache(void)
1826{
1827 struct buf *bp;
1828 int s;
1829
1830 s = splbio();
1831 /* Chuck away all the cache pages.. discard bufs, do not promote */
1832 while ((bp = bufcache_getcleanbuf_range(DMA_CACHE, NUM_CACHES - 1, 1))) {
1833 bufcache_take(bp);
1834 if (bp->b_vp) {
1835 RBT_REMOVE(buf_rb_bufs, &bp->b_vp->v_bufs_tree, bp);
1836 brelvp(bp);
1837 }
1838 buf_put(bp);
1839 }
1840 splx(s);
1841}
1842
1843void
1844hibernate_resume_bufcache(void)
1845{
1846 /* XXX Nothing needed here for now */
1847}
1848#endif /* HIBERNATE */