/*
 * Copyright (c) 2019-2020 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Scooter Morris at Genentech Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)ufs_lockf.c	8.3 (Berkeley) 1/6/94
 */

#include <sys/cdefs.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/signalvar.h>
#include <sys/unistd.h>
#include <sys/user.h>
#include <sys/vnode.h>
#include <sys/vnode_internal.h>
#include <sys/vnode_if.h>
#include <sys/malloc.h>
#include <sys/fcntl.h>
#include <sys/lockf.h>
#include <sys/sdt.h>
#include <kern/policy_internal.h>

#include <sys/file_internal.h>

#if (DEVELOPMENT || DEBUG)
#define LOCKF_DEBUGGING 1
#endif

#ifdef LOCKF_DEBUGGING
#include <sys/sysctl.h>
void lf_print(const char *tag, struct lockf *lock);
void lf_printlist(const char *tag, struct lockf *lock);

#define LF_DBG_LOCKOP   (1 << 0)        /* setlk, getlk, clearlk */
#define LF_DBG_LIST     (1 << 1)        /* split, coalesce */
#define LF_DBG_IMPINH   (1 << 2)        /* importance inheritance */
#define LF_DBG_TRACE    (1 << 3)        /* errors, exit */
#define LF_DBG_DEADLOCK (1 << 4)        /* deadlock detection */

static int lockf_debug = 0;     /* was 2, could be 3 ;-) */
SYSCTL_INT(_debug, OID_AUTO, lockf_debug, CTLFLAG_RW | CTLFLAG_LOCKED, &lockf_debug, 0, "");

/*
 * If the selector is set, then output the debugging diagnostic.
 */
#define LOCKF_DEBUG(mask, ...)                          \
	do {                                            \
	        if ((mask) & lockf_debug) {             \
	                printf("%s>", __FUNCTION__);    \
	                printf(__VA_ARGS__);            \
	        }                                       \
	} while (0)

#define LOCKF_DEBUGP(mask)                              \
	({                                              \
	        ((mask) & lockf_debug);                 \
	})
#else   /* !LOCKF_DEBUGGING */
#define LOCKF_DEBUG(mask, ...)  /* mask */
#endif  /* !LOCKF_DEBUGGING */

KALLOC_TYPE_DEFINE(KT_LOCKF, struct lockf, KT_PRIV_ACCT);

#define NOLOCKF (struct lockf *)0
#define SELF    0x1
#define OTHERS  0x2
#define OFF_MAX 0x7fffffffffffffffULL   /* max off_t */

/* return the effective end of a 'struct lockf': lf_end == -1 is OFF_MAX */
#define LF_END(l) ((l)->lf_end == -1 ? OFF_MAX : (l)->lf_end)

/*
 * Overlapping lock states
 *
 * For lf_findoverlap(..., SELF, ...), the possible sequences are a single:
 * - OVERLAP_NONE,
 * - OVERLAP_EQUALS_LOCK, or
 * - OVERLAP_CONTAINS_LOCK
 *
 * or the following sequence:
 * - optional OVERLAP_STARTS_BEFORE_LOCK
 * - zero or more OVERLAP_CONTAINED_BY_LOCK
 * - optional OVERLAP_ENDS_AFTER_LOCK
 * - OVERLAP_NONE
 *
 * In the annotations:
 * - the search lock is [SS, SE] and
 * - the returned overlap lock is [OS, OE].
 */
typedef enum {
	OVERLAP_NONE            = 0,
	OVERLAP_EQUALS_LOCK,            /* OS == SS && OE == SE */
	OVERLAP_CONTAINS_LOCK,          /* OS <= SS && OE >= SE */
	OVERLAP_CONTAINED_BY_LOCK,      /* OS >= SS && OE <= SE */
	OVERLAP_STARTS_BEFORE_LOCK,     /* OS < SS && OE >= SS */
	OVERLAP_ENDS_AFTER_LOCK         /* OS > SS && OE > SE */
} overlap_t;
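
/*
 * Worked example (illustrative, not from the original source): with an
 * existing lock [OS, OE] = [100, 199] on the list, a search lock
 * [SS, SE] classifies as:
 *
 *	[100, 199]  ->  OVERLAP_EQUALS_LOCK         (same range)
 *	[120, 149]  ->  OVERLAP_CONTAINS_LOCK       (overlap covers search)
 *	[ 50, 299]  ->  OVERLAP_CONTAINED_BY_LOCK   (search covers overlap)
 *	[150, 299]  ->  OVERLAP_STARTS_BEFORE_LOCK  (overlap begins first)
 *	[ 50, 149]  ->  OVERLAP_ENDS_AFTER_LOCK     (overlap runs past SE)
 *	[200, 299]  ->  OVERLAP_NONE                (adjacent, not overlapping;
 *	                                             see lf_coalesce_adjacent)
 */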

static int       lf_clearlock(struct lockf *);
static overlap_t lf_findoverlap(struct lockf *,
    struct lockf *, int, struct lockf ***, struct lockf **);
static struct lockf *lf_getblock(struct lockf *, pid_t);
static int       lf_getlock(struct lockf *, struct flock *, pid_t);
static int       lf_setlock(struct lockf *, struct timespec *);
static int       lf_split(struct lockf *, struct lockf *);
static void      lf_wakelock(struct lockf *, boolean_t);
#if IMPORTANCE_INHERITANCE
static void      lf_hold_assertion(task_t, struct lockf *);
static void      lf_jump_to_queue_head(struct lockf *, struct lockf *);
static void      lf_drop_assertion(struct lockf *);
static void      lf_boost_blocking_proc(struct lockf *, struct lockf *);
static void      lf_adjust_assertion(struct lockf *block);
#endif /* IMPORTANCE_INHERITANCE */

static LCK_GRP_DECLARE(lf_dead_lock_grp, "lf_dead_lock");
static LCK_MTX_DECLARE(lf_dead_lock, &lf_dead_lock_grp);

/*
 * lf_advlock
 *
 * Description:	Advisory record locking support
 *
 * Parameters:	ap			Argument pointer to a vnop_advlock_args
 *					argument descriptor structure for the
 *					lock operation to be attempted.
 *
 * Returns:	0			Success
 *		EOVERFLOW
 *		EINVAL
 *		ENOLCK			Number of locked regions exceeds limit
 *	lf_setlock:EAGAIN
 *	lf_setlock:EDEADLK
 *	lf_setlock:EINTR
 *	lf_setlock:ENOLCK
 *	lf_setlock:ETIMEDOUT
 *	lf_clearlock:ENOLCK
 *	vnode_size:???
 *
 * Notes:	We return ENOLCK when we run out of memory to support locks; as
 *		such, there is no specific expectation limit other than the
 *		amount of available resources.
 */
int
lf_advlock(struct vnop_advlock_args *ap)
{
	struct vnode *vp = ap->a_vp;
	struct flock *fl = ap->a_fl;
	vfs_context_t context = ap->a_context;
	struct lockf *lock;
	off_t start, end, oadd;
	u_quad_t size;
	int error;
	struct lockf **head = &vp->v_lockf;

	/* XXX HFS may need a !vnode_isreg(vp) EISDIR error here */

	/*
	 * Avoid the common case of unlocking when inode has no locks.
	 */
	if (*head == (struct lockf *)0) {
		if (ap->a_op != F_SETLK) {
			fl->l_type = F_UNLCK;
			LOCKF_DEBUG(LF_DBG_TRACE,
			    "lf_advlock: '%s' unlock without lock\n",
			    vfs_context_proc(context)->p_comm);
			return 0;
		}
	}

	/*
	 * Convert the flock structure into a start and end.
	 */
	switch (fl->l_whence) {
	case SEEK_SET:
	case SEEK_CUR:
		/*
		 * Caller is responsible for adding any necessary offset
		 * when SEEK_CUR is used.
		 */
		start = fl->l_start;
		break;

	case SEEK_END:

		/*
		 * It's OK to cast the u_quad_t to an off_t here, since they
		 * are the same storage size, and the value of the returned
		 * contents will never overflow into the sign bit.  We need to
		 * do this because we will use size to force range checks.
		 */
		if ((error = vnode_size(vp, (off_t *)&size, context))) {
			LOCKF_DEBUG(LF_DBG_TRACE,
			    "lf_advlock: vnode_getattr failed: %d\n", error);
			return error;
		}

		if (size > OFF_MAX ||
		    (fl->l_start > 0 &&
		    size > (u_quad_t)(OFF_MAX - fl->l_start))) {
			return EOVERFLOW;
		}
		start = size + fl->l_start;
		break;

	default:
		LOCKF_DEBUG(LF_DBG_TRACE, "lf_advlock: unknown whence %d\n",
		    fl->l_whence);
		return EINVAL;
	}
	if (start < 0) {
		LOCKF_DEBUG(LF_DBG_TRACE, "lf_advlock: start < 0 (%qd)\n",
		    start);
		return EINVAL;
	}
	if (fl->l_len < 0) {
		if (start == 0) {
			LOCKF_DEBUG(LF_DBG_TRACE,
			    "lf_advlock: len < 0 & start == 0\n");
			return EINVAL;
		}
		end = start - 1;
		start += fl->l_len;
		if (start < 0) {
			LOCKF_DEBUG(LF_DBG_TRACE,
			    "lf_advlock: start < 0 (%qd)\n", start);
			return EINVAL;
		}
	} else if (fl->l_len == 0) {
		end = -1;
	} else {
		oadd = fl->l_len - 1;
		if (oadd > (off_t)(OFF_MAX - start)) {
			LOCKF_DEBUG(LF_DBG_TRACE, "lf_advlock: overflow\n");
			return EOVERFLOW;
		}
		end = start + oadd;
	}
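	/*
	 * Worked example (illustrative, not from the original source):
	 * with l_whence = SEEK_SET, l_start = 100, l_len = 50, the code
	 * above yields the inclusive range [100, 149].  A negative length
	 * locks the bytes *before* l_start: l_start = 100, l_len = -10
	 * yields [90, 99].  l_len = 0 means "to end of file", represented
	 * internally by lf_end == -1 (see LF_END()).
	 */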
	/*
	 * Create the lockf structure
	 */
	lock = zalloc_flags(KT_LOCKF, Z_WAITOK | Z_NOFAIL);
	lock->lf_start = start;
	lock->lf_end = end;
	lock->lf_id = ap->a_id;
	lock->lf_vnode = vp;
	lock->lf_type = fl->l_type;
	lock->lf_head = head;
	lock->lf_next = (struct lockf *)0;
	TAILQ_INIT(&lock->lf_blkhd);
	lock->lf_flags = (short)ap->a_flags;
#if IMPORTANCE_INHERITANCE
	lock->lf_boosted = LF_NOT_BOOSTED;
#endif
	if (ap->a_flags & F_POSIX) {
		lock->lf_owner = (struct proc *)lock->lf_id;
	} else {
		lock->lf_owner = NULL;
	}

	if (ap->a_flags & F_FLOCK) {
		lock->lf_flags |= F_WAKE1_SAFE;
	}

	lck_mtx_lock(&vp->v_lock);      /* protect the lockf list */
	/*
	 * Do the requested operation.
	 */
	switch (ap->a_op) {
	case F_SETLK:
		/*
		 * For F_OFD_* locks, lf_id is the fileglob.
		 * Record an "lf_owner" iff this is a confined fd
		 * i.e. it cannot escape this process and will be
		 * F_UNLCKed before the owner exits.  (This is
		 * the implicit guarantee needed to ensure lf_owner
		 * remains a valid reference here.)
		 */
		if (ap->a_flags & F_OFD_LOCK) {
			struct fileglob *fg = (void *)lock->lf_id;
			if (fg->fg_lflags & FG_CONFINED) {
				lock->lf_owner = current_proc();
			}
		}
		error = lf_setlock(lock, ap->a_timeout);
		break;

	case F_UNLCK:
		error = lf_clearlock(lock);
		zfree(KT_LOCKF, lock);
		break;

	case F_GETLK:
		error = lf_getlock(lock, fl, -1);
		zfree(KT_LOCKF, lock);
		break;

	case F_GETLKPID:
		error = lf_getlock(lock, fl, fl->l_pid);
		zfree(KT_LOCKF, lock);
		break;

	default:
		zfree(KT_LOCKF, lock);
		error = EINVAL;
		break;
	}
	lck_mtx_unlock(&vp->v_lock);    /* done manipulating the list */

	LOCKF_DEBUG(LF_DBG_TRACE, "lf_advlock: normal exit: %d\n", error);
	return error;
}
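
/*
 * Illustrative userland sketch (not part of this file): how a process
 * reaches lf_advlock() through fcntl(2).  The VFS routes the request via
 * VNOP_ADVLOCK to the filesystem, which typically forwards it here.
 * Error handling is trimmed for brevity.
 *
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	int
 *	lock_first_100_bytes(int fd)
 *	{
 *	        struct flock fl = {
 *	                .l_whence = SEEK_SET,   // interpret l_start from 0
 *	                .l_start  = 0,
 *	                .l_len    = 100,        // bytes [0, 99]
 *	                .l_type   = F_WRLCK,    // exclusive record lock
 *	        };
 *	        // F_SETLKW sleeps in lf_setlock() if another process holds
 *	        // a conflicting lock; F_SETLK returns EAGAIN instead.
 *	        return fcntl(fd, F_SETLKW, &fl);
 *	}
 */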

/*
 * Empty the queue of msleeping requests for a lock on the given vnode.
 * Called with the vnode already locked.  Used for forced unmount, where
 * a flock(2) invoker sleeping on a blocked lock holds an iocount reference
 * that prevents the vnode from ever being drained.  Force unmounting wins.
 */
void
lf_abort_advlocks(vnode_t vp)
{
	struct lockf *lock;

	if ((lock = vp->v_lockf) == NULL) {
		return;
	}

	lck_mtx_assert(&vp->v_lock, LCK_MTX_ASSERT_OWNED);

	if (!TAILQ_EMPTY(&lock->lf_blkhd)) {
		struct lockf *tlock;

		TAILQ_FOREACH(tlock, &lock->lf_blkhd, lf_block) {
			/*
			 * Setting this flag should cause all
			 * currently blocked F_SETLK requests to
			 * return to userland with an errno.
			 */
			tlock->lf_flags |= F_ABORT;
		}
		lf_wakelock(lock, TRUE);
	}
}

/*
 * Take any lock attempts which are currently blocked by a given lock ("from")
 * and mark them as blocked by a different lock ("to").  Used in the case
 * where a byte range currently occupied by "from" is to be occupied by "to."
 */
static void
lf_move_blocked(struct lockf *to, struct lockf *from)
{
	struct lockf *tlock;

	TAILQ_FOREACH(tlock, &from->lf_blkhd, lf_block) {
		tlock->lf_next = to;
	}

	TAILQ_CONCAT(&to->lf_blkhd, &from->lf_blkhd, lf_block);
}

/*
 * lf_coalesce_adjacent
 *
 * Description:	Helper function: when setting a lock, coalesce adjacent
 *		locks.  Needed because adjacent locks are not overlapping,
 *		but POSIX requires that they be coalesced.
 *
 * Parameters:	lock			The new lock which may be adjacent
 *					to already locked regions, and which
 *					should therefore be coalesced with them
 *
 * Returns:	<void>
 */
static void
lf_coalesce_adjacent(struct lockf *lock)
{
	struct lockf **lf = lock->lf_head;

	while (*lf != NOLOCKF) {
		/* reject locks that obviously could not be coalesced */
		if ((*lf == lock) ||
		    ((*lf)->lf_id != lock->lf_id) ||
		    ((*lf)->lf_type != lock->lf_type)) {
			lf = &(*lf)->lf_next;
			continue;
		}

		/*
		 * NOTE: Assumes that if two locks are adjacent on the number line
		 * and belong to the same owner, then they are adjacent on the list.
		 */
		if (LF_END(*lf) < OFF_MAX &&
		    (LF_END(*lf) + 1) == lock->lf_start) {
			struct lockf *adjacent = *lf;

			LOCKF_DEBUG(LF_DBG_LIST, "lf_coalesce_adjacent: coalesce adjacent previous\n");
			lock->lf_start = (*lf)->lf_start;
			*lf = lock;
			lf = &(*lf)->lf_next;

			lf_move_blocked(lock, adjacent);

			zfree(KT_LOCKF, adjacent);
			continue;
		}
		/* If the lock starts adjacent to us, we can coalesce it */
		if (LF_END(lock) < OFF_MAX &&
		    (LF_END(lock) + 1) == (*lf)->lf_start) {
			struct lockf *adjacent = *lf;

			LOCKF_DEBUG(LF_DBG_LIST, "lf_coalesce_adjacent: coalesce adjacent following\n");
			lock->lf_end = (*lf)->lf_end;
			lock->lf_next = (*lf)->lf_next;
			lf = &lock->lf_next;

			lf_move_blocked(lock, adjacent);

			zfree(KT_LOCKF, adjacent);
			continue;
		}

		/* no matching conditions; go on to next lock */
		lf = &(*lf)->lf_next;
	}
}
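
/*
 * Coalescing example (illustrative, not from the original source): if a
 * process holds a write lock on [0, 49] and then write-locks [50, 99],
 * the ranges do not overlap, but LF_END([0, 49]) + 1 == 50, so the two
 * records are merged into a single lock covering [0, 99].  POSIX requires
 * this: a subsequent unlock of [0, 99] must release everything in one go.
 */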

/*
 * lf_setlock
 *
 * Description:	Set a byte-range lock.
 *
 * Parameters:	lock			The lock structure describing the lock
 *					to be set; allocated by the caller, it
 *					will be linked into the lock list if
 *					the set is successful, and freed if the
 *					set is unsuccessful.
 *
 *		timeout			Timeout specified in the case of
 *					SETLKWTIMEOUT.
 *
 * Returns:	0			Success
 *		EAGAIN
 *		EDEADLK
 *	lf_split:ENOLCK
 *	lf_clearlock:ENOLCK
 *	msleep:EINTR
 *	msleep:ETIMEDOUT
 *
 * Notes:	We add the lock to the provisional lock list.  We do not
 *		coalesce at this time; this has implications for other lock
 *		requestors in the blocker search mechanism.
 */
static int
lf_setlock(struct lockf *lock, struct timespec *timeout)
{
	struct lockf *block;
	struct lockf **head = lock->lf_head;
	struct lockf **prev, *overlap;
	static const char lockstr[] = "lockf";
	int priority, needtolink, error;
	struct vnode *vp = lock->lf_vnode;
	overlap_t ovcase;

#ifdef LOCKF_DEBUGGING
	if (LOCKF_DEBUGP(LF_DBG_LOCKOP)) {
		lf_print("lf_setlock", lock);
		lf_printlist("lf_setlock(in)", lock);
	}
#endif /* LOCKF_DEBUGGING */
	LOCKF_DEBUG(LF_DBG_DEADLOCK, "lock %p Looking for deadlock, vnode %p\n", lock, lock->lf_vnode);

	/*
	 * Set the priority
	 */
	priority = PLOCK;
	if (lock->lf_type == F_WRLCK) {
		priority += 4;
	}
	priority |= PCATCH;
scan:
	/*
	 * Scan lock list for this file looking for locks that would block us.
	 */
	while ((block = lf_getblock(lock, -1))) {
		/*
		 * Free the structure and return if nonblocking.
		 */
		if ((lock->lf_flags & F_WAIT) == 0) {
			DTRACE_FSINFO(advlock__nowait, vnode_t, vp);
			zfree(KT_LOCKF, lock);
			return EAGAIN;
		}

		LOCKF_DEBUG(LF_DBG_DEADLOCK, "lock %p found blocking lock %p\n", lock, block);

		/*
		 * We are blocked.  Since flock style locks cover
		 * the whole file, there is no chance for deadlock.
		 *
		 * OFD byte-range locks currently do NOT support
		 * deadlock detection.
		 *
		 * For POSIX byte-range locks we must check for deadlock.
		 *
		 * Deadlock detection is done by looking through the
		 * wait channels to see if there are any cycles that
		 * involve us.
		 */
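		/*
		 * Deadlock example (illustrative, not from the original
		 * source): process P1 holds a lock on bytes [0, 9] and
		 * blocks requesting [10, 19]; process P2 holds [10, 19]
		 * and then requests [0, 9].  P2's request would sleep
		 * behind a lock whose owner (P1) is itself asleep behind
		 * a lock owned by P2 -- a cycle -- so the check below
		 * returns EDEADLK to P2 instead of sleeping forever.
		 */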
		if ((lock->lf_flags & F_POSIX) &&
		    (block->lf_flags & F_POSIX)) {
			lck_mtx_lock(&lf_dead_lock);

			/* The blocked process is waiting on something */
			struct proc *wproc = block->lf_owner;
			proc_lock(wproc);

			LOCKF_DEBUG(LF_DBG_DEADLOCK, "lock %p owned by pid %d\n", lock, proc_pid(wproc));

			struct uthread *ut;
			TAILQ_FOREACH(ut, &wproc->p_uthlist, uu_list) {
				/*
				 * If the thread is (a) asleep (uu_wchan != 0)
				 * and (b) in this code (uu_wmesg == lockstr)
				 * then check to see if the lock is blocked behind
				 * someone blocked behind us.
				 *
				 * Note: (i) vp->v_lock is held, preventing other
				 * threads from mutating the blocking list for our vnode.
				 * and (ii) the proc_lock is held i.e the thread list
				 * is stable.
				 *
				 * HOWEVER some thread in wproc might be sleeping on a lockf
				 * structure for a different vnode, and be woken at any
				 * time.  Thus the waitblock list could mutate while
				 * it's being inspected by this thread, and what
				 * ut->uu_wchan was just pointing at could even be freed.
				 *
				 * Nevertheless this is safe here because of lf_dead_lock; if
				 * any thread blocked with uu_wmesg == lockstr wakes (see below)
				 * it will try to acquire lf_dead_lock which is already held
				 * here.  Holding that lock prevents the lockf structure being
				 * pointed at by ut->uu_wchan from going away.  Thus the vnode
				 * involved can be found and locked, and the corresponding
				 * blocking chain can then be examined safely.
				 */
				const struct lockf *waitblock = (const void *)ut->uu_wchan;
				if ((waitblock != NULL) && (ut->uu_wmesg == lockstr)) {
					LOCKF_DEBUG(LF_DBG_DEADLOCK, "lock %p which is also blocked on lock %p vnode %p\n", lock, waitblock, waitblock->lf_vnode);

					vnode_t othervp = NULL;
					if (waitblock->lf_vnode != vp) {
						/*
						 * This thread in wproc is waiting for a lock
						 * on a different vnode; grab the lock on it
						 * that protects lf_next while we examine it.
						 */
						othervp = waitblock->lf_vnode;
						if (!lck_mtx_try_lock(&othervp->v_lock)) {
							/*
							 * avoid kernel deadlock: drop all
							 * locks, pause for a bit to let the
							 * other thread do what it needs to do,
							 * then (because we drop and retake
							 * v_lock) retry the scan.
							 */
							proc_unlock(wproc);
							lck_mtx_unlock(&lf_dead_lock);
							static struct timespec ts = {
								.tv_sec = 0,
								.tv_nsec = 2 * NSEC_PER_MSEC,
							};
							static const char pausestr[] = "lockf:pause";
							(void) msleep(lock, &vp->v_lock, priority, pausestr, &ts);
							LOCKF_DEBUG(LF_DBG_DEADLOCK, "lock %p contention for vp %p => restart\n", lock, othervp);
							goto scan;
						}
					}

					/*
					 * Get the lock blocking the lock
					 * which would block us, and make
					 * certain it hasn't become unblocked
					 * (been granted, e.g. between the time
					 * we called lf_getblock, and the time
					 * we successfully acquired the
					 * proc_lock).
					 */
					const struct lockf *nextblock = waitblock->lf_next;
					if (nextblock == NULL) {
						if (othervp) {
							lck_mtx_unlock(&othervp->v_lock);
						}
						LOCKF_DEBUG(LF_DBG_DEADLOCK, "lock %p with waitblock %p and no lf_next; othervp %p\n", lock, waitblock, othervp);
						continue;
					}
					LOCKF_DEBUG(LF_DBG_DEADLOCK, "lock %p which is also blocked on lock %p vnode %p\n", lock, nextblock, nextblock->lf_vnode);

					/*
					 * Make sure it's an advisory range
					 * lock and not any other kind of lock;
					 * if we mix lock types, it's our own
					 * fault.
					 */
					if ((nextblock->lf_flags & F_POSIX) == 0) {
						if (othervp) {
							lck_mtx_unlock(&othervp->v_lock);
						}
						continue;
					}

					/*
					 * If the owner of the lock that's
					 * blocking a lock that's blocking us
					 * getting the requested lock, then we
					 * would deadlock, so error out.
					 */
					struct proc *bproc = nextblock->lf_owner;
					const boolean_t deadlocked = bproc == lock->lf_owner;

					if (othervp) {
						lck_mtx_unlock(&othervp->v_lock);
					}
					LOCKF_DEBUG(LF_DBG_DEADLOCK, "lock %p owned by pid %d\n", lock, proc_pid(bproc));
					if (deadlocked) {
						LOCKF_DEBUG(LF_DBG_DEADLOCK, "lock %p which is me, so EDEADLK\n", lock);
						proc_unlock(wproc);
						lck_mtx_unlock(&lf_dead_lock);
						zfree(KT_LOCKF, lock);
						return EDEADLK;
					}
				}
				LOCKF_DEBUG(LF_DBG_DEADLOCK, "lock %p bottom of thread loop\n", lock);
			}
			proc_unlock(wproc);
			lck_mtx_unlock(&lf_dead_lock);
		}

		/*
		 * For flock type locks, we must first remove
		 * any shared locks that we hold before we sleep
		 * waiting for an exclusive lock.
		 */
		if ((lock->lf_flags & F_FLOCK) &&
		    lock->lf_type == F_WRLCK) {
			lock->lf_type = F_UNLCK;
			if ((error = lf_clearlock(lock)) != 0) {
				zfree(KT_LOCKF, lock);
				return error;
			}
			lock->lf_type = F_WRLCK;
		}
		/*
		 * Add our lock to the blocked list and sleep until we're free.
		 * Remember who blocked us (for deadlock detection).
		 */
		lock->lf_next = block;
		TAILQ_INSERT_TAIL(&block->lf_blkhd, lock, lf_block);

		if (!(lock->lf_flags & F_FLOCK)) {
			block->lf_flags &= ~F_WAKE1_SAFE;
		}

#if IMPORTANCE_INHERITANCE
		/*
		 * Importance donation is done only for cases where the
		 * owning task can be unambiguously determined.
		 *
		 * POSIX type locks are not inherited by child processes;
		 * we maintain a 1:1 mapping between a lock and its owning
		 * process.
		 *
		 * Flock type locks are inherited across fork() and there is
		 * no 1:1 mapping in the general case.  However, the fileglobs
		 * used by OFD locks *may* be confined to the process that
		 * created them, and thus have an "owner", in which case
		 * we also attempt importance donation.
		 */
		if ((lock->lf_flags & block->lf_flags & F_POSIX) != 0) {
			lf_boost_blocking_proc(lock, block);
		} else if ((lock->lf_flags & block->lf_flags & F_OFD_LOCK) &&
		    lock->lf_owner != block->lf_owner &&
		    NULL != lock->lf_owner && NULL != block->lf_owner) {
			lf_boost_blocking_proc(lock, block);
		}
#endif /* IMPORTANCE_INHERITANCE */

#ifdef LOCKF_DEBUGGING
		if (LOCKF_DEBUGP(LF_DBG_LOCKOP)) {
			lf_print("lf_setlock: blocking on", block);
			lf_printlist("lf_setlock(block)", block);
		}
#endif /* LOCKF_DEBUGGING */
		DTRACE_FSINFO(advlock__wait, vnode_t, vp);

		if (lock->lf_flags & F_POSIX) {
			error = msleep(lock, &vp->v_lock, priority, lockstr, timeout);
			/*
			 * Ensure that 'lock' doesn't get mutated or freed if a
			 * wakeup occurs while hunting for deadlocks (and holding
			 * lf_dead_lock - see above)
			 */
			lck_mtx_lock(&lf_dead_lock);
			lck_mtx_unlock(&lf_dead_lock);
		} else {
			static const char lockstr_np[] = "lockf:np";
			error = msleep(lock, &vp->v_lock, priority, lockstr_np, timeout);
		}

		if (error == 0 && (lock->lf_flags & F_ABORT) != 0) {
			error = EBADF;
		}

		if (lock->lf_next) {
			/*
			 * lf_wakelock() always sets wakelock->lf_next to
			 * NULL before a wakeup; so we've been woken early
			 * - perhaps by a debugger, signal or other event.
			 *
			 * Remove 'lock' from the block list (avoids double-add
			 * in the spurious case, which would create a cycle)
			 */
			TAILQ_REMOVE(&lock->lf_next->lf_blkhd, lock, lf_block);
#if IMPORTANCE_INHERITANCE
			/*
			 * Adjust the boost on lf_next.
			 */
			lf_adjust_assertion(lock->lf_next);
#endif /* IMPORTANCE_INHERITANCE */
			lock->lf_next = NULL;

			if (error == 0) {
				/*
				 * If this was a spurious wakeup, retry
				 */
				printf("%s: spurious wakeup, retrying lock\n",
				    __func__);
				continue;
			}
		}

		if (!TAILQ_EMPTY(&lock->lf_blkhd)) {
			if ((block = lf_getblock(lock, -1)) != NULL) {
				lf_move_blocked(block, lock);
			}
		}

		if (error) {
			if (!TAILQ_EMPTY(&lock->lf_blkhd)) {
				lf_wakelock(lock, TRUE);
			}
			zfree(KT_LOCKF, lock);
			/* Return ETIMEDOUT if timeout occurred. */
			if (error == EWOULDBLOCK) {
				error = ETIMEDOUT;
			}
			return error;
		}
	}

	/*
	 * No blocks!!  Add the lock.  Note that we will
	 * downgrade or upgrade any overlapping locks this
	 * process already owns.
	 *
	 * Skip over locks owned by other processes.
	 * Handle any locks that overlap and are owned by ourselves.
	 */
	prev = head;
	block = *head;
	needtolink = 1;
	for (;;) {
		const off_t lkend = LF_END(lock);
		ovcase = lf_findoverlap(block, lock, SELF, &prev, &overlap);
		if (ovcase) {
			block = overlap->lf_next;
		}
		/*
		 * Six cases:
		 *	0) no overlap
		 *	1) overlap == lock
		 *	2) overlap contains lock
		 *	3) lock contains overlap
		 *	4) overlap starts before lock
		 *	5) overlap ends after lock
		 */
		switch (ovcase) {
		case OVERLAP_NONE:
			if (needtolink) {
				*prev = lock;
				lock->lf_next = overlap;
			}
			break;

		case OVERLAP_EQUALS_LOCK:
			/*
			 * If downgrading lock, others may be
			 * able to acquire it.
			 */
			if (lock->lf_type == F_RDLCK &&
			    overlap->lf_type == F_WRLCK) {
				lf_wakelock(overlap, TRUE);
			}
			overlap->lf_type = lock->lf_type;
			lf_move_blocked(overlap, lock);
			zfree(KT_LOCKF, lock);
			lock = overlap; /* for lf_coalesce_adjacent() */
			break;

		case OVERLAP_CONTAINS_LOCK:
			/*
			 * Check for common starting point and different types.
			 */
			if (overlap->lf_type == lock->lf_type) {
				lf_move_blocked(overlap, lock);
				zfree(KT_LOCKF, lock);
				lock = overlap; /* for lf_coalesce_adjacent() */
				break;
			}
			if (overlap->lf_start == lock->lf_start) {
				*prev = lock;
				lock->lf_next = overlap;
				assert(lkend < OFF_MAX);
				overlap->lf_start = lkend + 1;
			} else {
				/*
				 * If we can't split the lock, we can't
				 * grant it.  Claim a system limit for the
				 * resource shortage.
				 */
				if (lf_split(overlap, lock)) {
					zfree(KT_LOCKF, lock);
					return ENOLCK;
				}
			}
			lf_wakelock(overlap, TRUE);
			break;

		case OVERLAP_CONTAINED_BY_LOCK:
			/*
			 * If downgrading lock, others may be able to
			 * acquire it, otherwise take the list.
			 */
			if (lock->lf_type == F_RDLCK &&
			    overlap->lf_type == F_WRLCK) {
				lf_wakelock(overlap, TRUE);
			} else {
				lf_move_blocked(lock, overlap);
			}
			/*
			 * Add the new lock if necessary and delete the overlap.
			 */
			if (needtolink) {
				*prev = lock;
				lock->lf_next = overlap->lf_next;
				prev = &lock->lf_next;
				needtolink = 0;
			} else {
				*prev = overlap->lf_next;
			}
			zfree(KT_LOCKF, overlap);
			continue;

		case OVERLAP_STARTS_BEFORE_LOCK:
			/*
			 * Add lock after overlap on the list.
			 */
			lock->lf_next = overlap->lf_next;
			overlap->lf_next = lock;
			assert(lock->lf_start > 0);
			overlap->lf_end = lock->lf_start - 1;
			prev = &lock->lf_next;
			lf_wakelock(overlap, TRUE);
			needtolink = 0;
			continue;

		case OVERLAP_ENDS_AFTER_LOCK:
			/*
			 * Add the new lock before overlap.
			 */
			if (needtolink) {
				*prev = lock;
				lock->lf_next = overlap;
			}
			assert(lkend < OFF_MAX);
			overlap->lf_start = lkend + 1;
			lf_wakelock(overlap, TRUE);
			break;
		}
		break;
	}
	/* Coalesce adjacent locks with identical attributes */
	lf_coalesce_adjacent(lock);
#ifdef LOCKF_DEBUGGING
	if (LOCKF_DEBUGP(LF_DBG_LOCKOP)) {
		lf_print("lf_setlock: got the lock", lock);
		lf_printlist("lf_setlock(out)", lock);
	}
#endif /* LOCKF_DEBUGGING */
	return 0;
}


/*
 * lf_clearlock
 *
 * Description:	Remove a byte-range lock on a vnode.  Generally, find the
 *		lock (or an overlap to that lock) and remove it (or shrink
 *		it), then wakeup anyone we can.
 *
 * Parameters:	unlock			The lock to clear
 *
 * Returns:	0			Success
 *	lf_split:ENOLCK
 *
 * Notes:	A caller may unlock all the locks owned by the caller by
 *		specifying the entire file range; locks owned by other
 *		callers are not affected by this operation.
 */
static int
lf_clearlock(struct lockf *unlock)
{
	struct lockf **head = unlock->lf_head;
	struct lockf *lf = *head;
	struct lockf *overlap, **prev;
	overlap_t ovcase;

	if (lf == NOLOCKF) {
		return 0;
	}
#ifdef LOCKF_DEBUGGING
	if (unlock->lf_type != F_UNLCK) {
		panic("lf_clearlock: bad type");
	}
	if (LOCKF_DEBUGP(LF_DBG_LOCKOP)) {
		lf_print("lf_clearlock", unlock);
	}
#endif /* LOCKF_DEBUGGING */
	prev = head;
	while ((ovcase = lf_findoverlap(lf, unlock, SELF, &prev, &overlap)) != OVERLAP_NONE) {
		const off_t unlkend = LF_END(unlock);
		/*
		 * Wakeup the list of locks to be retried.
		 */
		lf_wakelock(overlap, FALSE);
#if IMPORTANCE_INHERITANCE
		if (overlap->lf_boosted == LF_BOOSTED) {
			lf_drop_assertion(overlap);
		}
#endif /* IMPORTANCE_INHERITANCE */

		switch (ovcase) {
		case OVERLAP_NONE:      /* satisfy compiler enum/switch */
			break;

		case OVERLAP_EQUALS_LOCK:
			*prev = overlap->lf_next;
			zfree(KT_LOCKF, overlap);
			break;

		case OVERLAP_CONTAINS_LOCK: /* split it */
			if (overlap->lf_start == unlock->lf_start) {
				assert(unlkend < OFF_MAX);
				overlap->lf_start = unlkend + 1;
				break;
			}
			/*
			 * If we can't split the lock, we can't grant it.
			 * Claim a system limit for the resource shortage.
			 */
			if (lf_split(overlap, unlock)) {
				return ENOLCK;
			}
			overlap->lf_next = unlock->lf_next;
			break;

		case OVERLAP_CONTAINED_BY_LOCK:
			*prev = overlap->lf_next;
			lf = overlap->lf_next;
			zfree(KT_LOCKF, overlap);
			continue;

		case OVERLAP_STARTS_BEFORE_LOCK:
			assert(unlock->lf_start > 0);
			overlap->lf_end = unlock->lf_start - 1;
			prev = &overlap->lf_next;
			lf = overlap->lf_next;
			continue;

		case OVERLAP_ENDS_AFTER_LOCK:
			assert(unlkend < OFF_MAX);
			overlap->lf_start = unlkend + 1;
			break;
		}
		break;
	}
#ifdef LOCKF_DEBUGGING
	if (LOCKF_DEBUGP(LF_DBG_LOCKOP)) {
		lf_printlist("lf_clearlock", unlock);
	}
#endif /* LOCKF_DEBUGGING */
	return 0;
}
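
/*
 * Illustrative userland sketch (not part of this file): releasing every
 * record lock a process holds on a file, which drives lf_clearlock()
 * over the whole range.
 *
 *	struct flock fl = {
 *	        .l_whence = SEEK_SET,
 *	        .l_start  = 0,
 *	        .l_len    = 0,          // 0 == "to end of file"
 *	        .l_type   = F_UNLCK,
 *	};
 *	(void) fcntl(fd, F_SETLK, &fl); // shrinks/splits/frees our locks only
 */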


/*
 * lf_getlock
 *
 * Description:	Check whether there is a blocking lock, and if so return
 *		its process identifier into the lock being requested.
 *
 * Parameters:	lock			Pointer to lock to test for blocks
 *		fl			Pointer to flock structure to receive
 *					the blocking lock information, if a
 *					blocking lock is found.
 *		matchpid		-1, or pid value to match in lookup.
 *
 * Returns:	0			Success
 *
 * Implicit Returns:
 *		*fl			Contents modified to reflect the
 *					blocking lock, if one is found; not
 *					modified otherwise
 *
 * Notes:	fl->l_pid will be (-1) for file locks and will only be set to
 *		the blocking process ID for advisory record locks.
 */
static int
lf_getlock(struct lockf *lock, struct flock *fl, pid_t matchpid)
{
	struct lockf *block;

#ifdef LOCKF_DEBUGGING
	if (LOCKF_DEBUGP(LF_DBG_LOCKOP)) {
		lf_print("lf_getlock", lock);
	}
#endif /* LOCKF_DEBUGGING */

	if ((block = lf_getblock(lock, matchpid))) {
		fl->l_type = block->lf_type;
		fl->l_whence = SEEK_SET;
		fl->l_start = block->lf_start;
		if (block->lf_end == -1 ||
		    (block->lf_start == 0 && LF_END(block) == OFF_MAX)) {
			fl->l_len = 0;
		} else {
			fl->l_len = LF_END(block) - block->lf_start + 1;
		}
		if (NULL != block->lf_owner) {
			/*
			 * lf_owner is only non-NULL when the lock
			 * "owner" can be unambiguously determined
			 */
			fl->l_pid = proc_pid(block->lf_owner);
		} else {
			fl->l_pid = -1;
		}
	} else {
		fl->l_type = F_UNLCK;
	}
	return 0;
}
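
/*
 * Illustrative userland sketch (not part of this file): probing for a
 * conflicting lock with F_GETLK, which lands in lf_getlock() above.
 *
 *	struct flock fl = {
 *	        .l_whence = SEEK_SET,
 *	        .l_start  = 0,
 *	        .l_len    = 100,
 *	        .l_type   = F_WRLCK,    // "could I take this lock?"
 *	};
 *	if (fcntl(fd, F_GETLK, &fl) == 0 && fl.l_type != F_UNLCK) {
 *	        // fl now describes the first blocking lock; fl.l_pid is
 *	        // the owner for record locks, or -1 for flock-style locks.
 *	}
 */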

/*
 * lf_getblock
 *
 * Description:	Walk the list of locks for an inode and return the first
 *		blocking lock.  A lock is considered blocking if we are not
 *		the lock owner; otherwise, we are permitted to upgrade or
 *		downgrade it, and it's not considered blocking.
 *
 * Parameters:	lock			The lock for which we are interested
 *					in obtaining the blocking lock, if any
 *		matchpid		-1, or pid value to match in lookup.
 *
 * Returns:	NOLOCKF			No blocking lock exists
 *		!NOLOCKF		The address of the blocking lock's
 *					struct lockf.
 */
static struct lockf *
lf_getblock(struct lockf *lock, pid_t matchpid)
{
	struct lockf **prev, *overlap, *lf = *(lock->lf_head);

	for (prev = lock->lf_head;
	    lf_findoverlap(lf, lock, OTHERS, &prev, &overlap) != OVERLAP_NONE;
	    lf = overlap->lf_next) {
		/*
		 * Found an overlap.
		 *
		 * If we're matching pids, and it's a record lock,
		 * or it's an OFD lock on a process-confined fd,
		 * but the pid doesn't match, then keep on looking ..
		 */
		if (matchpid != -1 &&
		    (overlap->lf_flags & (F_POSIX | F_OFD_LOCK)) != 0 &&
		    proc_pid(overlap->lf_owner) != matchpid) {
			continue;
		}

		/*
		 * does it block us?
		 */
		if ((lock->lf_type == F_WRLCK || overlap->lf_type == F_WRLCK)) {
			return overlap;
		}
	}
	return NOLOCKF;
}


/*
 * lf_findoverlap
 *
 * Description:	Walk the list of locks to find an overlapping lock (if any).
 *
 * Parameters:	lf			First lock on lock list
 *		lock			The lock we are checking for an overlap
 *		type			Check type (SELF or OTHERS)
 *		prev			pointer to a pointer to the lf_next
 *					field of the lock previous to the
 *					overlapping lock, if an overlap is found
 *		overlap			pointer to pointer to contain address
 *					of overlapping lock
 *
 * Returns:	OVERLAP_NONE
 *		OVERLAP_EQUALS_LOCK
 *		OVERLAP_CONTAINS_LOCK
 *		OVERLAP_CONTAINED_BY_LOCK
 *		OVERLAP_STARTS_BEFORE_LOCK
 *		OVERLAP_ENDS_AFTER_LOCK
 *
 * Implicit Returns:
 *		*prev			The address of the next pointer in the
 *					lock previous to the overlapping lock;
 *					this is generally used to relink the
 *					lock list, avoiding a second iteration.
 *		*overlap		The pointer to the overlapping lock
 *					itself; this is used to return data in
 *					the check == OTHERS case, and for the
 *					caller to modify the overlapping lock,
 *					in the check == SELF case
 *
 * Note:	This returns only the FIRST overlapping lock.  There may be
 *		more than one.  lf_getlock will return the first blocking lock,
 *		while lf_setlock will iterate over all overlapping locks to
 *		handle each of the overlap cases in turn.
 *
 *		The type parameter can be SELF, meaning we are looking for
 *		overlapping locks owned by us, or it can be OTHERS, meaning
 *		we are looking for overlapping locks owned by someone else so
 *		we can report a blocking lock on an F_GETLK request.
 *
 *		The value of *overlap and *prev are modified, even if there is
 *		no overlapping lock found; always check the return code.
 */
static overlap_t
lf_findoverlap(struct lockf *lf, struct lockf *lock, int type,
    struct lockf ***prev, struct lockf **overlap)
{
	int found_self = 0;

	*overlap = lf;
	if (lf == NOLOCKF) {
		return OVERLAP_NONE;
	}
#ifdef LOCKF_DEBUGGING
	if (LOCKF_DEBUGP(LF_DBG_LIST)) {
		lf_print("lf_findoverlap: looking for overlap in", lock);
	}
#endif /* LOCKF_DEBUGGING */
	const off_t start = lock->lf_start;
	const off_t end = LF_END(lock);
	while (lf != NOLOCKF) {
		if (((type & SELF) && lf->lf_id != lock->lf_id) ||
		    ((type & OTHERS) && lf->lf_id == lock->lf_id)) {
			/*
			 * Locks belonging to one process are adjacent on the
			 * list, so if we've found any locks belonging to us,
			 * and we're now seeing something else, then we've
			 * examined all "self" locks.  Note that bailing out
			 * here is quite important; for coalescing, we assume
			 * numerically adjacent locks from the same owner to
			 * be adjacent on the list.
			 */
			if ((type & SELF) && found_self) {
				return OVERLAP_NONE;
			}

			*prev = &lf->lf_next;
			*overlap = lf = lf->lf_next;
			continue;
		}

		if ((type & SELF)) {
			found_self = 1;
		}

#ifdef LOCKF_DEBUGGING
		if (LOCKF_DEBUGP(LF_DBG_LIST)) {
			lf_print("\tchecking", lf);
		}
#endif /* LOCKF_DEBUGGING */
		/*
		 * OK, check for overlap
		 */
		const off_t lfstart = lf->lf_start;
		const off_t lfend = LF_END(lf);

		if ((start > lfend) || (lfstart > end)) {
			/* Case 0 */
			LOCKF_DEBUG(LF_DBG_LIST, "no overlap\n");

			/*
			 * NOTE: assumes that locks for the same process are
			 * nonintersecting and ordered.
			 */
			if ((type & SELF) && lfstart > end) {
				return OVERLAP_NONE;
			}
			*prev = &lf->lf_next;
			*overlap = lf = lf->lf_next;
			continue;
		}
		if ((lfstart == start) && (lfend == end)) {
			LOCKF_DEBUG(LF_DBG_LIST, "overlap == lock\n");
			return OVERLAP_EQUALS_LOCK;
		}
		if ((lfstart <= start) && (lfend >= end)) {
			LOCKF_DEBUG(LF_DBG_LIST, "overlap contains lock\n");
			return OVERLAP_CONTAINS_LOCK;
		}
		if ((start <= lfstart) && (end >= lfend)) {
			LOCKF_DEBUG(LF_DBG_LIST, "lock contains overlap\n");
			return OVERLAP_CONTAINED_BY_LOCK;
		}
		if ((lfstart < start) && (lfend >= start)) {
			LOCKF_DEBUG(LF_DBG_LIST, "overlap starts before lock\n");
			return OVERLAP_STARTS_BEFORE_LOCK;
		}
		if ((lfstart > start) && (lfend > end)) {
			LOCKF_DEBUG(LF_DBG_LIST, "overlap ends after lock\n");
			return OVERLAP_ENDS_AFTER_LOCK;
		}
		panic("lf_findoverlap: default");
	}
	return OVERLAP_NONE;
}
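
/*
 * Caller pattern (illustrative, not from the original source): both
 * lf_getblock() and lf_setlock() iterate with the same idiom, resuming
 * the walk at overlap->lf_next and letting *prev track the list cell to
 * rewrite if the caller relinks:
 *
 *	struct lockf **prev = lock->lf_head;
 *	struct lockf *overlap, *lf = *(lock->lf_head);
 *
 *	while (lf_findoverlap(lf, lock, SELF, &prev, &overlap) !=
 *	    OVERLAP_NONE) {
 *	        // ... handle one of the five overlap cases ...
 *	        lf = overlap->lf_next;  // resume scan past this overlap
 *	}
 */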


/*
 * lf_split
 *
 * Description:	Split a lock and a contained region into two or three locks
 *		as necessary.
 *
 * Parameters:	lock1			Lock to split
 *		lock2			Overlapping lock region requiring the
 *					split (upgrade/downgrade/unlock)
 *
 * Returns:	0			Success
 *		ENOLCK			No memory for new lock
 *
 * Implicit Returns:
 *		*lock1			Modified original lock
 *		*lock2			Overlapping lock (inserted into list)
 *		(new lock)		Potential new lock inserted into list
 *					if split results in 3 locks
 *
 * Notes:	This operation can only fail if the split would result in three
 *		locks, and there is insufficient memory to allocate the third
 *		lock; in that case, neither of the locks will be modified.
 */
static int
lf_split(struct lockf *lock1, struct lockf *lock2)
{
	struct lockf *splitlock;

#ifdef LOCKF_DEBUGGING
	if (LOCKF_DEBUGP(LF_DBG_LIST)) {
		lf_print("lf_split", lock1);
		lf_print("splitting from", lock2);
	}
#endif /* LOCKF_DEBUGGING */
	/*
	 * Check to see if splitting into only two pieces.
	 */
	if (lock1->lf_start == lock2->lf_start) {
		assert(LF_END(lock2) < OFF_MAX);
		lock1->lf_start = LF_END(lock2) + 1;
		lock2->lf_next = lock1;
		return 0;
	}
	if (LF_END(lock1) == LF_END(lock2)) {
		assert(lock2->lf_start > 0);
		lock1->lf_end = lock2->lf_start - 1;
		lock2->lf_next = lock1->lf_next;
		lock1->lf_next = lock2;
		return 0;
	}
	/*
	 * Make a new lock consisting of the last part of
	 * the encompassing lock
	 */
	splitlock = zalloc_flags(KT_LOCKF, Z_WAITOK | Z_NOFAIL);
	bcopy(lock1, splitlock, sizeof *splitlock);
	assert(LF_END(lock2) < OFF_MAX);
	splitlock->lf_start = LF_END(lock2) + 1;
	TAILQ_INIT(&splitlock->lf_blkhd);
	assert(lock2->lf_start > 0);
	lock1->lf_end = lock2->lf_start - 1;
	/*
	 * OK, now link it in
	 */
	splitlock->lf_next = lock1->lf_next;
	lock2->lf_next = splitlock;
	lock1->lf_next = lock2;

	return 0;
}
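
/*
 * Split example (illustrative, not from the original source): a process
 * holds a write lock on [0, 99] and unlocks [40, 59].  lf_split() is
 * called with lock1 = [0, 99] and lock2 = [40, 59]:
 *
 *	before:   lock1 [0, 99]
 *	after:    lock1 [0, 39], new splitlock [60, 99]
 *
 * lock2 is threaded between them on the list; lf_clearlock() then unlinks
 * it, leaving the two surviving fragments.
 */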


/*
 * lf_wakelock
 *
 * Wakeup a blocklist in the case of a downgrade or unlock, since others
 * waiting on the lock may now be able to acquire it.
 *
 * Parameters:	listhead		Lock list head on which waiters may
 *					have pending locks
 *
 * Returns:	<void>
 *
 * Notes:	This function iterates a list of locks and wakes all waiters,
 *		rather than only waiters for the contended regions.  Because
 *		of this, for heavily contended files, this can result in a
 *		"thundering herd" situation.  Refactoring the code could make
 *		this operation more efficient, if heavy contention ever results
 *		in a real-world performance problem.
 */
static void
lf_wakelock(struct lockf *listhead, boolean_t force_all)
{
	struct lockf *wakelock;
	boolean_t wake_all = TRUE;

	if (force_all == FALSE && (listhead->lf_flags & F_WAKE1_SAFE)) {
		wake_all = FALSE;
	}

	while (!TAILQ_EMPTY(&listhead->lf_blkhd)) {
		wakelock = TAILQ_FIRST(&listhead->lf_blkhd);
		TAILQ_REMOVE(&listhead->lf_blkhd, wakelock, lf_block);

		wakelock->lf_next = NOLOCKF;
#ifdef LOCKF_DEBUGGING
		if (LOCKF_DEBUGP(LF_DBG_LOCKOP)) {
			lf_print("lf_wakelock: awakening", wakelock);
		}
#endif /* LOCKF_DEBUGGING */
		if (wake_all == FALSE) {
			/*
			 * If there are items on the list head block list,
			 * move them to the wakelock list instead, and then
			 * correct their lf_next pointers.
			 */
			if (!TAILQ_EMPTY(&listhead->lf_blkhd)) {
				TAILQ_CONCAT(&wakelock->lf_blkhd, &listhead->lf_blkhd, lf_block);

				struct lockf *tlock;

				TAILQ_FOREACH(tlock, &wakelock->lf_blkhd, lf_block) {
					if (TAILQ_NEXT(tlock, lf_block) == tlock) {
						/* See rdar://10887303 */
						panic("cycle in wakelock list");
					}
					tlock->lf_next = wakelock;
				}
			}
		}
		wakeup(wakelock);

		if (wake_all == FALSE) {
			break;
		}
	}
}
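
/*
 * Wake-one example (illustrative, not from the original source): three
 * flock(2) waiters W1, W2, W3 queue behind an exclusive lock L that has
 * F_WAKE1_SAFE set.  On unlock, lf_wakelock(L, FALSE) wakes only W1 and
 * re-parents W2 and W3 onto W1's block list; since flock locks cover the
 * whole file, if W1 cannot proceed neither could the others, so waking
 * them all would be wasted work.  lf_wakelock(L, TRUE) wakes all three.
 */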


#ifdef LOCKF_DEBUGGING
#define GET_LF_OWNER_PID(lf)    (proc_pid((lf)->lf_owner))

/*
 * lf_print DEBUG
 *
 * Print out a lock; lock information is prefixed by the string in 'tag'
 *
 * Parameters:	tag			A string tag for debugging
 *		lock			The lock whose information should be
 *					displayed
 *
 * Returns:	<void>
 */
void
lf_print(const char *tag, struct lockf *lock)
{
	printf("%s: lock %p for ", tag, (void *)lock);
	if (lock->lf_flags & F_POSIX) {
		printf("proc %p (owner %d)",
		    lock->lf_id, GET_LF_OWNER_PID(lock));
	} else if (lock->lf_flags & F_OFD_LOCK) {
		printf("fg %p (owner %d)",
		    lock->lf_id, GET_LF_OWNER_PID(lock));
	} else {
		printf("id %p", (void *)lock->lf_id);
	}
	if (lock->lf_vnode != 0) {
		printf(" in vno %p, %s, start 0x%016llx, end 0x%016llx",
		    lock->lf_vnode,
		    lock->lf_type == F_RDLCK ? "shared" :
		    lock->lf_type == F_WRLCK ? "exclusive" :
		    lock->lf_type == F_UNLCK ? "unlock" : "unknown",
		    (uint64_t)lock->lf_start, (uint64_t)lock->lf_end);
	} else {
		printf(" %s, start 0x%016llx, end 0x%016llx",
		    lock->lf_type == F_RDLCK ? "shared" :
		    lock->lf_type == F_WRLCK ? "exclusive" :
		    lock->lf_type == F_UNLCK ? "unlock" : "unknown",
		    (uint64_t)lock->lf_start, (uint64_t)lock->lf_end);
	}
	if (!TAILQ_EMPTY(&lock->lf_blkhd)) {
		printf(" block %p\n", (void *)TAILQ_FIRST(&lock->lf_blkhd));
	} else {
		printf("\n");
	}
}


/*
 * lf_printlist DEBUG
 *
 * Print out a lock list for the vnode associated with 'lock'; lock information
 * is prefixed by the string in 'tag'
 *
 * Parameters:	tag			A string tag for debugging
 *		lock			The lock whose vnode's lock list should
 *					be displayed
 *
 * Returns:	<void>
 */
void
lf_printlist(const char *tag, struct lockf *lock)
{
	struct lockf *lf, *blk;

	if (lock->lf_vnode == 0) {
		return;
	}

	printf("%s: Lock list for vno %p:\n",
	    tag, lock->lf_vnode);
	for (lf = lock->lf_vnode->v_lockf; lf; lf = lf->lf_next) {
		printf("\tlock %p for ", (void *)lf);
		if (lf->lf_flags & F_POSIX) {
			printf("proc %p (owner %d)",
			    lf->lf_id, GET_LF_OWNER_PID(lf));
		} else if (lf->lf_flags & F_OFD_LOCK) {
			printf("fg %p (owner %d)",
			    lf->lf_id, GET_LF_OWNER_PID(lf));
		} else {
			printf("id %p", (void *)lf->lf_id);
		}
		printf(", %s, start 0x%016llx, end 0x%016llx",
		    lf->lf_type == F_RDLCK ? "shared" :
		    lf->lf_type == F_WRLCK ? "exclusive" :
		    lf->lf_type == F_UNLCK ? "unlock" :
		    "unknown", (uint64_t)lf->lf_start, (uint64_t)lf->lf_end);
		TAILQ_FOREACH(blk, &lf->lf_blkhd, lf_block) {
			printf("\n\t\tlock request %p for ", (void *)blk);
			if (blk->lf_flags & F_POSIX) {
				printf("proc %p (owner %d)",
				    blk->lf_id, GET_LF_OWNER_PID(blk));
			} else if (blk->lf_flags & F_OFD_LOCK) {
				printf("fg %p (owner %d)",
				    blk->lf_id, GET_LF_OWNER_PID(blk));
			} else {
				printf("id %p", (void *)blk->lf_id);
			}
			printf(", %s, start 0x%016llx, end 0x%016llx",
			    blk->lf_type == F_RDLCK ? "shared" :
			    blk->lf_type == F_WRLCK ? "exclusive" :
			    blk->lf_type == F_UNLCK ? "unlock" :
			    "unknown", (uint64_t)blk->lf_start,
			    (uint64_t)blk->lf_end);
			if (!TAILQ_EMPTY(&blk->lf_blkhd)) {
				panic("lf_printlist: bad list");
			}
		}
		printf("\n");
	}
}
#endif /* LOCKF_DEBUGGING */

#if IMPORTANCE_INHERITANCE

/*
 * lf_hold_assertion
 *
 * Call task importance hold assertion on the owner of the lock.
 *
 * Parameters:	block_task		Owner of the lock blocking
 *					current thread.
 *
 *		block			lock on which the current thread
 *					is blocked.
 *
 * Returns:	<void>
 *
 * Notes:	There is no need to hold a task reference on block_task,
 *		since the current thread holds the vnode lock and block_task
 *		holds a file lock; removing the file lock at exit requires
 *		block_task to grab the vnode lock.
 */
static void
lf_hold_assertion(task_t block_task, struct lockf *block)
{
	if (task_importance_hold_file_lock_assertion(block_task, 1) == 0) {
		block->lf_boosted = LF_BOOSTED;
		LOCKF_DEBUG(LF_DBG_IMPINH,
		    "lf: importance hold file lock assert on pid %d lock %p\n",
		    proc_pid(block->lf_owner), block);
	}
}


/*
 * lf_jump_to_queue_head
 *
 * Jump the lock from the tail of the block queue to the head of
 * the queue.
 *
 * Parameters:	block			lockf struct containing the
 *					block queue.
 *		lock			lockf struct to be jumped to the
 *					front.
 *
 * Returns:	<void>
 */
static void
lf_jump_to_queue_head(struct lockf *block, struct lockf *lock)
{
	/* Move the lock to the head of the block queue. */
	TAILQ_REMOVE(&block->lf_blkhd, lock, lf_block);
	TAILQ_INSERT_HEAD(&block->lf_blkhd, lock, lf_block);
}


/*
 * lf_drop_assertion
 *
 * Drops the task hold assertion.
 *
 * Parameters:	block			lockf struct holding the assertion.
 *
 * Returns:	<void>
 */
static void
lf_drop_assertion(struct lockf *block)
{
	LOCKF_DEBUG(LF_DBG_IMPINH, "lf: %d: dropping assertion for lock %p\n",
	    proc_pid(block->lf_owner), block);

	task_t current_task = proc_task(block->lf_owner);
	task_importance_drop_file_lock_assertion(current_task, 1);
	block->lf_boosted = LF_NOT_BOOSTED;
}

/*
 * lf_adjust_assertion
 *
 * Adjusts the importance assertion of a file lock.  Goes through
 * all the blocking locks and checks whether the file lock still
 * needs to be boosted.
 *
 * Parameters:	block			lockf structure which needs to be adjusted.
 *
 * Returns:	<void>
 */
static void
lf_adjust_assertion(struct lockf *block)
{
	boolean_t drop_boost = TRUE;
	struct lockf *next;

	/* Return if the lock is not boosted */
	if (block->lf_boosted == LF_NOT_BOOSTED) {
		return;
	}

	TAILQ_FOREACH(next, &block->lf_blkhd, lf_block) {
		/* Check if block and next are same type of locks */
		if (((block->lf_flags & next->lf_flags & F_POSIX) != 0) ||
		    ((block->lf_flags & next->lf_flags & F_OFD_LOCK) &&
		    (block->lf_owner != next->lf_owner) &&
		    (NULL != block->lf_owner && NULL != next->lf_owner))) {
			/* Check if next would be boosting block */
			if (task_is_importance_donor(proc_task(next->lf_owner)) &&
			    task_is_importance_receiver_type(proc_task(block->lf_owner))) {
				/* Found a lock boosting block */
				drop_boost = FALSE;
				break;
			}
		}
	}

	if (drop_boost) {
		lf_drop_assertion(block);
	}
}

static void
lf_boost_blocking_proc(struct lockf *lock, struct lockf *block)
{
	task_t ltask = proc_task(lock->lf_owner);
	task_t btask = proc_task(block->lf_owner);

	/*
	 * Check if ltask can donate importance.  The
	 * check of imp_donor bit is done without holding
	 * any lock.  The value may change after you read it,
	 * but it is ok to boost a task while someone else is
	 * unboosting you.
	 *
	 * TODO: Support live inheritance on file locks.
	 */
	if (task_is_importance_donor(ltask)) {
		LOCKF_DEBUG(LF_DBG_IMPINH,
		    "lf: %d: attempt to boost pid %d that holds lock %p\n",
		    proc_pid(lock->lf_owner), proc_pid(block->lf_owner), block);

		if (block->lf_boosted != LF_BOOSTED &&
		    task_is_importance_receiver_type(btask)) {
			lf_hold_assertion(btask, block);
		}
		lf_jump_to_queue_head(block, lock);
	}
}
#endif /* IMPORTANCE_INHERITANCE */