1 /*
2 * Copyright (c) 2019-2020 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * Copyright (c) 1982, 1986, 1989, 1993
30 * The Regents of the University of California. All rights reserved.
31 *
32 * This code is derived from software contributed to Berkeley by
33 * Scooter Morris at Genentech Inc.
34 *
35 * Redistribution and use in source and binary forms, with or without
36 * modification, are permitted provided that the following conditions
37 * are met:
38 * 1. Redistributions of source code must retain the above copyright
39 * notice, this list of conditions and the following disclaimer.
40 * 2. Redistributions in binary form must reproduce the above copyright
41 * notice, this list of conditions and the following disclaimer in the
42 * documentation and/or other materials provided with the distribution.
43 * 4. Neither the name of the University nor the names of its contributors
44 * may be used to endorse or promote products derived from this software
45 * without specific prior written permission.
46 *
47 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
48 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
49 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
50 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
51 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
52 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
53 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
54 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
55 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
56 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
57 * SUCH DAMAGE.
58 *
59 * @(#)ufs_lockf.c 8.3 (Berkeley) 1/6/94
60 */
61
62 #include <sys/cdefs.h>
63 #include <sys/param.h>
64 #include <sys/systm.h>
65 #include <sys/kernel.h>
66 #include <sys/lock.h>
67 #include <sys/mount.h>
68 #include <sys/proc.h>
69 #include <sys/signalvar.h>
70 #include <sys/unistd.h>
71 #include <sys/user.h>
72 #include <sys/vnode.h>
73 #include <sys/vnode_internal.h>
74 #include <sys/vnode_if.h>
75 #include <sys/malloc.h>
76 #include <sys/fcntl.h>
77 #include <sys/lockf.h>
78 #include <sys/sdt.h>
79 #include <kern/policy_internal.h>
80
81 #include <sys/file_internal.h>
82
83 #if (DEVELOPMENT || DEBUG)
84 #define LOCKF_DEBUGGING 1
85 #endif
86
#ifdef LOCKF_DEBUGGING
#include <sys/sysctl.h>
void lf_print(const char *tag, struct lockf *lock);
void lf_printlist(const char *tag, struct lockf *lock);

/* Debug selector bits for the lockf_debug sysctl. */
#define LF_DBG_LOCKOP   (1 << 0)        /* setlk, getlk, clearlk */
#define LF_DBG_LIST     (1 << 1)        /* split, coalesce */
#define LF_DBG_IMPINH   (1 << 2)        /* importance inheritance */
#define LF_DBG_TRACE    (1 << 3)        /* errors, exit */
#define LF_DBG_DEADLOCK (1 << 4)        /* deadlock detection */

static int lockf_debug = 0;     /* was 2, could be 3 ;-) */
SYSCTL_INT(_debug, OID_AUTO, lockf_debug, CTLFLAG_RW | CTLFLAG_LOCKED, &lockf_debug, 0, "");

/*
 * If the selector is set, then output the debugging diagnostic.
 */
#define LOCKF_DEBUG(mask, ...)                          \
	do {                                            \
	        if ((mask) & lockf_debug) {             \
	                printf("%s>", __FUNCTION__);    \
	                printf(__VA_ARGS__);            \
	        }                                       \
	} while(0)

/*
 * Non-zero iff any of the selector bits in 'mask' are currently enabled.
 * Plain parenthesized expression; the GNU statement-expression form
 * previously used here was unnecessary.
 */
#define LOCKF_DEBUGP(mask)      ((mask) & lockf_debug)
#else   /* !LOCKF_DEBUGGING */
#define LOCKF_DEBUG(mask, ...)  /* mask */
/*
 * Define the predicate in non-debug builds too, so that any use outside
 * an #ifdef LOCKF_DEBUGGING guard does not break release compilation;
 * mirrors LOCKF_DEBUG, which is defined in both branches.
 */
#define LOCKF_DEBUGP(mask)      (0)
#endif  /* !LOCKF_DEBUGGING */
119
/* Typed kalloc zone from which every 'struct lockf' in this file is allocated. */
KALLOC_TYPE_DEFINE(KT_LOCKF, struct lockf, KT_PRIV_ACCT);

#define NOLOCKF (struct lockf *)0       /* null lock-list sentinel */
#define SELF    0x1                     /* lf_findoverlap: match locks with our lf_id */
#define OTHERS  0x2                     /* lf_findoverlap: match locks with other owners */
#define OFF_MAX 0x7fffffffffffffffULL   /* max off_t */

/* return the effective end of a 'struct lockf': lf_end == -1 is OFF_MAX */
#define LF_END(l) ((l)->lf_end == -1 ? OFF_MAX : (l)->lf_end)

/*
 * Overlapping lock states
 *
 * For lk_find_overlap(..., SELF, ...), the possible sequences are a single:
 * - OVERLAP_NONE,
 * - OVERLAP_EQUALS_LOCK, or
 * - OVERLAP_CONTAINS_LOCK
 *
 * or the following sequence:
 * - optional OVERLAP_STARTS_BEFORE_LOCK
 * - zero or more OVERLAP_CONTAINED_BY_LOCK
 * - optional OVERLAP_ENDS_AFTER_LOCK
 * - OVERLAP_NONE
 *
 * In the annotations:
 * - the search lock is [SS, SE] and
 * - the returned overlap lock is [OS,OE].
 */
typedef enum {
	OVERLAP_NONE = 0,
	OVERLAP_EQUALS_LOCK,            /* OS == SS && OE == SE */
	OVERLAP_CONTAINS_LOCK,          /* OS <= SS && OE >= SE */
	OVERLAP_CONTAINED_BY_LOCK,      /* OS >= SS && OE <= SE */
	OVERLAP_STARTS_BEFORE_LOCK,     /* OS < SS && OE >= SS */
	OVERLAP_ENDS_AFTER_LOCK         /* OS > SS && OE > SE */
} overlap_t;

/* Forward declarations for the static helpers implemented below. */
static int       lf_clearlock(struct lockf *);
static int       lf_transferlock(struct lockf *);
static overlap_t lf_findoverlap(struct lockf *,
    struct lockf *, int, struct lockf ***, struct lockf **);
static struct lockf *lf_getblock(struct lockf *, pid_t);
static int       lf_getlock(struct lockf *, struct flock *, pid_t);
static int       lf_setlock(struct lockf *, struct timespec *);
static int       lf_split(struct lockf *, struct lockf *);
static void      lf_wakelock(struct lockf *, boolean_t);
#if IMPORTANCE_INHERITANCE
static void      lf_hold_assertion(task_t, struct lockf *);
static void      lf_jump_to_queue_head(struct lockf *, struct lockf *);
static void      lf_drop_assertion(struct lockf *);
static void      lf_boost_blocking_proc(struct lockf *, struct lockf *);
static void      lf_adjust_assertion(struct lockf *block);
#endif /* IMPORTANCE_INHERITANCE */

/*
 * Global mutex serializing deadlock detection in lf_setlock(); it keeps a
 * lockf structure pointed at by a sleeping thread's uu_wchan alive while
 * the blocking chain is examined (see the long comment in lf_setlock).
 */
static LCK_GRP_DECLARE(lf_dead_lock_grp, "lf_dead_lock");
static LCK_MTX_DECLARE(lf_dead_lock, &lf_dead_lock_grp);
176
177 /*
178 * lf_advlock
179 *
180 * Description: Advisory record locking support
181 *
182 * Parameters: ap Argument pointer to a vnop_advlock_args
183 * argument descriptor structure for the
184 * lock operation to be attempted.
185 *
186 * Returns: 0 Success
187 * EOVERFLOW
188 * EINVAL
189 * ENOLCK Number of locked regions exceeds limit
190 * lf_setlock:EAGAIN
191 * lf_setlock:EDEADLK
192 * lf_setlock:EINTR
193 * lf_setlock:ENOLCK
194 * lf_setlock:ETIMEDOUT
195 * lf_clearlock:ENOLCK
196 * vnode_size:???
197 *
198 * Notes: We return ENOLCK when we run out of memory to support locks; as
199 * such, there is no specific expectation limit other than the
200 * amount of available resources.
201 */
int
lf_advlock(struct vnop_advlock_args *ap)
{
	struct vnode *vp = ap->a_vp;
	struct flock *fl = ap->a_fl;
	vfs_context_t context = ap->a_context;
	struct lockf *lock;
	off_t start, end, oadd;
	u_quad_t size;
	int error;
	struct lockf **head = &vp->v_lockf;

	/* XXX HFS may need a !vnode_isreg(vp) EISDIR error here */

	/*
	 * Avoid the common case of unlocking when inode has no locks.
	 */
	if (*head == (struct lockf *)0) {
		if (ap->a_op != F_SETLK) {
			fl->l_type = F_UNLCK;
			LOCKF_DEBUG(LF_DBG_TRACE,
			    "lf_advlock: '%s' unlock without lock\n",
			    vfs_context_proc(context)->p_comm);
			return 0;
		}
	}

	/*
	 * Convert the flock structure into a start and end.
	 */
	switch (fl->l_whence) {
	case SEEK_SET:
	case SEEK_CUR:
		/*
		 * Caller is responsible for adding any necessary offset
		 * when SEEK_CUR is used.
		 */
		start = fl->l_start;
		break;

	case SEEK_END:

		/*
		 * It's OK to cast the u_quad_t to and off_t here, since they
		 * are the same storage size, and the value of the returned
		 * contents will never overflow into the sign bit.  We need to
		 * do this because we will use size to force range checks.
		 */
		if ((error = vnode_size(vp, (off_t *)&size, context))) {
			LOCKF_DEBUG(LF_DBG_TRACE,
			    "lf_advlock: vnode_getattr failed: %d\n", error);
			return error;
		}

		if (size > OFF_MAX ||
		    (fl->l_start > 0 &&
		    size > (u_quad_t)(OFF_MAX - fl->l_start))) {
			return EOVERFLOW;
		}
		start = size + fl->l_start;
		break;

	default:
		LOCKF_DEBUG(LF_DBG_TRACE, "lf_advlock: unknown whence %d\n",
		    fl->l_whence);
		return EINVAL;
	}
	if (start < 0) {
		LOCKF_DEBUG(LF_DBG_TRACE, "lf_advlock: start < 0 (%qd)\n",
		    start);
		return EINVAL;
	}
	if (fl->l_len < 0) {
		/*
		 * A negative l_len means the range extends backwards from
		 * l_start: lock [start + l_len, start - 1].
		 */
		if (start == 0) {
			LOCKF_DEBUG(LF_DBG_TRACE,
			    "lf_advlock: len < 0 & start == 0\n");
			return EINVAL;
		}
		end = start - 1;
		start += fl->l_len;
		if (start < 0) {
			LOCKF_DEBUG(LF_DBG_TRACE,
			    "lf_advlock: start < 0 (%qd)\n", start);
			return EINVAL;
		}
	} else if (fl->l_len == 0) {
		/* l_len == 0 means "to end of file"; -1 is the EOF sentinel (see LF_END) */
		end = -1;
	} else {
		oadd = fl->l_len - 1;
		if (oadd > (off_t)(OFF_MAX - start)) {
			LOCKF_DEBUG(LF_DBG_TRACE, "lf_advlock: overflow\n");
			return EOVERFLOW;
		}
		end = start + oadd;
	}
	/*
	 * Create the lockf structure
	 */
	lock = zalloc_flags(KT_LOCKF, Z_WAITOK | Z_NOFAIL);
	lock->lf_start = start;
	lock->lf_end = end;
	lock->lf_id = ap->a_id;
	lock->lf_vnode = vp;
	lock->lf_type = fl->l_type;
	lock->lf_head = head;
	lock->lf_next = (struct lockf *)0;
	TAILQ_INIT(&lock->lf_blkhd);
	lock->lf_flags = (short)ap->a_flags;
#if IMPORTANCE_INHERITANCE
	lock->lf_boosted = LF_NOT_BOOSTED;
#endif
	/* For POSIX locks lf_id is the owning proc; flock/OFD locks have no owner yet */
	if (ap->a_flags & F_POSIX) {
		lock->lf_owner = (struct proc *)lock->lf_id;
	} else {
		lock->lf_owner = NULL;
	}

	if (ap->a_flags & F_FLOCK) {
		lock->lf_flags |= F_WAKE1_SAFE;
	}

	lck_mtx_lock(&vp->v_lock);      /* protect the lockf list */
	/*
	 * Do the requested operation.
	 */
	switch (ap->a_op) {
	case F_SETLK:
		/*
		 * For F_OFD_* locks, lf_id is the fileglob.
		 * Record an "lf_owner" iff this is a confined fd
		 * i.e. it cannot escape this process and will be
		 * F_UNLCKed before the owner exits.  (This is
		 * the implicit guarantee needed to ensure lf_owner
		 * remains a valid reference here.)
		 */
		if (ap->a_flags & F_OFD_LOCK) {
			struct fileglob *fg = (void *)lock->lf_id;
			if (fg->fg_lflags & FG_CONFINED) {
				lock->lf_owner = current_proc();
			}
		}
		/* lf_setlock() consumes 'lock' on both success and failure */
		error = lf_setlock(lock, ap->a_timeout);
		break;

	case F_UNLCK:
		error = lf_clearlock(lock);
		zfree(KT_LOCKF, lock);
		break;

	case F_TRANSFER:
		/*
		 * The new owner is passed in the context, set the new owner
		 * in the lf_owner field.
		 */
		lock->lf_owner = vfs_context_proc(context);
		assert(lock->lf_owner != current_proc());
		error = lf_transferlock(lock);
		zfree(KT_LOCKF, lock);
		break;

	case F_GETLK:
		error = lf_getlock(lock, fl, -1);
		zfree(KT_LOCKF, lock);
		break;

	case F_GETLKPID:
		error = lf_getlock(lock, fl, fl->l_pid);
		zfree(KT_LOCKF, lock);
		break;

	default:
		zfree(KT_LOCKF, lock);
		error = EINVAL;
		break;
	}
	lck_mtx_unlock(&vp->v_lock);    /* done manipulating the list */

	LOCKF_DEBUG(LF_DBG_TRACE, "lf_advlock: normal exit: %d\n", error);
	return error;
}
382
383 /*
384 * Empty the queue of msleeping requests for a lock on the given vnode.
385 * Called with the vnode already locked. Used for forced unmount, where
386 * a flock(2) invoker sleeping on a blocked lock holds an iocount reference
387 * that prevents the vnode from ever being drained. Force unmounting wins.
388 */
389 void
lf_abort_advlocks(vnode_t vp)390 lf_abort_advlocks(vnode_t vp)
391 {
392 struct lockf *lock;
393
394 if ((lock = vp->v_lockf) == NULL) {
395 return;
396 }
397
398 lck_mtx_assert(&vp->v_lock, LCK_MTX_ASSERT_OWNED);
399
400 if (!TAILQ_EMPTY(&lock->lf_blkhd)) {
401 struct lockf *tlock;
402
403 TAILQ_FOREACH(tlock, &lock->lf_blkhd, lf_block) {
404 /*
405 * Setting this flag should cause all
406 * currently blocked F_SETLK request to
407 * return to userland with an errno.
408 */
409 tlock->lf_flags |= F_ABORT;
410 }
411 lf_wakelock(lock, TRUE);
412 }
413 }
414
415 /*
416 * Take any lock attempts which are currently blocked by a given lock ("from")
417 * and mark them as blocked by a different lock ("to"). Used in the case
418 * where a byte range currently occupied by "from" is to be occupied by "to."
419 */
420 static void
lf_move_blocked(struct lockf * to,struct lockf * from)421 lf_move_blocked(struct lockf *to, struct lockf *from)
422 {
423 struct lockf *tlock;
424
425 TAILQ_FOREACH(tlock, &from->lf_blkhd, lf_block) {
426 tlock->lf_next = to;
427 }
428
429 TAILQ_CONCAT(&to->lf_blkhd, &from->lf_blkhd, lf_block);
430 }
431
432 /*
433 * lf_coalesce_adjacent
434 *
435 * Description: Helper function: when setting a lock, coalesce adjacent
436 * locks. Needed because adjacent locks are not overlapping,
437 * but POSIX requires that they be coalesced.
438 *
439 * Parameters: lock The new lock which may be adjacent
440 * to already locked regions, and which
441 * should therefore be coalesced with them
442 *
443 * Returns: <void>
444 */
static void
lf_coalesce_adjacent(struct lockf *lock)
{
	/* Walk the list via a pointer-to-link so we can unlink in place. */
	struct lockf **lf = lock->lf_head;

	while (*lf != NOLOCKF) {
		/* reject locks that obviously could not be coalesced */
		if ((*lf == lock) ||
		    ((*lf)->lf_id != lock->lf_id) ||
		    ((*lf)->lf_type != lock->lf_type)) {
			lf = &(*lf)->lf_next;
			continue;
		}

		/*
		 * NOTE: Assumes that if two locks are adjacent on the number line
		 * and belong to the same owner, then they are adjacent on the list.
		 */
		/* Candidate ends immediately before 'lock': absorb it into 'lock'. */
		if (LF_END(*lf) < OFF_MAX &&
		    (LF_END(*lf) + 1) == lock->lf_start) {
			struct lockf *adjacent = *lf;

			LOCKF_DEBUG(LF_DBG_LIST, "lf_coalesce_adjacent: coalesce adjacent previous\n");
			lock->lf_start = (*lf)->lf_start;
			*lf = lock;
			lf = &(*lf)->lf_next;

			/* waiters queued on 'adjacent' must now block on 'lock' */
			lf_move_blocked(lock, adjacent);

			zfree(KT_LOCKF, adjacent);
			continue;
		}
		/* If the lock starts adjacent to us, we can coalesce it */
		if (LF_END(lock) < OFF_MAX &&
		    (LF_END(lock) + 1) == (*lf)->lf_start) {
			struct lockf *adjacent = *lf;

			LOCKF_DEBUG(LF_DBG_LIST, "lf_coalesce_adjacent: coalesce adjacent following\n");
			lock->lf_end = (*lf)->lf_end;
			lock->lf_next = (*lf)->lf_next;
			lf = &lock->lf_next;

			/* waiters queued on 'adjacent' must now block on 'lock' */
			lf_move_blocked(lock, adjacent);

			zfree(KT_LOCKF, adjacent);
			continue;
		}

		/* no matching conditions; go on to next lock */
		lf = &(*lf)->lf_next;
	}
}
497
498 /*
499 * lf_setlock
500 *
501 * Description: Set a byte-range lock.
502 *
503 * Parameters: lock The lock structure describing the lock
504 * to be set; allocated by the caller, it
505 * will be linked into the lock list if
506 * the set is successful, and freed if the
507 * set is unsuccessful.
508 *
509 * timeout Timeout specified in the case of
510 * SETLKWTIMEOUT.
511 *
512 * Returns: 0 Success
513 * EAGAIN
514 * EDEADLK
515 * lf_split:ENOLCK
516 * lf_clearlock:ENOLCK
517 * msleep:EINTR
518 * msleep:ETIMEDOUT
519 *
520 * Notes: We add the lock to the provisional lock list. We do not
521 * coalesce at this time; this has implications for other lock
522 * requestors in the blocker search mechanism.
523 */
static int
lf_setlock(struct lockf *lock, struct timespec *timeout)
{
	struct lockf *block;
	struct lockf **head = lock->lf_head;
	struct lockf **prev, *overlap;
	static const char lockstr[] = "lockf";
	int priority, needtolink, error;
	struct vnode *vp = lock->lf_vnode;
	overlap_t ovcase;

#ifdef LOCKF_DEBUGGING
	if (LOCKF_DEBUGP(LF_DBG_LOCKOP)) {
		lf_print("lf_setlock", lock);
		lf_printlist("lf_setlock(in)", lock);
	}
#endif /* LOCKF_DEBUGGING */
	LOCKF_DEBUG(LF_DBG_DEADLOCK, "lock %p Looking for deadlock, vnode %p\n", lock, lock->lf_vnode);

	/*
	 * Set the priority
	 */
	priority = PLOCK;
	if (lock->lf_type == F_WRLCK) {
		priority += 4;
	}
	priority |= PCATCH;     /* allow the sleep below to be interrupted by signals */
scan:
	/*
	 * Scan lock list for this file looking for locks that would block us.
	 */
	while ((block = lf_getblock(lock, -1))) {
		/*
		 * Free the structure and return if nonblocking.
		 */
		if ((lock->lf_flags & F_WAIT) == 0) {
			DTRACE_FSINFO(advlock__nowait, vnode_t, vp);
			zfree(KT_LOCKF, lock);
			return EAGAIN;
		}

		LOCKF_DEBUG(LF_DBG_DEADLOCK, "lock %p found blocking lock %p\n", lock, block);

		/*
		 * We are blocked. Since flock style locks cover
		 * the whole file, there is no chance for deadlock.
		 *
		 * OFD byte-range locks currently do NOT support
		 * deadlock detection.
		 *
		 * For POSIX byte-range locks we must check for deadlock.
		 *
		 * Deadlock detection is done by looking through the
		 * wait channels to see if there are any cycles that
		 * involve us.
		 */
		if ((lock->lf_flags & F_POSIX) &&
		    (block->lf_flags & F_POSIX)) {
			lck_mtx_lock(&lf_dead_lock);

			/* The blocked process is waiting on something */
			struct proc *wproc = block->lf_owner;
			proc_lock(wproc);

			LOCKF_DEBUG(LF_DBG_DEADLOCK, "lock %p owned by pid %d\n", lock, proc_pid(wproc));

			struct uthread *ut;
			TAILQ_FOREACH(ut, &wproc->p_uthlist, uu_list) {
				/*
				 * If the thread is (a) asleep (uu_wchan != 0)
				 * and (b) in this code (uu_wmesg == lockstr)
				 * then check to see if the lock is blocked behind
				 * someone blocked behind us.
				 *
				 * Note: (i) vp->v_lock is held, preventing other
				 * threads from mutating the blocking list for our vnode.
				 * and (ii) the proc_lock is held i.e the thread list
				 * is stable.
				 *
				 * HOWEVER some thread in wproc might be sleeping on a lockf
				 * structure for a different vnode, and be woken at any
				 * time. Thus the waitblock list could mutate while
				 * it's being inspected by this thread, and what
				 * ut->uu_wchan was just pointing at could even be freed.
				 *
				 * Nevertheless this is safe here because of lf_dead_lock; if
				 * any thread blocked with uu_wmesg == lockstr wakes (see below)
				 * it will try to acquire lf_dead_lock which is already held
				 * here. Holding that lock prevents the lockf structure being
				 * pointed at by ut->uu_wchan from going away. Thus the vnode
				 * involved can be found and locked, and the corresponding
				 * blocking chain can then be examined safely.
				 */
				const struct lockf *waitblock = (const void *)ut->uu_wchan;
				if ((waitblock != NULL) && (ut->uu_wmesg == lockstr)) {
					LOCKF_DEBUG(LF_DBG_DEADLOCK, "lock %p which is also blocked on lock %p vnode %p\n", lock, waitblock, waitblock->lf_vnode);

					vnode_t othervp = NULL;
					if (waitblock->lf_vnode != vp) {
						/*
						 * This thread in wproc is waiting for a lock
						 * on a different vnode; grab the lock on it
						 * that protects lf_next while we examine it.
						 */
						othervp = waitblock->lf_vnode;
						if (!lck_mtx_try_lock(&othervp->v_lock)) {
							/*
							 * avoid kernel deadlock: drop all
							 * locks, pause for a bit to let the
							 * other thread do what it needs to do,
							 * then (because we drop and retake
							 * v_lock) retry the scan.
							 */
							proc_unlock(wproc);
							lck_mtx_unlock(&lf_dead_lock);
							static struct timespec ts = {
								.tv_sec = 0,
								.tv_nsec = 2 * NSEC_PER_MSEC,
							};
							static const char pausestr[] = "lockf:pause";
							(void) msleep(lock, &vp->v_lock, priority, pausestr, &ts);
							LOCKF_DEBUG(LF_DBG_DEADLOCK, "lock %p contention for vp %p => restart\n", lock, othervp);
							goto scan;
						}
					}

					/*
					 * Get the lock blocking the lock
					 * which would block us, and make
					 * certain it hasn't become unblocked
					 * (been granted, e.g. between the time
					 * we called lf_getblock, and the time
					 * we successfully acquired the
					 * proc_lock).
					 */
					const struct lockf *nextblock = waitblock->lf_next;
					if (nextblock == NULL) {
						if (othervp) {
							lck_mtx_unlock(&othervp->v_lock);
						}
						LOCKF_DEBUG(LF_DBG_DEADLOCK, "lock %p with waitblock %p and no lf_next; othervp %p\n", lock, waitblock, othervp);
						continue;
					}
					LOCKF_DEBUG(LF_DBG_DEADLOCK, "lock %p which is also blocked on lock %p vnode %p\n", lock, nextblock, nextblock->lf_vnode);

					/*
					 * Make sure it's an advisory range
					 * lock and not any other kind of lock;
					 * if we mix lock types, it's our own
					 * fault.
					 */
					if ((nextblock->lf_flags & F_POSIX) == 0) {
						if (othervp) {
							lck_mtx_unlock(&othervp->v_lock);
						}
						continue;
					}

					/*
					 * If the owner of the lock that's
					 * blocking a lock that's blocking us
					 * getting the requested lock, then we
					 * would deadlock, so error out.
					 */
					struct proc *bproc = nextblock->lf_owner;
					const boolean_t deadlocked = bproc == lock->lf_owner;

					if (othervp) {
						lck_mtx_unlock(&othervp->v_lock);
					}
					LOCKF_DEBUG(LF_DBG_DEADLOCK, "lock %p owned by pid %d\n", lock, proc_pid(bproc));
					if (deadlocked) {
						LOCKF_DEBUG(LF_DBG_DEADLOCK, "lock %p which is me, so EDEADLK\n", lock);
						proc_unlock(wproc);
						lck_mtx_unlock(&lf_dead_lock);
						zfree(KT_LOCKF, lock);
						return EDEADLK;
					}
				}
				LOCKF_DEBUG(LF_DBG_DEADLOCK, "lock %p bottom of thread loop\n", lock);
			}
			proc_unlock(wproc);
			lck_mtx_unlock(&lf_dead_lock);
		}

		/*
		 * For flock type locks, we must first remove
		 * any shared locks that we hold before we sleep
		 * waiting for an exclusive lock.
		 */
		if ((lock->lf_flags & F_FLOCK) &&
		    lock->lf_type == F_WRLCK) {
			lock->lf_type = F_UNLCK;
			if ((error = lf_clearlock(lock)) != 0) {
				zfree(KT_LOCKF, lock);
				return error;
			}
			lock->lf_type = F_WRLCK;
		}
		/*
		 * Add our lock to the blocked list and sleep until we're free.
		 * Remember who blocked us (for deadlock detection).
		 */
		lock->lf_next = block;
		TAILQ_INSERT_TAIL(&block->lf_blkhd, lock, lf_block);

		if (!(lock->lf_flags & F_FLOCK)) {
			block->lf_flags &= ~F_WAKE1_SAFE;
		}

#if IMPORTANCE_INHERITANCE
		/*
		 * Importance donation is done only for cases where the
		 * owning task can be unambiguously determined.
		 *
		 * POSIX type locks are not inherited by child processes;
		 * we maintain a 1:1 mapping between a lock and its owning
		 * process.
		 *
		 * Flock type locks are inherited across fork() and there is
		 * no 1:1 mapping in the general case. However, the fileglobs
		 * used by OFD locks *may* be confined to the process that
		 * created them, and thus have an "owner", in which case
		 * we also attempt importance donation.
		 */
		if ((lock->lf_flags & block->lf_flags & F_POSIX) != 0) {
			lf_boost_blocking_proc(lock, block);
		} else if ((lock->lf_flags & block->lf_flags & F_OFD_LOCK) &&
		    lock->lf_owner != block->lf_owner &&
		    NULL != lock->lf_owner && NULL != block->lf_owner) {
			lf_boost_blocking_proc(lock, block);
		}
#endif /* IMPORTANCE_INHERITANCE */

#ifdef LOCKF_DEBUGGING
		if (LOCKF_DEBUGP(LF_DBG_LOCKOP)) {
			lf_print("lf_setlock: blocking on", block);
			lf_printlist("lf_setlock(block)", block);
		}
#endif /* LOCKF_DEBUGGING */
		DTRACE_FSINFO(advlock__wait, vnode_t, vp);

		if (lock->lf_flags & F_POSIX) {
			error = msleep(lock, &vp->v_lock, priority, lockstr, timeout);
			/*
			 * Ensure that 'lock' doesn't get mutated or freed if a
			 * wakeup occurs while hunting for deadlocks (and holding
			 * lf_dead_lock - see above)
			 */
			lck_mtx_lock(&lf_dead_lock);
			lck_mtx_unlock(&lf_dead_lock);
		} else {
			/* non-POSIX sleepers use a distinct wmesg so the
			 * deadlock scan above (uu_wmesg == lockstr) skips them */
			static const char lockstr_np[] = "lockf:np";
			error = msleep(lock, &vp->v_lock, priority, lockstr_np, timeout);
		}

		if (error == 0 && (lock->lf_flags & F_ABORT) != 0) {
			error = EBADF;
		}

		if (lock->lf_next) {
			/*
			 * lf_wakelock() always sets wakelock->lf_next to
			 * NULL before a wakeup; so we've been woken early
			 * - perhaps by a debugger, signal or other event.
			 *
			 * Remove 'lock' from the block list (avoids double-add
			 * in the spurious case, which would create a cycle)
			 */
			TAILQ_REMOVE(&lock->lf_next->lf_blkhd, lock, lf_block);
#if IMPORTANCE_INHERITANCE
			/*
			 * Adjust the boost on lf_next.
			 */
			lf_adjust_assertion(lock->lf_next);
#endif /* IMPORTANCE_INHERITANCE */
			lock->lf_next = NULL;

			if (error == 0) {
				/*
				 * If this was a spurious wakeup, retry
				 */
				printf("%s: spurious wakeup, retrying lock\n",
				    __func__);
				continue;
			}
		}

		/* Hand any locks queued behind us over to whatever still blocks us */
		if (!TAILQ_EMPTY(&lock->lf_blkhd)) {
			if ((block = lf_getblock(lock, -1)) != NULL) {
				lf_move_blocked(block, lock);
			}
		}

		if (error) {
			if (!TAILQ_EMPTY(&lock->lf_blkhd)) {
				lf_wakelock(lock, TRUE);
			}
			zfree(KT_LOCKF, lock);
			/* Return ETIMEDOUT if timeout occurred. */
			if (error == EWOULDBLOCK) {
				error = ETIMEDOUT;
			}
			return error;
		}
	}

	/*
	 * No blocks!!  Add the lock.  Note that we will
	 * downgrade or upgrade any overlapping locks this
	 * process already owns.
	 *
	 * Skip over locks owned by other processes.
	 * Handle any locks that overlap and are owned by ourselves.
	 */
	prev = head;
	block = *head;
	needtolink = 1;
	for (;;) {
		const off_t lkend = LF_END(lock);
		ovcase = lf_findoverlap(block, lock, SELF, &prev, &overlap);
		if (ovcase) {
			block = overlap->lf_next;
		}
		/*
		 * Six cases:
		 *	0) no overlap
		 *	1) overlap == lock
		 *	2) overlap contains lock
		 *	3) lock contains overlap
		 *	4) overlap starts before lock
		 *	5) overlap ends after lock
		 */
		switch (ovcase) {
		case OVERLAP_NONE:
			if (needtolink) {
				*prev = lock;
				lock->lf_next = overlap;
			}
			break;

		case OVERLAP_EQUALS_LOCK:
			/*
			 * If downgrading lock, others may be
			 * able to acquire it.
			 */
			if (lock->lf_type == F_RDLCK &&
			    overlap->lf_type == F_WRLCK) {
				lf_wakelock(overlap, TRUE);
			}
			overlap->lf_type = lock->lf_type;
			lf_move_blocked(overlap, lock);
			zfree(KT_LOCKF, lock);
			lock = overlap; /* for lf_coalesce_adjacent() */
			break;

		case OVERLAP_CONTAINS_LOCK:
			/*
			 * Check for common starting point and different types.
			 */
			if (overlap->lf_type == lock->lf_type) {
				lf_move_blocked(overlap, lock);
				zfree(KT_LOCKF, lock);
				lock = overlap; /* for lf_coalesce_adjacent() */
				break;
			}
			if (overlap->lf_start == lock->lf_start) {
				*prev = lock;
				lock->lf_next = overlap;
				assert(lkend < OFF_MAX);
				overlap->lf_start = lkend + 1;
			} else {
				/*
				 * If we can't split the lock, we can't
				 * grant it.  Claim a system limit for the
				 * resource shortage.
				 */
				if (lf_split(overlap, lock)) {
					zfree(KT_LOCKF, lock);
					return ENOLCK;
				}
			}
			lf_wakelock(overlap, TRUE);
			break;

		case OVERLAP_CONTAINED_BY_LOCK:
			/*
			 * If downgrading lock, others may be able to
			 * acquire it, otherwise take the list.
			 */
			if (lock->lf_type == F_RDLCK &&
			    overlap->lf_type == F_WRLCK) {
				lf_wakelock(overlap, TRUE);
			} else {
				lf_move_blocked(lock, overlap);
			}
			/*
			 * Add the new lock if necessary and delete the overlap.
			 */
			if (needtolink) {
				*prev = lock;
				lock->lf_next = overlap->lf_next;
				prev = &lock->lf_next;
				needtolink = 0;
			} else {
				*prev = overlap->lf_next;
			}
			zfree(KT_LOCKF, overlap);
			continue;

		case OVERLAP_STARTS_BEFORE_LOCK:
			/*
			 * Add lock after overlap on the list.
			 */
			lock->lf_next = overlap->lf_next;
			overlap->lf_next = lock;
			assert(lock->lf_start > 0);
			overlap->lf_end = lock->lf_start - 1;
			prev = &lock->lf_next;
			lf_wakelock(overlap, TRUE);
			needtolink = 0;
			continue;

		case OVERLAP_ENDS_AFTER_LOCK:
			/*
			 * Add the new lock before overlap.
			 */
			if (needtolink) {
				*prev = lock;
				lock->lf_next = overlap;
			}
			assert(lkend < OFF_MAX);
			overlap->lf_start = lkend + 1;
			lf_wakelock(overlap, TRUE);
			break;
		}
		break;
	}
	/* Coalesce adjacent locks with identical attributes */
	lf_coalesce_adjacent(lock);
#ifdef LOCKF_DEBUGGING
	if (LOCKF_DEBUGP(LF_DBG_LOCKOP)) {
		lf_print("lf_setlock: got the lock", lock);
		lf_printlist("lf_setlock(out)", lock);
	}
#endif /* LOCKF_DEBUGGING */
	return 0;
}
972
973
974 /*
975 * lf_clearlock
976 *
977 * Description: Remove a byte-range lock on an vnode. Generally, find the
978 * lock (or an overlap to that lock) and remove it (or shrink
979 * it), then wakeup anyone we can.
980 *
981 * Parameters: unlock The lock to clear
982 *
983 * Returns: 0 Success
984 * lf_split:ENOLCK
985 *
986 * Notes: A caller may unlock all the locks owned by the caller by
987 * specifying the entire file range; locks owned by other
988 * callers are not effected by this operation.
989 */
static int
lf_clearlock(struct lockf *unlock)
{
	struct lockf **head = unlock->lf_head;
	struct lockf *lf = *head;
	struct lockf *overlap, **prev;
	overlap_t ovcase;

	if (lf == NOLOCKF) {
		return 0;
	}
#ifdef LOCKF_DEBUGGING
	if (unlock->lf_type != F_UNLCK) {
		panic("lf_clearlock: bad type");
	}
	if (LOCKF_DEBUGP(LF_DBG_LOCKOP)) {
		lf_print("lf_clearlock", unlock);
	}
#endif /* LOCKF_DEBUGGING */
	prev = head;
	/* Walk every SELF-owned lock overlapping the unlock range. */
	while ((ovcase = lf_findoverlap(lf, unlock, SELF, &prev, &overlap)) != OVERLAP_NONE) {
		const off_t unlkend = LF_END(unlock);
		/*
		 * Wakeup the list of locks to be retried.
		 */
		lf_wakelock(overlap, FALSE);
#if IMPORTANCE_INHERITANCE
		if (overlap->lf_boosted == LF_BOOSTED) {
			lf_drop_assertion(overlap);
		}
#endif /* IMPORTANCE_INHERITANCE */

		switch (ovcase) {
		case OVERLAP_NONE:      /* satisfy compiler enum/switch */
			break;

		case OVERLAP_EQUALS_LOCK:
			/* exact match: unlink and free it */
			*prev = overlap->lf_next;
			zfree(KT_LOCKF, overlap);
			break;

		case OVERLAP_CONTAINS_LOCK: /* split it */
			if (overlap->lf_start == unlock->lf_start) {
				/* common start: just trim the front of 'overlap' */
				assert(unlkend < OFF_MAX);
				overlap->lf_start = unlkend + 1;
				break;
			}
			/*
			 * If we can't split the lock, we can't grant it.
			 * Claim a system limit for the resource shortage.
			 */
			if (lf_split(overlap, unlock)) {
				return ENOLCK;
			}
			overlap->lf_next = unlock->lf_next;
			break;

		case OVERLAP_CONTAINED_BY_LOCK:
			/* wholly inside the unlock range: remove and keep scanning */
			*prev = overlap->lf_next;
			lf = overlap->lf_next;
			zfree(KT_LOCKF, overlap);
			continue;

		case OVERLAP_STARTS_BEFORE_LOCK:
			/* trim the tail of 'overlap' and keep scanning */
			assert(unlock->lf_start > 0);
			overlap->lf_end = unlock->lf_start - 1;
			prev = &overlap->lf_next;
			lf = overlap->lf_next;
			continue;

		case OVERLAP_ENDS_AFTER_LOCK:
			/* trim the front of 'overlap'; nothing further can overlap */
			assert(unlkend < OFF_MAX);
			overlap->lf_start = unlkend + 1;
			break;
		}
		break;
	}
#ifdef LOCKF_DEBUGGING
	if (LOCKF_DEBUGP(LF_DBG_LOCKOP)) {
		lf_printlist("lf_clearlock", unlock);
	}
#endif /* LOCKF_DEBUGGING */
	return 0;
}
1074
1075
1076 /*
1077 * lf_transferlock
1078 *
1079 * Description: Transfer a give lock from old_proc to new proc during exec
1080 *
1081 * Parameters: unlock The lock to transfer
1082 *
1083 * Returns: 0 Success
1084 *
1085 * Notes: A caller may transfer all the locks owned by the caller by
1086 * specifying the entire file range; locks owned by other
1087 * callers are not effected by this operation.
1088 */
1089 static int
lf_transferlock(struct lockf * transfer)1090 lf_transferlock(struct lockf *transfer)
1091 {
1092 struct lockf **head = transfer->lf_head;
1093 struct lockf *lf = *head;
1094 struct lockf *overlap, **prev;
1095 overlap_t ovcase;
1096
1097 if (lf == NOLOCKF) {
1098 return 0;
1099 }
1100 #ifdef LOCKF_DEBUGGING
1101 if (transfer->lf_type != F_TRANSFER) {
1102 panic("lf_transferlock: bad type");
1103 }
1104 if (LOCKF_DEBUGP(LF_DBG_LOCKOP)) {
1105 lf_print("lf_transferlock", transfer);
1106 }
1107 #endif /* LOCKF_DEBUGGING */
1108 prev = head;
1109 while ((ovcase = lf_findoverlap(lf, transfer, SELF, &prev, &overlap)) != OVERLAP_NONE) {
1110 /* For POSIX Locks, change lf_id and lf_owner */
1111 if (overlap->lf_flags & F_POSIX) {
1112 overlap->lf_id = (caddr_t)transfer->lf_owner;
1113 overlap->lf_owner = transfer->lf_owner;
1114 } else if (overlap->lf_flags & F_OFD_LOCK) {
1115 /* Change the owner of the ofd style lock, if there is an owner */
1116 if (overlap->lf_owner) {
1117 overlap->lf_owner = transfer->lf_owner;
1118 }
1119 }
1120 /* Find the next lock */
1121 lf = overlap->lf_next;
1122 }
1123 #ifdef LOCKF_DEBUGGING
1124 if (LOCKF_DEBUGP(LF_DBG_LOCKOP)) {
1125 lf_printlist("lf_transferlock", transfer);
1126 }
1127 #endif /* LOCKF_DEBUGGING */
1128 return 0;
1129 }
1130
1131
1132 /*
1133 * lf_getlock
1134 *
1135 * Description: Check whether there is a blocking lock, and if so return
1136 * its process identifier into the lock being requested.
1137 *
1138 * Parameters: lock Pointer to lock to test for blocks
1139 * fl Pointer to flock structure to receive
1140 * the blocking lock information, if a
1141 * blocking lock is found.
1142 * matchpid -1, or pid value to match in lookup.
1143 *
1144 * Returns: 0 Success
1145 *
1146 * Implicit Returns:
1147 * *fl Contents modified to reflect the
1148 * blocking lock, if one is found; not
1149 * modified otherwise
1150 *
1151 * Notes: fl->l_pid will be (-1) for file locks and will only be set to
1152 * the blocking process ID for advisory record locks.
1153 */
1154 static int
lf_getlock(struct lockf * lock,struct flock * fl,pid_t matchpid)1155 lf_getlock(struct lockf *lock, struct flock *fl, pid_t matchpid)
1156 {
1157 struct lockf *block;
1158
1159 #ifdef LOCKF_DEBUGGING
1160 if (LOCKF_DEBUGP(LF_DBG_LOCKOP)) {
1161 lf_print("lf_getlock", lock);
1162 }
1163 #endif /* LOCKF_DEBUGGING */
1164
1165 if ((block = lf_getblock(lock, matchpid))) {
1166 fl->l_type = block->lf_type;
1167 fl->l_whence = SEEK_SET;
1168 fl->l_start = block->lf_start;
1169 if (block->lf_end == -1 ||
1170 (block->lf_start == 0 && LF_END(block) == OFF_MAX)) {
1171 fl->l_len = 0;
1172 } else {
1173 fl->l_len = LF_END(block) - block->lf_start + 1;
1174 }
1175 if (NULL != block->lf_owner) {
1176 /*
1177 * lf_owner is only non-NULL when the lock
1178 * "owner" can be unambiguously determined
1179 */
1180 fl->l_pid = proc_pid(block->lf_owner);
1181 } else {
1182 fl->l_pid = -1;
1183 }
1184 } else {
1185 fl->l_type = F_UNLCK;
1186 }
1187 return 0;
1188 }
1189
1190 /*
1191 * lf_getblock
1192 *
1193 * Description: Walk the list of locks for an inode and return the first
1194 * blocking lock. A lock is considered blocking if we are not
1195 * the lock owner; otherwise, we are permitted to upgrade or
1196 * downgrade it, and it's not considered blocking.
1197 *
1198 * Parameters: lock The lock for which we are interested
1199 * in obtaining the blocking lock, if any
1200 * matchpid -1, or pid value to match in lookup.
1201 *
1202 * Returns: NOLOCKF No blocking lock exists
1203 * !NOLOCKF The address of the blocking lock's
1204 * struct lockf.
1205 */
1206 static struct lockf *
lf_getblock(struct lockf * lock,pid_t matchpid)1207 lf_getblock(struct lockf *lock, pid_t matchpid)
1208 {
1209 struct lockf **prev, *overlap, *lf = *(lock->lf_head);
1210
1211 for (prev = lock->lf_head;
1212 lf_findoverlap(lf, lock, OTHERS, &prev, &overlap) != OVERLAP_NONE;
1213 lf = overlap->lf_next) {
1214 /*
1215 * Found an overlap.
1216 *
1217 * If we're matching pids, and it's a record lock,
1218 * or it's an OFD lock on a process-confined fd,
1219 * but the pid doesn't match, then keep on looking ..
1220 */
1221 if (matchpid != -1 &&
1222 (overlap->lf_flags & (F_POSIX | F_OFD_LOCK)) != 0 &&
1223 proc_pid(overlap->lf_owner) != matchpid) {
1224 continue;
1225 }
1226
1227 /*
1228 * does it block us?
1229 */
1230 if ((lock->lf_type == F_WRLCK || overlap->lf_type == F_WRLCK)) {
1231 return overlap;
1232 }
1233 }
1234 return NOLOCKF;
1235 }
1236
1237
1238 /*
1239 * lf_findoverlap
1240 *
1241 * Description: Walk the list of locks to find an overlapping lock (if any).
1242 *
1243 * Parameters: lf First lock on lock list
1244 * lock The lock we are checking for an overlap
1245 * check Check type
1246 * prev pointer to pointer pointer to contain
1247 * address of pointer to previous lock
1248 * pointer to overlapping lock, if overlap
1249 * overlap pointer to pointer to contain address
1250 * of overlapping lock
1251 *
1252 * Returns: OVERLAP_NONE
1253 * OVERLAP_EQUALS_LOCK
1254 * OVERLAP_CONTAINS_LOCK
1255 * OVERLAP_CONTAINED_BY_LOCK
1256 * OVERLAP_STARTS_BEFORE_LOCK
1257 * OVERLAP_ENDS_AFTER_LOCK
1258 *
1259 * Implicit Returns:
1260 * *prev The address of the next pointer in the
1261 * lock previous to the overlapping lock;
1262 * this is generally used to relink the
1263 * lock list, avoiding a second iteration.
1264 * *overlap The pointer to the overlapping lock
1265 * itself; this is used to return data in
1266 * the check == OTHERS case, and for the
1267 * caller to modify the overlapping lock,
1268 * in the check == SELF case
1269 *
1270 * Note: This returns only the FIRST overlapping lock. There may be
1271 * more than one. lf_getlock will return the first blocking lock,
1272 * while lf_setlock will iterate over all overlapping locks to
1273 *
1274 * The check parameter can be SELF, meaning we are looking for
1275 * overlapping locks owned by us, or it can be OTHERS, meaning
1276 * we are looking for overlapping locks owned by someone else so
1277 * we can report a blocking lock on an F_GETLK request.
1278 *
1279 * The value of *overlap and *prev are modified, even if there is
1280 * no overlapping lock found; always check the return code.
1281 */
1282 static overlap_t
lf_findoverlap(struct lockf * lf,struct lockf * lock,int type,struct lockf *** prev,struct lockf ** overlap)1283 lf_findoverlap(struct lockf *lf, struct lockf *lock, int type,
1284 struct lockf ***prev, struct lockf **overlap)
1285 {
1286 int found_self = 0;
1287
1288 *overlap = lf;
1289 if (lf == NOLOCKF) {
1290 return 0;
1291 }
1292 #ifdef LOCKF_DEBUGGING
1293 if (LOCKF_DEBUGP(LF_DBG_LIST)) {
1294 lf_print("lf_findoverlap: looking for overlap in", lock);
1295 }
1296 #endif /* LOCKF_DEBUGGING */
1297 const off_t start = lock->lf_start;
1298 const off_t end = LF_END(lock);
1299 while (lf != NOLOCKF) {
1300 if (((type & SELF) && lf->lf_id != lock->lf_id) ||
1301 ((type & OTHERS) && lf->lf_id == lock->lf_id)) {
1302 /*
1303 * Locks belonging to one process are adjacent on the
1304 * list, so if we've found any locks belonging to us,
1305 * and we're now seeing something else, then we've
1306 * examined all "self" locks. Note that bailing out
1307 * here is quite important; for coalescing, we assume
1308 * numerically adjacent locks from the same owner to
1309 * be adjacent on the list.
1310 */
1311 if ((type & SELF) && found_self) {
1312 return OVERLAP_NONE;
1313 }
1314
1315 *prev = &lf->lf_next;
1316 *overlap = lf = lf->lf_next;
1317 continue;
1318 }
1319
1320 if ((type & SELF)) {
1321 found_self = 1;
1322 }
1323
1324 #ifdef LOCKF_DEBUGGING
1325 if (LOCKF_DEBUGP(LF_DBG_LIST)) {
1326 lf_print("\tchecking", lf);
1327 }
1328 #endif /* LOCKF_DEBUGGING */
1329 /*
1330 * OK, check for overlap
1331 */
1332 const off_t lfstart = lf->lf_start;
1333 const off_t lfend = LF_END(lf);
1334
1335 if ((start > lfend) || (lfstart > end)) {
1336 /* Case 0 */
1337 LOCKF_DEBUG(LF_DBG_LIST, "no overlap\n");
1338
1339 /*
1340 * NOTE: assumes that locks for the same process are
1341 * nonintersecting and ordered.
1342 */
1343 if ((type & SELF) && lfstart > end) {
1344 return OVERLAP_NONE;
1345 }
1346 *prev = &lf->lf_next;
1347 *overlap = lf = lf->lf_next;
1348 continue;
1349 }
1350 if ((lfstart == start) && (lfend == end)) {
1351 LOCKF_DEBUG(LF_DBG_LIST, "overlap == lock\n");
1352 return OVERLAP_EQUALS_LOCK;
1353 }
1354 if ((lfstart <= start) && (lfend >= end)) {
1355 LOCKF_DEBUG(LF_DBG_LIST, "overlap contains lock\n");
1356 return OVERLAP_CONTAINS_LOCK;
1357 }
1358 if ((start <= lfstart) && (end >= lfend)) {
1359 LOCKF_DEBUG(LF_DBG_LIST, "lock contains overlap\n");
1360 return OVERLAP_CONTAINED_BY_LOCK;
1361 }
1362 if ((lfstart < start) && (lfend >= start)) {
1363 LOCKF_DEBUG(LF_DBG_LIST, "overlap starts before lock\n");
1364 return OVERLAP_STARTS_BEFORE_LOCK;
1365 }
1366 if ((lfstart > start) && (lfend > end)) {
1367 LOCKF_DEBUG(LF_DBG_LIST, "overlap ends after lock\n");
1368 return OVERLAP_ENDS_AFTER_LOCK;
1369 }
1370 panic("lf_findoverlap: default");
1371 }
1372 return OVERLAP_NONE;
1373 }
1374
1375
/*
 * lf_split
 *
 * Description:	Split a lock and a contained region into two or three locks
 *		as necessary.
 *
 * Parameters:	lock1			Lock to split
 *		lock2			Overlapping lock region requiring the
 *					split (upgrade/downgrade/unlock)
 *
 * Returns:	0			Success
 *		ENOLCK			No memory for new lock
 *
 * Implicit Returns:
 *		*lock1			Modified original lock
 *		*lock2			Overlapping lock (inserted into list)
 *		(new lock)		Potential new lock inserted into list
 *					if split results in 3 locks
 *
 * Notes:	This operation can only fail if the split would result in three
 *		locks, and there is insufficient memory to allocate the third
 *		lock; in that case, neither of the locks will be modified.
 *
 *		NOTE(review): the allocation below uses Z_NOFAIL, so the
 *		ENOLCK path appears unreachable in this version — confirm
 *		whether the Returns documentation above is stale.
 */
static int
lf_split(struct lockf *lock1, struct lockf *lock2)
{
	struct lockf *splitlock;

#ifdef LOCKF_DEBUGGING
	if (LOCKF_DEBUGP(LF_DBG_LIST)) {
		lf_print("lf_split", lock1);
		lf_print("splitting from", lock2);
	}
#endif /* LOCKF_DEBUGGING */
	/*
	 * Check to see if splitting into only two pieces.
	 */
	if (lock1->lf_start == lock2->lf_start) {
		/* lock2 covers the front of lock1: trim lock1's front. */
		assert(LF_END(lock2) < OFF_MAX);
		lock1->lf_start = LF_END(lock2) + 1;
		lock2->lf_next = lock1;
		return 0;
	}
	if (LF_END(lock1) == LF_END(lock2)) {
		/* lock2 covers the tail of lock1: trim lock1's tail. */
		assert(lock2->lf_start > 0);
		lock1->lf_end = lock2->lf_start - 1;
		lock2->lf_next = lock1->lf_next;
		lock1->lf_next = lock2;
		return 0;
	}
	/*
	 * Make a new lock consisting of the last part of
	 * the encompassing lock
	 */
	splitlock = zalloc_flags(KT_LOCKF, Z_WAITOK | Z_NOFAIL);
	bcopy(lock1, splitlock, sizeof *splitlock);
	assert(LF_END(lock2) < OFF_MAX);
	splitlock->lf_start = LF_END(lock2) + 1;
	/* The copied block-queue head must not alias lock1's waiters. */
	TAILQ_INIT(&splitlock->lf_blkhd);
	assert(lock2->lf_start > 0);
	lock1->lf_end = lock2->lf_start - 1;
	/*
	 * OK, now link it in: lock1, then lock2, then the new tail piece.
	 */
	splitlock->lf_next = lock1->lf_next;
	lock2->lf_next = splitlock;
	lock1->lf_next = lock2;

	return 0;
}
1446
1447
/*
 * lf_wakelock
 *
 * Wakeup a blocklist in the case of a downgrade or unlock, since others
 * waiting on the lock may now be able to acquire it.
 *
 * Parameters:	listhead		Lock list head on which waiters may
 *					have pending locks
 *		force_all		TRUE to wake every waiter even if the
 *					lock is marked F_WAKE1_SAFE
 *
 * Returns:	<void>
 *
 * Notes:	This function iterates a list of locks and wakes all waiters,
 *		rather than only waiters for the contended regions.  Because
 *		of this, for heavily contended files, this can result in a
 *		"thundering herd" situation.  Refactoring the code could make
 *		this operation more efficient, if heavy contention ever results
 *		in a real-world performance problem.
 */
static void
lf_wakelock(struct lockf *listhead, boolean_t force_all)
{
	struct lockf *wakelock;
	boolean_t wake_all = TRUE;

	/* F_WAKE1_SAFE lets us wake a single waiter and hand off the rest. */
	if (force_all == FALSE && (listhead->lf_flags & F_WAKE1_SAFE)) {
		wake_all = FALSE;
	}

	while (!TAILQ_EMPTY(&listhead->lf_blkhd)) {
		wakelock = TAILQ_FIRST(&listhead->lf_blkhd);
		TAILQ_REMOVE(&listhead->lf_blkhd, wakelock, lf_block);

		/* The waiter is no longer blocked behind any lock. */
		wakelock->lf_next = NOLOCKF;
#ifdef LOCKF_DEBUGGING
		if (LOCKF_DEBUGP(LF_DBG_LOCKOP)) {
			lf_print("lf_wakelock: awakening", wakelock);
		}
#endif /* LOCKF_DEBUGGING */
		if (wake_all == FALSE) {
			/*
			 * If there are items on the list head block list,
			 * move them to the wakelock list instead, and then
			 * correct their lf_next pointers.
			 */
			if (!TAILQ_EMPTY(&listhead->lf_blkhd)) {
				TAILQ_CONCAT(&wakelock->lf_blkhd, &listhead->lf_blkhd, lf_block);

				struct lockf *tlock;

				TAILQ_FOREACH(tlock, &wakelock->lf_blkhd, lf_block) {
					if (TAILQ_NEXT(tlock, lf_block) == tlock) {
						/* See rdar://10887303 */
						panic("cycle in wakelock list");
					}
					/* Remaining waiters now block on the woken lock. */
					tlock->lf_next = wakelock;
				}
			}
		}
		wakeup(wakelock);

		/* Wake-one mode: the single hand-off above is sufficient. */
		if (wake_all == FALSE) {
			break;
		}
	}
}
1513
1514
1515 #ifdef LOCKF_DEBUGGING
1516 #define GET_LF_OWNER_PID(lf) (proc_pid((lf)->lf_owner))
1517
/*
 * lf_print DEBUG
 *
 * Print out a lock; lock information is prefixed by the string in 'tag'
 *
 * Parameters:	tag			A string tag for debugging
 *		lock			The lock whose information should be
 *					displayed
 *
 * Returns:	<void>
 */
void
lf_print(const char *tag, struct lockf *lock)
{
	printf("%s: lock %p for ", tag, (void *)lock);
	/* Identify the lock's owner by flavor: POSIX, OFD, or opaque id. */
	if (lock->lf_flags & F_POSIX) {
		printf("proc %p (owner %d)",
		    lock->lf_id, GET_LF_OWNER_PID(lock));
	} else if (lock->lf_flags & F_OFD_LOCK) {
		printf("fg %p (owner %d)",
		    lock->lf_id, GET_LF_OWNER_PID(lock));
	} else {
		printf("id %p", (void *)lock->lf_id);
	}
	/* Include the vnode pointer only when one is attached. */
	if (lock->lf_vnode != 0) {
		printf(" in vno %p, %s, start 0x%016llx, end 0x%016llx",
		    lock->lf_vnode,
		    lock->lf_type == F_RDLCK ? "shared" :
		    lock->lf_type == F_WRLCK ? "exclusive" :
		    lock->lf_type == F_UNLCK ? "unlock" : "unknown",
		    (uint64_t)lock->lf_start, (uint64_t)lock->lf_end);
	} else {
		printf(" %s, start 0x%016llx, end 0x%016llx",
		    lock->lf_type == F_RDLCK ? "shared" :
		    lock->lf_type == F_WRLCK ? "exclusive" :
		    lock->lf_type == F_UNLCK ? "unlock" : "unknown",
		    (uint64_t)lock->lf_start, (uint64_t)lock->lf_end);
	}
	/* Show the first waiter, if any locks are blocked on this one. */
	if (!TAILQ_EMPTY(&lock->lf_blkhd)) {
		printf(" block %p\n", (void *)TAILQ_FIRST(&lock->lf_blkhd));
	} else {
		printf("\n");
	}
}
1562
1563
/*
 * lf_printlist DEBUG
 *
 * Print out a lock list for the vnode associated with 'lock'; lock information
 * is prefixed by the string in 'tag'
 *
 * Parameters:	tag			A string tag for debugging
 *		lock			The lock whose vnode's lock list should
 *					be displayed
 *
 * Returns:	<void>
 */
void
lf_printlist(const char *tag, struct lockf *lock)
{
	struct lockf *lf, *blk;

	/* Without a vnode there is no lock list to walk. */
	if (lock->lf_vnode == 0) {
		return;
	}

	printf("%s: Lock list for vno %p:\n",
	    tag, lock->lf_vnode);
	/* Outer loop: every granted lock on the vnode. */
	for (lf = lock->lf_vnode->v_lockf; lf; lf = lf->lf_next) {
		printf("\tlock %p for ", (void *)lf);
		if (lf->lf_flags & F_POSIX) {
			printf("proc %p (owner %d)",
			    lf->lf_id, GET_LF_OWNER_PID(lf));
		} else if (lf->lf_flags & F_OFD_LOCK) {
			printf("fg %p (owner %d)",
			    lf->lf_id, GET_LF_OWNER_PID(lf));
		} else {
			printf("id %p", (void *)lf->lf_id);
		}
		printf(", %s, start 0x%016llx, end 0x%016llx",
		    lf->lf_type == F_RDLCK ? "shared" :
		    lf->lf_type == F_WRLCK ? "exclusive" :
		    lf->lf_type == F_UNLCK ? "unlock" :
		    "unknown", (uint64_t)lf->lf_start, (uint64_t)lf->lf_end);
		/* Inner loop: every request blocked behind this lock. */
		TAILQ_FOREACH(blk, &lf->lf_blkhd, lf_block) {
			printf("\n\t\tlock request %p for ", (void *)blk);
			if (blk->lf_flags & F_POSIX) {
				printf("proc %p (owner %d)",
				    blk->lf_id, GET_LF_OWNER_PID(blk));
			} else if (blk->lf_flags & F_OFD_LOCK) {
				printf("fg %p (owner %d)",
				    blk->lf_id, GET_LF_OWNER_PID(blk));
			} else {
				printf("id %p", (void *)blk->lf_id);
			}
			printf(", %s, start 0x%016llx, end 0x%016llx",
			    blk->lf_type == F_RDLCK ? "shared" :
			    blk->lf_type == F_WRLCK ? "exclusive" :
			    blk->lf_type == F_UNLCK ? "unlock" :
			    "unknown", (uint64_t)blk->lf_start,
			    (uint64_t)blk->lf_end);
			/* A blocked request must not itself have waiters. */
			if (!TAILQ_EMPTY(&blk->lf_blkhd)) {
				panic("lf_printlist: bad list");
			}
		}
		printf("\n");
	}
}
1627 #endif /* LOCKF_DEBUGGING */
1628
1629 #if IMPORTANCE_INHERITANCE
1630
1631 /*
1632 * lf_hold_assertion
1633 *
1634 * Call task importance hold assertion on the owner of the lock.
1635 *
1636 * Parameters: block_task Owner of the lock blocking
1637 * current thread.
1638 *
1639 * block lock on which the current thread
1640 * is blocking on.
1641 *
1642 * Returns: <void>
1643 *
1644 * Notes: The task reference on block_task is not needed to be hold since
1645 * the current thread has vnode lock and block_task has a file
1646 * lock, thus removing file lock in exit requires block_task to
1647 * grab the vnode lock.
1648 */
1649 static void
lf_hold_assertion(task_t block_task,struct lockf * block)1650 lf_hold_assertion(task_t block_task, struct lockf *block)
1651 {
1652 if (task_importance_hold_file_lock_assertion(block_task, 1) == 0) {
1653 block->lf_boosted = LF_BOOSTED;
1654 LOCKF_DEBUG(LF_DBG_IMPINH,
1655 "lf: importance hold file lock assert on pid %d lock %p\n",
1656 proc_pid(block->lf_owner), block);
1657 }
1658 }
1659
1660
/*
 * lf_jump_to_queue_head
 *
 * Jump the lock from the tail of the block queue to the head of
 * the queue.
 *
 * Parameters:	block			lockf struct containing the
 *					block queue.
 *		lock			lockf struct to be jumped to the
 *					front.
 *
 * Returns:	<void>
 */
static void
lf_jump_to_queue_head(struct lockf *block, struct lockf *lock)
{
	/* Move the lock to the head of the block queue. */
	TAILQ_REMOVE(&block->lf_blkhd, lock, lf_block);
	TAILQ_INSERT_HEAD(&block->lf_blkhd, lock, lf_block);
}
1681
1682
1683 /*
1684 * lf_drop_assertion
1685 *
1686 * Drops the task hold assertion.
1687 *
1688 * Parameters: block lockf struct holding the assertion.
1689 *
1690 * Returns: <void>
1691 */
1692 static void
lf_drop_assertion(struct lockf * block)1693 lf_drop_assertion(struct lockf *block)
1694 {
1695 LOCKF_DEBUG(LF_DBG_IMPINH, "lf: %d: dropping assertion for lock %p\n",
1696 proc_pid(block->lf_owner), block);
1697
1698 task_t current_task = proc_task(block->lf_owner);
1699 task_importance_drop_file_lock_assertion(current_task, 1);
1700 block->lf_boosted = LF_NOT_BOOSTED;
1701 }
1702
/*
 * lf_adjust_assertion
 *
 * Adjusts importance assertion of file lock. Goes through
 * all the blocking locks and checks if the file lock needs
 * to be boosted anymore.
 *
 * Parameters:	block			lockf structure which needs to be adjusted.
 *
 * Returns:	<void>
 */
static void
lf_adjust_assertion(struct lockf *block)
{
	boolean_t drop_boost = TRUE;
	struct lockf *next;

	/* Return if the lock is not boosted */
	if (block->lf_boosted == LF_NOT_BOOSTED) {
		return;
	}

	/* Scan the waiters for any that still justify keeping the boost. */
	TAILQ_FOREACH(next, &block->lf_blkhd, lf_block) {
		/*
		 * Check if block and next are same type of locks: both
		 * POSIX, or both OFD with distinct, known owners.
		 */
		if (((block->lf_flags & next->lf_flags & F_POSIX) != 0) ||
		    ((block->lf_flags & next->lf_flags & F_OFD_LOCK) &&
		    (block->lf_owner != next->lf_owner) &&
		    (NULL != block->lf_owner && NULL != next->lf_owner))) {
			/* Check if next would be boosting block */
			if (task_is_importance_donor(proc_task(next->lf_owner)) &&
			    task_is_importance_receiver_type(proc_task(block->lf_owner))) {
				/* Found a lock boosting block */
				drop_boost = FALSE;
				break;
			}
		}
	}

	/* No remaining waiter donates importance; release the boost. */
	if (drop_boost) {
		lf_drop_assertion(block);
	}
}
1745
1746 static void
lf_boost_blocking_proc(struct lockf * lock,struct lockf * block)1747 lf_boost_blocking_proc(struct lockf *lock, struct lockf *block)
1748 {
1749 task_t ltask = proc_task(lock->lf_owner);
1750 task_t btask = proc_task(block->lf_owner);
1751
1752 /*
1753 * Check if ltask can donate importance. The
1754 * check of imp_donor bit is done without holding
1755 * any lock. The value may change after you read it,
1756 * but it is ok to boost a task while someone else is
1757 * unboosting you.
1758 *
1759 * TODO: Support live inheritance on file locks.
1760 */
1761 if (task_is_importance_donor(ltask)) {
1762 LOCKF_DEBUG(LF_DBG_IMPINH,
1763 "lf: %d: attempt to boost pid %d that holds lock %p\n",
1764 proc_pid(lock->lf_owner), proc_pid(block->lf_owner), block);
1765
1766 if (block->lf_boosted != LF_BOOSTED &&
1767 task_is_importance_receiver_type(btask)) {
1768 lf_hold_assertion(btask, block);
1769 }
1770 lf_jump_to_queue_head(block, lock);
1771 }
1772 }
1773 #endif /* IMPORTANCE_INHERITANCE */
1774