futex: update documentation for ordering guarantees
[linux.git] / fs / locks.c
index 92a0f0a52b06522e69b9452693a3f7f52cef5b40..13fc7a6d380ae6648945c8956cc53901de2d0ccc 100644 (file)
 #define IS_POSIX(fl)   (fl->fl_flags & FL_POSIX)
 #define IS_FLOCK(fl)   (fl->fl_flags & FL_FLOCK)
 #define IS_LEASE(fl)   (fl->fl_flags & (FL_LEASE|FL_DELEG))
+#define IS_FILE_PVT(fl)        (fl->fl_flags & FL_FILE_PVT)
 
 static bool lease_breaking(struct file_lock *fl)
 {
@@ -344,48 +345,43 @@ static int assign_type(struct file_lock *fl, long type)
        return 0;
 }
 
-/* Verify a "struct flock" and copy it to a "struct file_lock" as a POSIX
- * style lock.
- */
-static int flock_to_posix_lock(struct file *filp, struct file_lock *fl,
-                              struct flock *l)
+static int flock64_to_posix_lock(struct file *filp, struct file_lock *fl,
+                                struct flock64 *l)
 {
-       off_t start, end;
-
        switch (l->l_whence) {
        case SEEK_SET:
-               start = 0;
+               fl->fl_start = 0;
                break;
        case SEEK_CUR:
-               start = filp->f_pos;
+               fl->fl_start = filp->f_pos;
                break;
        case SEEK_END:
-               start = i_size_read(file_inode(filp));
+               fl->fl_start = i_size_read(file_inode(filp));
                break;
        default:
                return -EINVAL;
        }
+       if (l->l_start > OFFSET_MAX - fl->fl_start)
+               return -EOVERFLOW;
+       fl->fl_start += l->l_start;
+       if (fl->fl_start < 0)
+               return -EINVAL;
 
        /* POSIX-1996 leaves the case l->l_len < 0 undefined;
           POSIX-2001 defines it. */
-       start += l->l_start;
-       if (start < 0)
-               return -EINVAL;
-       fl->fl_end = OFFSET_MAX;
        if (l->l_len > 0) {
-               end = start + l->l_len - 1;
-               fl->fl_end = end;
+               if (l->l_len - 1 > OFFSET_MAX - fl->fl_start)
+                       return -EOVERFLOW;
+               fl->fl_end = fl->fl_start + l->l_len - 1;
+
        } else if (l->l_len < 0) {
-               end = start - 1;
-               fl->fl_end = end;
-               start += l->l_len;
-               if (start < 0)
+               if (fl->fl_start + l->l_len < 0)
                        return -EINVAL;
-       }
-       fl->fl_start = start;   /* we record the absolute position */
-       if (fl->fl_end < fl->fl_start)
-               return -EOVERFLOW;
-       
+               fl->fl_end = fl->fl_start - 1;
+               fl->fl_start += l->l_len;
+       } else
+               fl->fl_end = OFFSET_MAX;
+
        fl->fl_owner = current->files;
        fl->fl_pid = current->tgid;
        fl->fl_file = filp;
@@ -393,55 +389,36 @@ static int flock_to_posix_lock(struct file *filp, struct file_lock *fl,
        fl->fl_ops = NULL;
        fl->fl_lmops = NULL;
 
-       return assign_type(fl, l->l_type);
-}
-
-#if BITS_PER_LONG == 32
-static int flock64_to_posix_lock(struct file *filp, struct file_lock *fl,
-                                struct flock64 *l)
-{
-       loff_t start;
-
-       switch (l->l_whence) {
-       case SEEK_SET:
-               start = 0;
-               break;
-       case SEEK_CUR:
-               start = filp->f_pos;
+       /* Ensure that fl->fl_file has compatible f_mode */
+       switch (l->l_type) {
+       case F_RDLCK:
+               if (!(filp->f_mode & FMODE_READ))
+                       return -EBADF;
                break;
-       case SEEK_END:
-               start = i_size_read(file_inode(filp));
+       case F_WRLCK:
+               if (!(filp->f_mode & FMODE_WRITE))
+                       return -EBADF;
                break;
-       default:
-               return -EINVAL;
        }
 
-       start += l->l_start;
-       if (start < 0)
-               return -EINVAL;
-       fl->fl_end = OFFSET_MAX;
-       if (l->l_len > 0) {
-               fl->fl_end = start + l->l_len - 1;
-       } else if (l->l_len < 0) {
-               fl->fl_end = start - 1;
-               start += l->l_len;
-               if (start < 0)
-                       return -EINVAL;
-       }
-       fl->fl_start = start;   /* we record the absolute position */
-       if (fl->fl_end < fl->fl_start)
-               return -EOVERFLOW;
-       
-       fl->fl_owner = current->files;
-       fl->fl_pid = current->tgid;
-       fl->fl_file = filp;
-       fl->fl_flags = FL_POSIX;
-       fl->fl_ops = NULL;
-       fl->fl_lmops = NULL;
-
        return assign_type(fl, l->l_type);
 }
-#endif
+
+/* Verify a "struct flock" and copy it to a "struct file_lock" as a POSIX
+ * style lock.
+ */
+static int flock_to_posix_lock(struct file *filp, struct file_lock *fl,
+                              struct flock *l)
+{
+       struct flock64 ll = {
+               .l_type = l->l_type,
+               .l_whence = l->l_whence,
+               .l_start = l->l_start,
+               .l_len = l->l_len,
+       };
+
+       return flock64_to_posix_lock(filp, fl, &ll);
+}
 
 /* default lease lock manager operations */
 static void lease_break_callback(struct file_lock *fl)
@@ -511,8 +488,7 @@ static int posix_same_owner(struct file_lock *fl1, struct file_lock *fl2)
 }
 
 /* Must be called with the i_lock held! */
-static inline void
-locks_insert_global_locks(struct file_lock *fl)
+static void locks_insert_global_locks(struct file_lock *fl)
 {
        lg_local_lock(&file_lock_lglock);
        fl->fl_link_cpu = smp_processor_id();
@@ -521,8 +497,7 @@ locks_insert_global_locks(struct file_lock *fl)
 }
 
 /* Must be called with the i_lock held! */
-static inline void
-locks_delete_global_locks(struct file_lock *fl)
+static void locks_delete_global_locks(struct file_lock *fl)
 {
        /*
         * Avoid taking lock if already unhashed. This is safe since this check
@@ -544,14 +519,12 @@ posix_owner_key(struct file_lock *fl)
        return (unsigned long)fl->fl_owner;
 }
 
-static inline void
-locks_insert_global_blocked(struct file_lock *waiter)
+static void locks_insert_global_blocked(struct file_lock *waiter)
 {
        hash_add(blocked_hash, &waiter->fl_link, posix_owner_key(waiter));
 }
 
-static inline void
-locks_delete_global_blocked(struct file_lock *waiter)
+static void locks_delete_global_blocked(struct file_lock *waiter)
 {
        hash_del(&waiter->fl_link);
 }
@@ -581,7 +554,7 @@ static void locks_delete_block(struct file_lock *waiter)
  * it seems like the reasonable thing to do.
  *
  * Must be called with both the i_lock and blocked_lock_lock held. The fl_block
- * list itself is protected by the file_lock_list, but by ensuring that the
+ * list itself is protected by the blocked_lock_lock, but by ensuring that the
  * i_lock is also held on insertions we can avoid taking the blocked_lock_lock
  * in some cases when we see that the fl_block list is empty.
  */
@@ -591,7 +564,7 @@ static void __locks_insert_block(struct file_lock *blocker,
        BUG_ON(!list_empty(&waiter->fl_block));
        waiter->fl_next = blocker;
        list_add_tail(&waiter->fl_block, &blocker->fl_block);
-       if (IS_POSIX(blocker))
+       if (IS_POSIX(blocker) && !IS_FILE_PVT(blocker))
                locks_insert_global_blocked(waiter);
 }
 
@@ -652,15 +625,18 @@ static void locks_insert_lock(struct file_lock **pos, struct file_lock *fl)
        locks_insert_global_locks(fl);
 }
 
-/*
- * Delete a lock and then free it.
- * Wake up processes that are blocked waiting for this lock,
- * notify the FS that the lock has been cleared and
- * finally free the lock.
+/**
+ * locks_unlink_lock - Unlink a lock from all lists without freeing it.
+ * @thisfl_p: pointer that points to the fl_next field of the previous
+ *           inode->i_flock list entry
+ *
+ * Unlink a lock from all lists and free the namespace reference, but don't
+ * free it yet. Wake up processes that are blocked waiting for this lock and
+ * notify the FS that the lock has been cleared.
  *
  * Must be called with the i_lock held!
  */
-static void locks_delete_lock(struct file_lock **thisfl_p)
+static void locks_unlink_lock(struct file_lock **thisfl_p)
 {
        struct file_lock *fl = *thisfl_p;
 
@@ -675,6 +651,18 @@ static void locks_delete_lock(struct file_lock **thisfl_p)
        }
 
        locks_wake_up_blocks(fl);
+}
+
+/*
+ * Unlink a lock from all lists and free it.
+ *
+ * Must be called with i_lock held!
+ */
+static void locks_delete_lock(struct file_lock **thisfl_p)
+{
+       struct file_lock *fl = *thisfl_p;
+
+       locks_unlink_lock(thisfl_p);
        locks_free_lock(fl);
 }
 
@@ -769,8 +757,16 @@ EXPORT_SYMBOL(posix_test_lock);
  * Note: the above assumption may not be true when handling lock
  * requests from a broken NFS client. It may also fail in the presence
  * of tasks (such as posix threads) sharing the same open file table.
- *
  * To handle those cases, we just bail out after a few iterations.
+ *
+ * For FL_FILE_PVT locks, the owner is the filp, not the files_struct.
+ * Because the owner is not even nominally tied to a thread of
+ * execution, the deadlock detection below can't reasonably work well. Just
+ * skip it for those.
+ *
+ * In principle, we could do a more limited deadlock detection on FL_FILE_PVT
+ * locks that just checks for the case where two tasks are attempting to
+ * upgrade from read to write locks on the same inode.
  */
 
 #define MAX_DEADLK_ITERATIONS 10
@@ -793,6 +789,13 @@ static int posix_locks_deadlock(struct file_lock *caller_fl,
 {
        int i = 0;
 
+       /*
+        * This deadlock detector can't reasonably detect deadlocks with
+        * FL_FILE_PVT locks, since they aren't owned by a process, per se.
+        */
+       if (IS_FILE_PVT(caller_fl))
+               return 0;
+
        while ((block_fl = what_owner_is_waiting_for(block_fl))) {
                if (i++ > MAX_DEADLK_ITERATIONS)
                        return 0;
@@ -1152,13 +1155,14 @@ EXPORT_SYMBOL(posix_lock_file_wait);
 
 /**
  * locks_mandatory_locked - Check for an active lock
- * @inode: the file to check
+ * @file: the file to check
  *
  * Searches the inode's list of locks to find any POSIX locks which conflict.
  * This function is called from locks_verify_locked() only.
  */
-int locks_mandatory_locked(struct inode *inode)
+int locks_mandatory_locked(struct file *file)
 {
+       struct inode *inode = file_inode(file);
        fl_owner_t owner = current->files;
        struct file_lock *fl;
 
@@ -1169,7 +1173,7 @@ int locks_mandatory_locked(struct inode *inode)
        for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
                if (!IS_POSIX(fl))
                        continue;
-               if (fl->fl_owner != owner)
+               if (fl->fl_owner != owner && fl->fl_owner != (fl_owner_t)file)
                        break;
        }
        spin_unlock(&inode->i_lock);
@@ -1195,19 +1199,30 @@ int locks_mandatory_area(int read_write, struct inode *inode,
 {
        struct file_lock fl;
        int error;
+       bool sleep = false;
 
        locks_init_lock(&fl);
-       fl.fl_owner = current->files;
        fl.fl_pid = current->tgid;
        fl.fl_file = filp;
        fl.fl_flags = FL_POSIX | FL_ACCESS;
        if (filp && !(filp->f_flags & O_NONBLOCK))
-               fl.fl_flags |= FL_SLEEP;
+               sleep = true;
        fl.fl_type = (read_write == FLOCK_VERIFY_WRITE) ? F_WRLCK : F_RDLCK;
        fl.fl_start = offset;
        fl.fl_end = offset + count - 1;
 
        for (;;) {
+               if (filp) {
+                       fl.fl_owner = (fl_owner_t)filp;
+                       fl.fl_flags &= ~FL_SLEEP;
+                       error = __posix_lock_file(inode, &fl, NULL);
+                       if (!error)
+                               break;
+               }
+
+               if (sleep)
+                       fl.fl_flags |= FL_SLEEP;
+               fl.fl_owner = current->files;
                error = __posix_lock_file(inode, &fl, NULL);
                if (error != FILE_LOCK_DEFERRED)
                        break;
@@ -1472,6 +1487,32 @@ int fcntl_getlease(struct file *filp)
        return type;
 }
 
+/**
+ * check_conflicting_open - see if the given dentry points to a file that has
+ *                         an existing open that would conflict with the
+ *                         desired lease.
+ * @dentry:    dentry to check
+ * @arg:       type of lease that we're trying to acquire
+ *
+ * Check to see if there's an existing open fd on this file that would
+ * conflict with the lease we're trying to set.
+ */
+static int
+check_conflicting_open(const struct dentry *dentry, const long arg)
+{
+       int ret = 0;
+       struct inode *inode = dentry->d_inode;
+
+       if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0))
+               return -EAGAIN;
+
+       if ((arg == F_WRLCK) && ((d_count(dentry) > 1) ||
+           (atomic_read(&inode->i_count) > 1)))
+               ret = -EAGAIN;
+
+       return ret;
+}
+
 static int generic_add_lease(struct file *filp, long arg, struct file_lock **flp)
 {
        struct file_lock *fl, **before, **my_before = NULL, *lease;
@@ -1499,12 +1540,8 @@ static int generic_add_lease(struct file *filp, long arg, struct file_lock **flp
                return -EINVAL;
        }
 
-       error = -EAGAIN;
-       if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0))
-               goto out;
-       if ((arg == F_WRLCK)
-           && ((d_count(dentry) > 1)
-               || (atomic_read(&inode->i_count) > 1)))
+       error = check_conflicting_open(dentry, arg);
+       if (error)
                goto out;
 
        /*
@@ -1549,7 +1586,19 @@ static int generic_add_lease(struct file *filp, long arg, struct file_lock **flp
                goto out;
 
        locks_insert_lock(before, lease);
-       error = 0;
+       /*
+        * The check in break_lease() is lockless. It's possible for another
+        * open to race in after we did the earlier check for a conflicting
+        * open but before the lease was inserted. Check again for a
+        * conflicting open and cancel the lease if there is one.
+        *
+        * We also add a barrier here to ensure that the insertion of the lock
+        * precedes these checks.
+        */
+       smp_mb();
+       error = check_conflicting_open(dentry, arg);
+       if (error)
+               locks_unlink_lock(flp);
 out:
        if (is_deleg)
                mutex_unlock(&inode->i_mutex);
@@ -1842,7 +1891,7 @@ EXPORT_SYMBOL_GPL(vfs_test_lock);
 
 static int posix_lock_to_flock(struct flock *flock, struct file_lock *fl)
 {
-       flock->l_pid = fl->fl_pid;
+       flock->l_pid = IS_FILE_PVT(fl) ? -1 : fl->fl_pid;
 #if BITS_PER_LONG == 32
        /*
         * Make sure we can represent the posix lock via
@@ -1864,7 +1913,7 @@ static int posix_lock_to_flock(struct flock *flock, struct file_lock *fl)
 #if BITS_PER_LONG == 32
 static void posix_lock_to_flock64(struct flock64 *flock, struct file_lock *fl)
 {
-       flock->l_pid = fl->fl_pid;
+       flock->l_pid = IS_FILE_PVT(fl) ? -1 : fl->fl_pid;
        flock->l_start = fl->fl_start;
        flock->l_len = fl->fl_end == OFFSET_MAX ? 0 :
                fl->fl_end - fl->fl_start + 1;
@@ -1876,7 +1925,7 @@ static void posix_lock_to_flock64(struct flock64 *flock, struct file_lock *fl)
 /* Report the first existing lock that would conflict with l.
  * This implements the F_GETLK command of fcntl().
  */
-int fcntl_getlk(struct file *filp, struct flock __user *l)
+int fcntl_getlk(struct file *filp, unsigned int cmd, struct flock __user *l)
 {
        struct file_lock file_lock;
        struct flock flock;
@@ -1893,6 +1942,16 @@ int fcntl_getlk(struct file *filp, struct flock __user *l)
        if (error)
                goto out;
 
+       if (cmd == F_GETLKP) {
+               error = -EINVAL;
+               if (flock.l_pid != 0)
+                       goto out;
+
+               cmd = F_GETLK;
+               file_lock.fl_flags |= FL_FILE_PVT;
+               file_lock.fl_owner = (fl_owner_t)filp;
+       }
+
        error = vfs_test_lock(filp, &file_lock);
        if (error)
                goto out;
@@ -2012,25 +2071,32 @@ again:
        error = flock_to_posix_lock(filp, file_lock, &flock);
        if (error)
                goto out;
-       if (cmd == F_SETLKW) {
-               file_lock->fl_flags |= FL_SLEEP;
-       }
-       
-       error = -EBADF;
-       switch (flock.l_type) {
-       case F_RDLCK:
-               if (!(filp->f_mode & FMODE_READ))
-                       goto out;
-               break;
-       case F_WRLCK:
-               if (!(filp->f_mode & FMODE_WRITE))
+
+       /*
+        * If the cmd is requesting file-private locks, then set the
+        * FL_FILE_PVT flag and override the owner.
+        */
+       switch (cmd) {
+       case F_SETLKP:
+               error = -EINVAL;
+               if (flock.l_pid != 0)
                        goto out;
+
+               cmd = F_SETLK;
+               file_lock->fl_flags |= FL_FILE_PVT;
+               file_lock->fl_owner = (fl_owner_t)filp;
                break;
-       case F_UNLCK:
-               break;
-       default:
+       case F_SETLKPW:
                error = -EINVAL;
-               goto out;
+               if (flock.l_pid != 0)
+                       goto out;
+
+               cmd = F_SETLKW;
+               file_lock->fl_flags |= FL_FILE_PVT;
+               file_lock->fl_owner = (fl_owner_t)filp;
+               /* Fallthrough */
+       case F_SETLKW:
+               file_lock->fl_flags |= FL_SLEEP;
        }
 
        error = do_lock_file_wait(filp, cmd, file_lock);
@@ -2061,7 +2127,7 @@ out:
 /* Report the first existing lock that would conflict with l.
  * This implements the F_GETLK command of fcntl().
  */
-int fcntl_getlk64(struct file *filp, struct flock64 __user *l)
+int fcntl_getlk64(struct file *filp, unsigned int cmd, struct flock64 __user *l)
 {
        struct file_lock file_lock;
        struct flock64 flock;
@@ -2078,6 +2144,16 @@ int fcntl_getlk64(struct file *filp, struct flock64 __user *l)
        if (error)
                goto out;
 
+       if (cmd == F_GETLKP) {
+               error = -EINVAL;
+               if (flock.l_pid != 0)
+                       goto out;
+
+               cmd = F_GETLK64;
+               file_lock.fl_flags |= FL_FILE_PVT;
+               file_lock.fl_owner = (fl_owner_t)filp;
+       }
+
        error = vfs_test_lock(filp, &file_lock);
        if (error)
                goto out;
@@ -2130,25 +2206,32 @@ again:
        error = flock64_to_posix_lock(filp, file_lock, &flock);
        if (error)
                goto out;
-       if (cmd == F_SETLKW64) {
-               file_lock->fl_flags |= FL_SLEEP;
-       }
-       
-       error = -EBADF;
-       switch (flock.l_type) {
-       case F_RDLCK:
-               if (!(filp->f_mode & FMODE_READ))
-                       goto out;
-               break;
-       case F_WRLCK:
-               if (!(filp->f_mode & FMODE_WRITE))
+
+       /*
+        * If the cmd is requesting file-private locks, then set the
+        * FL_FILE_PVT flag and override the owner.
+        */
+       switch (cmd) {
+       case F_SETLKP:
+               error = -EINVAL;
+               if (flock.l_pid != 0)
                        goto out;
+
+               cmd = F_SETLK64;
+               file_lock->fl_flags |= FL_FILE_PVT;
+               file_lock->fl_owner = (fl_owner_t)filp;
                break;
-       case F_UNLCK:
-               break;
-       default:
+       case F_SETLKPW:
                error = -EINVAL;
-               goto out;
+               if (flock.l_pid != 0)
+                       goto out;
+
+               cmd = F_SETLKW64;
+               file_lock->fl_flags |= FL_FILE_PVT;
+               file_lock->fl_owner = (fl_owner_t)filp;
+               /* Fallthrough */
+       case F_SETLKW64:
+               file_lock->fl_flags |= FL_SLEEP;
        }
 
        error = do_lock_file_wait(filp, cmd, file_lock);
@@ -2209,7 +2292,7 @@ EXPORT_SYMBOL(locks_remove_posix);
 /*
  * This function is called on the last close of an open file.
  */
-void locks_remove_flock(struct file *filp)
+void locks_remove_file(struct file *filp)
 {
        struct inode * inode = file_inode(filp);
        struct file_lock *fl;
@@ -2218,6 +2301,8 @@ void locks_remove_flock(struct file *filp)
        if (!inode->i_flock)
                return;
 
+       locks_remove_posix(filp, (fl_owner_t)filp);
+
        if (filp->f_op->flock) {
                struct file_lock fl = {
                        .fl_pid = current->tgid,
@@ -2236,16 +2321,28 @@ void locks_remove_flock(struct file *filp)
 
        while ((fl = *before) != NULL) {
                if (fl->fl_file == filp) {
-                       if (IS_FLOCK(fl)) {
-                               locks_delete_lock(before);
-                               continue;
-                       }
                        if (IS_LEASE(fl)) {
                                lease_modify(before, F_UNLCK);
                                continue;
                        }
-                       /* What? */
-                       BUG();
+
+                       /*
+                        * There's a leftover lock on the list of a type that
+                        * we didn't expect to see. Most likely a classic
+                        * POSIX lock that ended up not getting released
+                        * properly, or that raced onto the list somehow. Log
+                        * some info about it and then just remove it from
+                        * the list.
+                        */
+                       WARN(!IS_FLOCK(fl),
+                               "leftover lock: dev=%u:%u ino=%lu type=%hhd flags=0x%x start=%lld end=%lld\n",
+                               MAJOR(inode->i_sb->s_dev),
+                               MINOR(inode->i_sb->s_dev), inode->i_ino,
+                               fl->fl_type, fl->fl_flags,
+                               fl->fl_start, fl->fl_end);
+
+                       locks_delete_lock(before);
+                       continue;
                }
                before = &fl->fl_next;
        }
@@ -2314,8 +2411,14 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl,
 
        seq_printf(f, "%lld:%s ", id, pfx);
        if (IS_POSIX(fl)) {
-               seq_printf(f, "%6s %s ",
-                            (fl->fl_flags & FL_ACCESS) ? "ACCESS" : "POSIX ",
+               if (fl->fl_flags & FL_ACCESS)
+                       seq_printf(f, "ACCESS");
+               else if (IS_FILE_PVT(fl))
+                       seq_printf(f, "FLPVT ");
+               else
+                       seq_printf(f, "POSIX ");
+
+               seq_printf(f, " %s ",
                             (inode == NULL) ? "*NOINODE*" :
                             mandatory_lock(inode) ? "MANDATORY" : "ADVISORY ");
        } else if (IS_FLOCK(fl)) {
@@ -2385,6 +2488,7 @@ static int locks_show(struct seq_file *f, void *v)
 }
 
 static void *locks_start(struct seq_file *f, loff_t *pos)
+       __acquires(&blocked_lock_lock)
 {
        struct locks_iterator *iter = f->private;
 
@@ -2403,6 +2507,7 @@ static void *locks_next(struct seq_file *f, void *v, loff_t *pos)
 }
 
 static void locks_stop(struct seq_file *f, void *v)
+       __releases(&blocked_lock_lock)
 {
        spin_unlock(&blocked_lock_lock);
        lg_global_unlock(&file_lock_lglock);