[XFS] Reduction global superblock lock contention near ENOSPC.
authorDavid Chinner <dgc@sgi.com>
Sat, 10 Feb 2007 07:35:09 +0000 (18:35 +1100)
committerTim Shimmin <tes@sgi.com>
Sat, 10 Feb 2007 07:35:09 +0000 (18:35 +1100)
The existing per-cpu superblock counter code uses the global superblock
spin lock when we approach ENOSPC for global synchronisation. On larger
machines than this code was originally tested on this can still get
catastrophic spinlock contention due increasing rebalance frequency near
ENOSPC.

By introducing a sleeping lock that is used to serialise balances and
modifications near ENOSPC we prevent contention from needlessly from
wasting the CPU time of potentially hundreds of CPUs.

To reduce the number of balances occuring, we separate the need rebalance
case from the slow allocate case. Now, a counter running dry will trigger
a rebalance during which counters are disabled. Any thread that sees a
disabled counter enters a different path where it waits on the new mutex.
When it gets the new mutex, it checks if the counter is disabled. If the
counter is disabled, then we _know_ that we have to use the global counter
and lock and it is safe to do so immediately. Otherwise, we drop the mutex
and go back to trying the per-cpu counters which we know were re-enabled.

SGI-PV: 952227
SGI-Modid: xfs-linux-melb:xfs-kern:27612a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
Signed-off-by: Tim Shimmin <tes@sgi.com>
fs/xfs/xfs_mount.c
fs/xfs/xfs_mount.h

index 397730f570b9719e380e2049edd5511c1ba457fd..37c612ce3d051d33b8f32fa185d8ce6477a6c5e1 100644 (file)
@@ -52,21 +52,19 @@ STATIC void xfs_unmountfs_wait(xfs_mount_t *);
 
 #ifdef HAVE_PERCPU_SB
 STATIC void    xfs_icsb_destroy_counters(xfs_mount_t *);
-STATIC void    xfs_icsb_balance_counter(xfs_mount_t *, xfs_sb_field_t, int);
+STATIC void    xfs_icsb_balance_counter(xfs_mount_t *, xfs_sb_field_t, int,
+int);
 STATIC void    xfs_icsb_sync_counters(xfs_mount_t *);
 STATIC int     xfs_icsb_modify_counters(xfs_mount_t *, xfs_sb_field_t,
                                                int, int);
-STATIC int     xfs_icsb_modify_counters_locked(xfs_mount_t *, xfs_sb_field_t,
-                                               int, int);
 STATIC int     xfs_icsb_disable_counter(xfs_mount_t *, xfs_sb_field_t);
 
 #else
 
 #define xfs_icsb_destroy_counters(mp)                  do { } while (0)
-#define xfs_icsb_balance_counter(mp, a, b)             do { } while (0)
+#define xfs_icsb_balance_counter(mp, a, b, c)          do { } while (0)
 #define xfs_icsb_sync_counters(mp)                     do { } while (0)
 #define xfs_icsb_modify_counters(mp, a, b, c)          do { } while (0)
-#define xfs_icsb_modify_counters_locked(mp, a, b, c)   do { } while (0)
 
 #endif
 
@@ -545,9 +543,11 @@ xfs_readsb(xfs_mount_t *mp, int flags)
                ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
        }
 
-       xfs_icsb_balance_counter(mp, XFS_SBS_ICOUNT, 0);
-       xfs_icsb_balance_counter(mp, XFS_SBS_IFREE, 0);
-       xfs_icsb_balance_counter(mp, XFS_SBS_FDBLOCKS, 0);
+       mutex_lock(&mp->m_icsb_mutex);
+       xfs_icsb_balance_counter(mp, XFS_SBS_ICOUNT, 0, 0);
+       xfs_icsb_balance_counter(mp, XFS_SBS_IFREE, 0, 0);
+       xfs_icsb_balance_counter(mp, XFS_SBS_FDBLOCKS, 0, 0);
+       mutex_unlock(&mp->m_icsb_mutex);
 
        mp->m_sb_bp = bp;
        xfs_buf_relse(bp);
@@ -1485,9 +1485,11 @@ xfs_mod_incore_sb_batch(xfs_mount_t *mp, xfs_mod_sb_t *msb, uint nmsb, int rsvd)
                case XFS_SBS_IFREE:
                case XFS_SBS_FDBLOCKS:
                        if (!(mp->m_flags & XFS_MOUNT_NO_PERCPU_SB)) {
-                               status = xfs_icsb_modify_counters_locked(mp,
+                               XFS_SB_UNLOCK(mp, s);
+                               status = xfs_icsb_modify_counters(mp,
                                                        msbp->msb_field,
                                                        msbp->msb_delta, rsvd);
+                               s = XFS_SB_LOCK(mp);
                                break;
                        }
                        /* FALLTHROUGH */
@@ -1521,11 +1523,12 @@ xfs_mod_incore_sb_batch(xfs_mount_t *mp, xfs_mod_sb_t *msb, uint nmsb, int rsvd)
                        case XFS_SBS_IFREE:
                        case XFS_SBS_FDBLOCKS:
                                if (!(mp->m_flags & XFS_MOUNT_NO_PERCPU_SB)) {
-                                       status =
-                                           xfs_icsb_modify_counters_locked(mp,
+                                       XFS_SB_UNLOCK(mp, s);
+                                       status = xfs_icsb_modify_counters(mp,
                                                        msbp->msb_field,
                                                        -(msbp->msb_delta),
                                                        rsvd);
+                                       s = XFS_SB_LOCK(mp);
                                        break;
                                }
                                /* FALLTHROUGH */
@@ -1733,14 +1736,17 @@ xfs_icsb_cpu_notify(
                memset(cntp, 0, sizeof(xfs_icsb_cnts_t));
                break;
        case CPU_ONLINE:
-               xfs_icsb_balance_counter(mp, XFS_SBS_ICOUNT, 0);
-               xfs_icsb_balance_counter(mp, XFS_SBS_IFREE, 0);
-               xfs_icsb_balance_counter(mp, XFS_SBS_FDBLOCKS, 0);
+               mutex_lock(&mp->m_icsb_mutex);
+               xfs_icsb_balance_counter(mp, XFS_SBS_ICOUNT, 0, 0);
+               xfs_icsb_balance_counter(mp, XFS_SBS_IFREE, 0, 0);
+               xfs_icsb_balance_counter(mp, XFS_SBS_FDBLOCKS, 0, 0);
+               mutex_unlock(&mp->m_icsb_mutex);
                break;
        case CPU_DEAD:
                /* Disable all the counters, then fold the dead cpu's
                 * count into the total on the global superblock and
                 * re-enable the counters. */
+               mutex_lock(&mp->m_icsb_mutex);
                s = XFS_SB_LOCK(mp);
                xfs_icsb_disable_counter(mp, XFS_SBS_ICOUNT);
                xfs_icsb_disable_counter(mp, XFS_SBS_IFREE);
@@ -1752,10 +1758,14 @@ xfs_icsb_cpu_notify(
 
                memset(cntp, 0, sizeof(xfs_icsb_cnts_t));
 
-               xfs_icsb_balance_counter(mp, XFS_SBS_ICOUNT, XFS_ICSB_SB_LOCKED);
-               xfs_icsb_balance_counter(mp, XFS_SBS_IFREE, XFS_ICSB_SB_LOCKED);
-               xfs_icsb_balance_counter(mp, XFS_SBS_FDBLOCKS, XFS_ICSB_SB_LOCKED);
+               xfs_icsb_balance_counter(mp, XFS_SBS_ICOUNT,
+                                        XFS_ICSB_SB_LOCKED, 0);
+               xfs_icsb_balance_counter(mp, XFS_SBS_IFREE,
+                                        XFS_ICSB_SB_LOCKED, 0);
+               xfs_icsb_balance_counter(mp, XFS_SBS_FDBLOCKS,
+                                        XFS_ICSB_SB_LOCKED, 0);
                XFS_SB_UNLOCK(mp, s);
+               mutex_unlock(&mp->m_icsb_mutex);
                break;
        }
 
@@ -1784,6 +1794,9 @@ xfs_icsb_init_counters(
                cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i);
                memset(cntp, 0, sizeof(xfs_icsb_cnts_t));
        }
+
+       mutex_init(&mp->m_icsb_mutex);
+
        /*
         * start with all counters disabled so that the
         * initial balance kicks us off correctly
@@ -1888,6 +1901,17 @@ xfs_icsb_disable_counter(
 
        ASSERT((field >= XFS_SBS_ICOUNT) && (field <= XFS_SBS_FDBLOCKS));
 
+       /*
+        * If we are already disabled, then there is nothing to do
+        * here. We check before locking all the counters to avoid
+        * the expensive lock operation when being called in the
+        * slow path and the counter is already disabled. This is
+        * safe because the only time we set or clear this state is under
+        * the m_icsb_mutex.
+        */
+       if (xfs_icsb_counter_disabled(mp, field))
+               return 0;
+
        xfs_icsb_lock_all_counters(mp);
        if (!test_and_set_bit(field, &mp->m_icsb_counters)) {
                /* drain back to superblock */
@@ -1997,24 +2021,33 @@ xfs_icsb_sync_counters_lazy(
 /*
  * Balance and enable/disable counters as necessary.
  *
- * Thresholds for re-enabling counters are somewhat magic.
- * inode counts are chosen to be the same number as single
- * on disk allocation chunk per CPU, and free blocks is
- * something far enough zero that we aren't going thrash
- * when we get near ENOSPC.
+ * Thresholds for re-enabling counters are somewhat magic.  inode counts are
+ * chosen to be the same number as single on disk allocation chunk per CPU, and
+ * free blocks is something far enough zero that we aren't going thrash when we
+ * get near ENOSPC. We also need to supply a minimum we require per cpu to
+ * prevent looping endlessly when xfs_alloc_space asks for more than will
+ * be distributed to a single CPU but each CPU has enough blocks to be
+ * reenabled.
+ *
+ * Note that we can be called when counters are already disabled.
+ * xfs_icsb_disable_counter() optimises the counter locking in this case to
+ * prevent locking every per-cpu counter needlessly.
  */
-#define XFS_ICSB_INO_CNTR_REENABLE     64
+
+#define XFS_ICSB_INO_CNTR_REENABLE     (uint64_t)64
 #define XFS_ICSB_FDBLK_CNTR_REENABLE(mp) \
-               (512 + XFS_ALLOC_SET_ASIDE(mp))
+               (uint64_t)(512 + XFS_ALLOC_SET_ASIDE(mp))
 STATIC void
 xfs_icsb_balance_counter(
        xfs_mount_t     *mp,
        xfs_sb_field_t  field,
-       int             flags)
+       int             flags,
+       int             min_per_cpu)
 {
        uint64_t        count, resid;
        int             weight = num_online_cpus();
        int             s;
+       uint64_t        min = (uint64_t)min_per_cpu;
 
        if (!(flags & XFS_ICSB_SB_LOCKED))
                s = XFS_SB_LOCK(mp);
@@ -2027,19 +2060,19 @@ xfs_icsb_balance_counter(
        case XFS_SBS_ICOUNT:
                count = mp->m_sb.sb_icount;
                resid = do_div(count, weight);
-               if (count < XFS_ICSB_INO_CNTR_REENABLE)
+               if (count < max(min, XFS_ICSB_INO_CNTR_REENABLE))
                        goto out;
                break;
        case XFS_SBS_IFREE:
                count = mp->m_sb.sb_ifree;
                resid = do_div(count, weight);
-               if (count < XFS_ICSB_INO_CNTR_REENABLE)
+               if (count < max(min, XFS_ICSB_INO_CNTR_REENABLE))
                        goto out;
                break;
        case XFS_SBS_FDBLOCKS:
                count = mp->m_sb.sb_fdblocks;
                resid = do_div(count, weight);
-               if (count < XFS_ICSB_FDBLK_CNTR_REENABLE(mp))
+               if (count < max(min, XFS_ICSB_FDBLK_CNTR_REENABLE(mp)))
                        goto out;
                break;
        default:
@@ -2054,32 +2087,39 @@ out:
                XFS_SB_UNLOCK(mp, s);
 }
 
-STATIC int
-xfs_icsb_modify_counters_int(
+int
+xfs_icsb_modify_counters(
        xfs_mount_t     *mp,
        xfs_sb_field_t  field,
        int             delta,
-       int             rsvd,
-       int             flags)
+       int             rsvd)
 {
        xfs_icsb_cnts_t *icsbp;
        long long       lcounter;       /* long counter for 64 bit fields */
-       int             cpu, s, locked = 0;
-       int             ret = 0, balance_done = 0;
+       int             cpu, ret = 0, s;
 
+       might_sleep();
 again:
        cpu = get_cpu();
-       icsbp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, cpu),
-       xfs_icsb_lock_cntr(icsbp);
+       icsbp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, cpu);
+
+       /*
+        * if the counter is disabled, go to slow path
+        */
        if (unlikely(xfs_icsb_counter_disabled(mp, field)))
                goto slow_path;
+       xfs_icsb_lock_cntr(icsbp);
+       if (unlikely(xfs_icsb_counter_disabled(mp, field))) {
+               xfs_icsb_unlock_cntr(icsbp);
+               goto slow_path;
+       }
 
        switch (field) {
        case XFS_SBS_ICOUNT:
                lcounter = icsbp->icsb_icount;
                lcounter += delta;
                if (unlikely(lcounter < 0))
-                       goto slow_path;
+                       goto balance_counter;
                icsbp->icsb_icount = lcounter;
                break;
 
@@ -2087,7 +2127,7 @@ again:
                lcounter = icsbp->icsb_ifree;
                lcounter += delta;
                if (unlikely(lcounter < 0))
-                       goto slow_path;
+                       goto balance_counter;
                icsbp->icsb_ifree = lcounter;
                break;
 
@@ -2097,7 +2137,7 @@ again:
                lcounter = icsbp->icsb_fdblocks - XFS_ALLOC_SET_ASIDE(mp);
                lcounter += delta;
                if (unlikely(lcounter < 0))
-                       goto slow_path;
+                       goto balance_counter;
                icsbp->icsb_fdblocks = lcounter + XFS_ALLOC_SET_ASIDE(mp);
                break;
        default:
@@ -2106,72 +2146,78 @@ again:
        }
        xfs_icsb_unlock_cntr(icsbp);
        put_cpu();
-       if (locked)
-               XFS_SB_UNLOCK(mp, s);
        return 0;
 
-       /*
-        * The slow path needs to be run with the SBLOCK
-        * held so that we prevent other threads from
-        * attempting to run this path at the same time.
-        * this provides exclusion for the balancing code,
-        * and exclusive fallback if the balance does not
-        * provide enough resources to continue in an unlocked
-        * manner.
-        */
 slow_path:
-       xfs_icsb_unlock_cntr(icsbp);
        put_cpu();
 
-       /* need to hold superblock incase we need
-        * to disable a counter */
-       if (!(flags & XFS_ICSB_SB_LOCKED)) {
-               s = XFS_SB_LOCK(mp);
-               locked = 1;
-               flags |= XFS_ICSB_SB_LOCKED;
-       }
-       if (!balance_done) {
-               xfs_icsb_balance_counter(mp, field, flags);
-               balance_done = 1;
+       /*
+        * serialise with a mutex so we don't burn lots of cpu on
+        * the superblock lock. We still need to hold the superblock
+        * lock, however, when we modify the global structures.
+        */
+       mutex_lock(&mp->m_icsb_mutex);
+
+       /*
+        * Now running atomically.
+        *
+        * If the counter is enabled, someone has beaten us to rebalancing.
+        * Drop the lock and try again in the fast path....
+        */
+       if (!(xfs_icsb_counter_disabled(mp, field))) {
+               mutex_unlock(&mp->m_icsb_mutex);
                goto again;
-       } else {
-               /*
-                * we might not have enough on this local
-                * cpu to allocate for a bulk request.
-                * We need to drain this field from all CPUs
-                * and disable the counter fastpath
-                */
-               xfs_icsb_disable_counter(mp, field);
        }
 
+       /*
+        * The counter is currently disabled. Because we are
+        * running atomically here, we know a rebalance cannot
+        * be in progress. Hence we can go straight to operating
+        * on the global superblock. We do not call xfs_mod_incore_sb()
+        * here even though we need to get the SB_LOCK. Doing so
+        * will cause us to re-enter this function and deadlock.
+        * Hence we get the SB_LOCK ourselves and then call
+        * xfs_mod_incore_sb_unlocked() as the unlocked path operates
+        * directly on the global counters.
+        */
+       s = XFS_SB_LOCK(mp);
        ret = xfs_mod_incore_sb_unlocked(mp, field, delta, rsvd);
+       XFS_SB_UNLOCK(mp, s);
 
-       if (locked)
-               XFS_SB_UNLOCK(mp, s);
+       /*
+        * Now that we've modified the global superblock, we
+        * may be able to re-enable the distributed counters
+        * (e.g. lots of space just got freed). After that
+        * we are done.
+        */
+       if (ret != ENOSPC)
+               xfs_icsb_balance_counter(mp, field, 0, 0);
+       mutex_unlock(&mp->m_icsb_mutex);
        return ret;
-}
 
-STATIC int
-xfs_icsb_modify_counters(
-       xfs_mount_t     *mp,
-       xfs_sb_field_t  field,
-       int             delta,
-       int             rsvd)
-{
-       return xfs_icsb_modify_counters_int(mp, field, delta, rsvd, 0);
-}
+balance_counter:
+       xfs_icsb_unlock_cntr(icsbp);
+       put_cpu();
 
-/*
- * Called when superblock is already locked
- */
-STATIC int
-xfs_icsb_modify_counters_locked(
-       xfs_mount_t     *mp,
-       xfs_sb_field_t  field,
-       int             delta,
-       int             rsvd)
-{
-       return xfs_icsb_modify_counters_int(mp, field, delta,
-                                               rsvd, XFS_ICSB_SB_LOCKED);
+       /*
+        * We may have multiple threads here if multiple per-cpu
+        * counters run dry at the same time. This will mean we can
+        * do more balances than strictly necessary but it is not
+        * the common slowpath case.
+        */
+       mutex_lock(&mp->m_icsb_mutex);
+
+       /*
+        * running atomically.
+        *
+        * This will leave the counter in the correct state for future
+        * accesses. After the rebalance, we simply try again and our retry
+        * will either succeed through the fast path or slow path without
+        * another balance operation being required.
+        */
+       xfs_icsb_balance_counter(mp, field, 0, delta);
+       mutex_unlock(&mp->m_icsb_mutex);
+       goto again;
 }
+
 #endif
index e5f396ff9a3d57f45bb3174449f9cfbd752adf19..a2295df61d2e60846a6070d0a4a4cc31c38f1186 100644 (file)
@@ -419,6 +419,7 @@ typedef struct xfs_mount {
        xfs_icsb_cnts_t         *m_sb_cnts;     /* per-cpu superblock counters */
        unsigned long           m_icsb_counters; /* disabled per-cpu counters */
        struct notifier_block   m_icsb_notifier; /* hotplug cpu notifier */
+       struct mutex            m_icsb_mutex;   /* balancer sync lock */
 #endif
 } xfs_mount_t;