mm: only enforce stable page writes if the backing device requires it
author Darrick J. Wong <darrick.wong@oracle.com>
Fri, 22 Feb 2013 00:42:51 +0000 (16:42 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
Fri, 22 Feb 2013 01:22:19 +0000 (17:22 -0800)
Create a helper function to check if a backing device requires stable
page writes and, if so, perform the necessary wait.  Then, make it so
that all points in the memory manager that handle making pages writable
use the helper function.  This should provide stable page write support
to most filesystems, while eliminating unnecessary waiting for devices
that don't require the feature.

Before this patchset, all filesystems would block, regardless of whether
or not it was necessary.  ext3 would wait, but still generate occasional
checksum errors.  The network filesystems were left to do their own
thing, so they'd wait too.

After this patchset, all the disk filesystems except ext3 and btrfs will
wait only if the hardware requires it.  ext3 (if necessary) snapshots
pages instead of blocking, and btrfs provides its own bdi so the mm will
never wait.  Network filesystems haven't been touched, so either they
provide their own stable page guarantees or they don't block at all.
The blocking behavior is back to what it was before 3.0 if you don't
have a disk requiring stable page writes.

Here's the result of using dbench to test latency on ext2:

3.8.0-rc3:
 Operation      Count    AvgLat    MaxLat
 ----------------------------------------
 WriteX        109347     0.028    59.817
 ReadX         347180     0.004     3.391
 Flush          15514    29.828   287.283

Throughput 57.429 MB/sec  4 clients  4 procs  max_latency=287.290 ms

3.8.0-rc3 + patches:
 Operation      Count    AvgLat    MaxLat
 ----------------------------------------
 WriteX        105556     0.029     4.273
 ReadX         335004     0.005     4.112
 Flush          14982    30.540   298.634

Throughput 55.4496 MB/sec  4 clients  4 procs  max_latency=298.650 ms

As you can see, the maximum write latency drops considerably with this
patch enabled.  The other filesystems (ext3/ext4/xfs/btrfs) behave
similarly, but see the cover letter for those results.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Acked-by: Steven Whitehouse <swhiteho@redhat.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Artem Bityutskiy <dedekind1@gmail.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Mark Fasheh <mfasheh@suse.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Eric Van Hensbergen <ericvh@gmail.com>
Cc: Ron Minnich <rminnich@sandia.gov>
Cc: Latchesar Ionkov <lucho@ionkov.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
fs/buffer.c
fs/ext4/inode.c
fs/gfs2/file.c
fs/nilfs2/file.c
include/linux/pagemap.h
mm/filemap.c
mm/page-writeback.c

index 7a75c3e0fd5896b7fc59595fff859c41f23f65b1..2ea9cd44aeae00b5f3e9c895d8e41b3e61017cc3 100644 (file)
@@ -2359,7 +2359,7 @@ int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
        if (unlikely(ret < 0))
                goto out_unlock;
        set_page_dirty(page);
-       wait_on_page_writeback(page);
+       wait_for_stable_page(page);
        return 0;
 out_unlock:
        unlock_page(page);
index cbfe13bf5b2aa3f39b4845fe4fce4f45a02b5f43..cd818d8bb221736b9521ef36bcfd85402811e876 100644 (file)
@@ -4968,7 +4968,7 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
                                            0, len, NULL,
                                            ext4_bh_unmapped)) {
                        /* Wait so that we don't change page under IO */
-                       wait_on_page_writeback(page);
+                       wait_for_stable_page(page);
                        ret = VM_FAULT_LOCKED;
                        goto out;
                }
index 06b7092a3f256eb8dc73bebdfe6574c1970c0fee..2687f50d98cb5a4164bb975d97c05c593fe02587 100644 (file)
@@ -483,7 +483,7 @@ out:
        gfs2_holder_uninit(&gh);
        if (ret == 0) {
                set_page_dirty(page);
-               wait_on_page_writeback(page);
+               wait_for_stable_page(page);
        }
        sb_end_pagefault(inode->i_sb);
        return block_page_mkwrite_return(ret);
index 61946883025ce60727811aed90211a3bb20e456f..bec4af6eab13dc93eb91b73c0c7244bd296759e2 100644 (file)
@@ -126,7 +126,7 @@ static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
        nilfs_transaction_commit(inode->i_sb);
 
  mapped:
-       wait_on_page_writeback(page);
+       wait_for_stable_page(page);
  out:
        sb_end_pagefault(inode->i_sb);
        return block_page_mkwrite_return(ret);
index 6da609d14c159346255e144fe4ab5991e3501046..0e38e13eb2498360b12a0662e994a0d7f9a7af31 100644 (file)
@@ -414,6 +414,7 @@ static inline void wait_on_page_writeback(struct page *page)
 }
 
 extern void end_page_writeback(struct page *page);
+void wait_for_stable_page(struct page *page);
 
 /*
  * Add an arbitrary waiter to a page's wait queue
index 24a7ea583f0cf4039525d824ac047f5f9d7eddb0..c610076c30e137118872f74bec80cfa0592e5f6e 100644 (file)
@@ -1728,6 +1728,7 @@ int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
         * see the dirty page and writeprotect it again.
         */
        set_page_dirty(page);
+       wait_for_stable_page(page);
 out:
        sb_end_pagefault(inode->i_sb);
        return ret;
@@ -2274,7 +2275,7 @@ repeat:
                return NULL;
        }
 found:
-       wait_on_page_writeback(page);
+       wait_for_stable_page(page);
        return page;
 }
 EXPORT_SYMBOL(grab_cache_page_write_begin);
index 66a0024becd9f6c87f5e437c92ce6852657638a9..355d5ee69058cad4246eab083ea462648030ba3a 100644 (file)
@@ -2290,3 +2290,23 @@ int mapping_tagged(struct address_space *mapping, int tag)
        return radix_tree_tagged(&mapping->page_tree, tag);
 }
 EXPORT_SYMBOL(mapping_tagged);
+
+/**
+ * wait_for_stable_page() - wait for writeback to finish, if necessary.
+ * @page:      The page to wait on.
+ *
+ * Waits for writeback on @page only if the backing device of its mapping
+ * requires stable pages; otherwise returns at once.  NOTE(review): mapping
+ * is dereferenced unchecked - confirm callers' pages always have a mapping.
+ */
+void wait_for_stable_page(struct page *page)
+{
+       struct address_space *mapping = page_mapping(page);
+       struct backing_dev_info *bdi = mapping->backing_dev_info;
+
+       if (!bdi_cap_stable_pages_required(bdi))
+               return;
+
+       wait_on_page_writeback(page);
+}
+EXPORT_SYMBOL_GPL(wait_for_stable_page);