dax: support dirty DAX entries in radix tree
author Ross Zwisler <ross.zwisler@linux.intel.com>
Fri, 22 Jan 2016 23:10:40 +0000 (15:10 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
Sat, 23 Jan 2016 01:02:18 +0000 (17:02 -0800)
Add support for tracking dirty DAX entries in the struct address_space
radix tree.  This tree is already used for dirty page writeback, and it
already supports the use of exceptional (non struct page*) entries.

In order to properly track dirty DAX pages we will insert new
exceptional entries into the radix tree that represent dirty DAX PTE or
PMD pages.  These exceptional entries will also contain the writeback
addresses for the PTE or PMD faults that we can use at fsync/msync time.

There are currently two types of exceptional entries (shmem and shadow)
that can be placed into the radix tree, and this adds a third.  We rely
on the fact that only one type of exceptional entry can be found in a
given radix tree based on its usage.  This happens for free with DAX vs
shmem but we explicitly prevent shadow entries from being added to radix
trees for DAX mappings.

The only shadow entries that would be generated for DAX radix trees
would be to track zero page mappings that were created for holes.  These
pages would receive minimal benefit from having shadow entries, and the
choice to have only one type of exceptional entry in a given radix tree
makes the logic simpler both in clear_exceptional_entry() and in the
rest of DAX.

Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: "J. Bruce Fields" <bfields@fieldses.org>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Andreas Dilger <adilger.kernel@dilger.ca>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jan Kara <jack@suse.com>
Cc: Jeff Layton <jlayton@poochiereds.net>
Cc: Matthew Wilcox <willy@linux.intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Matthew Wilcox <matthew.r.wilcox@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
fs/block_dev.c
fs/inode.c
include/linux/dax.h
include/linux/fs.h
include/linux/radix-tree.h
mm/filemap.c
mm/truncate.c
mm/vmscan.c
mm/workingset.c

index ba762ea07f679c9bafd078bb316270a38e9a7d80..60895e500e151f754a7b01dd4d5aa9c6940d4d3a 100644 (file)
@@ -75,7 +75,7 @@ void kill_bdev(struct block_device *bdev)
 {
        struct address_space *mapping = bdev->bd_inode->i_mapping;
 
-       if (mapping->nrpages == 0 && mapping->nrshadows == 0)
+       if (mapping->nrpages == 0 && mapping->nrexceptional == 0)
                return;
 
        invalidate_bh_lrus();
index e491e54d243025999e42fdec674b082a765188fd..1e6dd388ba7fd7ad472f87545add408385b3a2f8 100644 (file)
@@ -495,7 +495,7 @@ void clear_inode(struct inode *inode)
         */
        spin_lock_irq(&inode->i_data.tree_lock);
        BUG_ON(inode->i_data.nrpages);
-       BUG_ON(inode->i_data.nrshadows);
+       BUG_ON(inode->i_data.nrexceptional);
        spin_unlock_irq(&inode->i_data.tree_lock);
        BUG_ON(!list_empty(&inode->i_data.private_list));
        BUG_ON(!(inode->i_state & I_FREEING));
index b415e521528de3d5fc5db598fb588f688f42d307..e9d57f680f5034c5b1182ae8ca77cd82956c42a8 100644 (file)
@@ -36,4 +36,15 @@ static inline bool vma_is_dax(struct vm_area_struct *vma)
 {
        return vma->vm_file && IS_DAX(vma->vm_file->f_mapping->host);
 }
+
+/*
+ * Report whether @mapping belongs to a DAX inode.  A mapping that has no
+ * host inode is never treated as DAX.
+ */
+static inline bool dax_mapping(struct address_space *mapping)
+{
+       struct inode *host = mapping->host;
+
+       return host && IS_DAX(host);
+}
 #endif
index eb73d74ed99262c83f2fa5e51582b2da0344bafa..0d7570320d63c7265c3b590bb9bd0bcdf868432b 100644 (file)
@@ -433,7 +433,8 @@ struct address_space {
        struct rw_semaphore     i_mmap_rwsem;   /* protect tree, count, list */
        /* Protected by tree_lock together with the radix tree */
        unsigned long           nrpages;        /* number of total pages */
-       unsigned long           nrshadows;      /* number of shadow entries */
+       /* number of shadow or DAX exceptional entries */
+       unsigned long           nrexceptional;
        pgoff_t                 writeback_index;/* writeback starts here */
        const struct address_space_operations *a_ops;   /* methods */
        unsigned long           flags;          /* error bits/gfp mask */
index 57e7d87d2d4c15de691d8fa3075f779ddbda5172..7c88ad156a293c0bedcea771b58f3113909b6532 100644 (file)
 #define RADIX_TREE_EXCEPTIONAL_ENTRY   2
 #define RADIX_TREE_EXCEPTIONAL_SHIFT   2
 
+/*
+ * DAX exceptional entries: the low RADIX_DAX_SHIFT bits hold the entry type
+ * (PTE or PMD, each tagged with RADIX_TREE_EXCEPTIONAL_ENTRY) and the
+ * remaining upper bits hold the sector.  Macro arguments are parenthesized
+ * so that compound expressions expand as intended.
+ */
+#define RADIX_DAX_MASK 0xf
+#define RADIX_DAX_SHIFT        4
+#define RADIX_DAX_PTE  (0x4 | RADIX_TREE_EXCEPTIONAL_ENTRY)
+#define RADIX_DAX_PMD  (0x8 | RADIX_TREE_EXCEPTIONAL_ENTRY)
+#define RADIX_DAX_TYPE(entry) ((unsigned long)(entry) & RADIX_DAX_MASK)
+#define RADIX_DAX_SECTOR(entry) ((unsigned long)(entry) >> RADIX_DAX_SHIFT)
+#define RADIX_DAX_ENTRY(sector, pmd) ((void *)(((unsigned long)(sector) << \
+               RADIX_DAX_SHIFT) | ((pmd) ? RADIX_DAX_PMD : RADIX_DAX_PTE)))
+
 static inline int radix_tree_is_indirect_ptr(void *ptr)
 {
        return (int)((unsigned long)ptr & RADIX_TREE_INDIRECT_PTR);
index 847ee43c28068a0fb744fe124c1d8afa429d0719..7b8be78cfd9e6e09b8ec62de15b433074901b99a 100644 (file)
@@ -11,6 +11,7 @@
  */
 #include <linux/export.h>
 #include <linux/compiler.h>
+#include <linux/dax.h>
 #include <linux/fs.h>
 #include <linux/uaccess.h>
 #include <linux/capability.h>
@@ -123,9 +124,9 @@ static void page_cache_tree_delete(struct address_space *mapping,
        __radix_tree_lookup(&mapping->page_tree, page->index, &node, &slot);
 
        if (shadow) {
-               mapping->nrshadows++;
+               mapping->nrexceptional++;
                /*
-                * Make sure the nrshadows update is committed before
+                * Make sure the nrexceptional update is committed before
                 * the nrpages update so that final truncate racing
                 * with reclaim does not see both counters 0 at the
                 * same time and miss a shadow entry.
@@ -579,9 +580,13 @@ static int page_cache_tree_insert(struct address_space *mapping,
                p = radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
                if (!radix_tree_exceptional_entry(p))
                        return -EEXIST;
+
+               if (WARN_ON(dax_mapping(mapping)))
+                       return -EINVAL;
+
                if (shadowp)
                        *shadowp = p;
-               mapping->nrshadows--;
+               mapping->nrexceptional--;
                if (node)
                        workingset_node_shadows_dec(node);
        }
@@ -1245,9 +1250,9 @@ repeat:
                        if (radix_tree_deref_retry(page))
                                goto restart;
                        /*
-                        * A shadow entry of a recently evicted page,
-                        * or a swap entry from shmem/tmpfs.  Return
-                        * it without attempting to raise page count.
+                        * A shadow entry of a recently evicted page, a swap
+                        * entry from shmem/tmpfs or a DAX entry.  Return it
+                        * without attempting to raise page count.
                         */
                        goto export;
                }
index 76e35ad971025ce5eb3781543537d1bf3b947b8d..e3ee0e27cd17f20d1d35f1acdeab18136ccc1e29 100644 (file)
@@ -9,6 +9,7 @@
 
 #include <linux/kernel.h>
 #include <linux/backing-dev.h>
+#include <linux/dax.h>
 #include <linux/gfp.h>
 #include <linux/mm.h>
 #include <linux/swap.h>
@@ -34,31 +35,39 @@ static void clear_exceptional_entry(struct address_space *mapping,
                return;
 
        spin_lock_irq(&mapping->tree_lock);
-       /*
-        * Regular page slots are stabilized by the page lock even
-        * without the tree itself locked.  These unlocked entries
-        * need verification under the tree lock.
-        */
-       if (!__radix_tree_lookup(&mapping->page_tree, index, &node, &slot))
-               goto unlock;
-       if (*slot != entry)
-               goto unlock;
-       radix_tree_replace_slot(slot, NULL);
-       mapping->nrshadows--;
-       if (!node)
-               goto unlock;
-       workingset_node_shadows_dec(node);
-       /*
-        * Don't track node without shadow entries.
-        *
-        * Avoid acquiring the list_lru lock if already untracked.
-        * The list_empty() test is safe as node->private_list is
-        * protected by mapping->tree_lock.
-        */
-       if (!workingset_node_shadows(node) &&
-           !list_empty(&node->private_list))
-               list_lru_del(&workingset_shadow_nodes, &node->private_list);
-       __radix_tree_delete_node(&mapping->page_tree, node);
+
+       if (dax_mapping(mapping)) {
+               if (radix_tree_delete_item(&mapping->page_tree, index, entry))
+                       mapping->nrexceptional--;
+       } else {
+               /*
+                * Regular page slots are stabilized by the page lock even
+                * without the tree itself locked.  These unlocked entries
+                * need verification under the tree lock.
+                */
+               if (!__radix_tree_lookup(&mapping->page_tree, index, &node,
+                                       &slot))
+                       goto unlock;
+               if (*slot != entry)
+                       goto unlock;
+               radix_tree_replace_slot(slot, NULL);
+               mapping->nrexceptional--;
+               if (!node)
+                       goto unlock;
+               workingset_node_shadows_dec(node);
+               /*
+                * Don't track node without shadow entries.
+                *
+                * Avoid acquiring the list_lru lock if already untracked.
+                * The list_empty() test is safe as node->private_list is
+                * protected by mapping->tree_lock.
+                */
+               if (!workingset_node_shadows(node) &&
+                   !list_empty(&node->private_list))
+                       list_lru_del(&workingset_shadow_nodes,
+                                       &node->private_list);
+               __radix_tree_delete_node(&mapping->page_tree, node);
+       }
 unlock:
        spin_unlock_irq(&mapping->tree_lock);
 }
@@ -228,7 +237,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
        int             i;
 
        cleancache_invalidate_inode(mapping);
-       if (mapping->nrpages == 0 && mapping->nrshadows == 0)
+       if (mapping->nrpages == 0 && mapping->nrexceptional == 0)
                return;
 
        /* Offsets within partial pages */
@@ -402,7 +411,7 @@ EXPORT_SYMBOL(truncate_inode_pages);
  */
 void truncate_inode_pages_final(struct address_space *mapping)
 {
-       unsigned long nrshadows;
+       unsigned long nrexceptional;
        unsigned long nrpages;
 
        /*
@@ -416,14 +425,14 @@ void truncate_inode_pages_final(struct address_space *mapping)
 
        /*
         * When reclaim installs eviction entries, it increases
-        * nrshadows first, then decreases nrpages.  Make sure we see
+        * nrexceptional first, then decreases nrpages.  Make sure we see
         * this in the right order or we might miss an entry.
         */
        nrpages = mapping->nrpages;
        smp_rmb();
-       nrshadows = mapping->nrshadows;
+       nrexceptional = mapping->nrexceptional;
 
-       if (nrpages || nrshadows) {
+       if (nrpages || nrexceptional) {
                /*
                 * As truncation uses a lockless tree lookup, cycle
                 * the tree lock to make sure any ongoing tree
index bd620b65db52680fc8a6e221fe90189c22dca2f1..eb3dd37ccd7c727dcc0b6030f62c183097b956bb 100644 (file)
@@ -46,6 +46,7 @@
 #include <linux/oom.h>
 #include <linux/prefetch.h>
 #include <linux/printk.h>
+#include <linux/dax.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -671,9 +672,15 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
                 * inode reclaim needs to empty out the radix tree or
                 * the nodes are lost.  Don't plant shadows behind its
                 * back.
+                *
+                * We also don't store shadows for DAX mappings because the
+                * only page cache pages found in these are zero pages
+                * covering holes, and because we don't want to mix DAX
+                * exceptional entries and shadow exceptional entries in the
+                * same page_tree.
                 */
                if (reclaimed && page_is_file_cache(page) &&
-                   !mapping_exiting(mapping))
+                   !mapping_exiting(mapping) && !dax_mapping(mapping))
                        shadow = workingset_eviction(mapping, page);
                __delete_from_page_cache(page, shadow, memcg);
                spin_unlock_irqrestore(&mapping->tree_lock, flags);
index aa017133744b227bed7592ea6cc32f360c3e142c..61ead9e5549df171f43fa53936a4b8cbd86a21a5 100644 (file)
@@ -351,8 +351,8 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
                        node->slots[i] = NULL;
                        BUG_ON(node->count < (1U << RADIX_TREE_COUNT_SHIFT));
                        node->count -= 1U << RADIX_TREE_COUNT_SHIFT;
-                       BUG_ON(!mapping->nrshadows);
-                       mapping->nrshadows--;
+                       BUG_ON(!mapping->nrexceptional);
+                       mapping->nrexceptional--;
                }
        }
        BUG_ON(node->count);