Merge branch 'for-3.20/bdi' of git://git.kernel.dk/linux-block
author Linus Torvalds <torvalds@linux-foundation.org>
Thu, 12 Feb 2015 21:50:21 +0000 (13:50 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
Thu, 12 Feb 2015 21:50:21 +0000 (13:50 -0800)
Pull backing device changes from Jens Axboe:
 "This contains a cleanup of how the backing device is handled, in
  preparation for a rework of the life time rules.  In this part, the
  most important change is to split the unrelated nommu mmap flags from
  it, but also removing a backing_dev_info pointer from the
  address_space (and inode), and a cleanup of other various minor bits.

  Christoph did all the work here, I just fixed an oops with pages that
  have a swap backing.  Arnd fixed a missing export, and Oleg killed the
  lustre backing_dev_info from staging.  Last patch was from Al,
  unexporting parts that are now no longer needed outside"
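
For context, the pattern these patches converge on is a small helper that
derives the bdi from the inode instead of caching a pointer in every
address_space.  A minimal sketch of the idea, close to (but not verbatim
from) the helper this series adds; treat the details as illustrative:

	struct backing_dev_info *inode_to_bdi(struct inode *inode)
	{
		struct super_block *sb;

		/* e.g. swap-backed pages have no host inode */
		if (!inode)
			return &noop_backing_dev_info;

		sb = inode->i_sb;
	#ifdef CONFIG_BLOCK
		/* block-device inodes take the bdi from the device itself */
		if (sb_is_blkdev_sb(sb))
			return blk_get_backing_dev_info(I_BDEV(inode));
	#endif
		return sb->s_bdi;
	}

Callers that used to read mapping->backing_dev_info now simply do
current->backing_dev_info = inode_to_bdi(inode), as the per-filesystem
hunks below show.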

* 'for-3.20/bdi' of git://git.kernel.dk/linux-block:
  Make super_blocks and sb_lock static
  mtd: export new mtd_mmap_capabilities
  fs: make inode_to_bdi() handle NULL inode
  staging/lustre/llite: get rid of backing_dev_info
  fs: remove default_backing_dev_info
  fs: don't reassign dirty inodes to default_backing_dev_info
  nfs: don't call bdi_unregister
  ceph: remove call to bdi_unregister
  fs: remove mapping->backing_dev_info
  fs: export inode_to_bdi and use it in favor of mapping->backing_dev_info
  nilfs2: set up s_bdi like the generic mount_bdev code
  block_dev: get bdev inode bdi directly from the block device
  block_dev: only write bdev inode on close
  fs: introduce f_op->mmap_capabilities for nommu mmap support
  fs: kill BDI_CAP_SWAP_BACKED
  fs: deduplicate noop_backing_dev_info
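
The mmap_capabilities hook replaces the nommu-only BDI_CAP_MAP_* flags
that used to live in backing_dev_info.  A hypothetical driver (the
mydev_* names are invented for illustration; NOMMU_MAP_DIRECT and
friends are the flags this series introduces in include/linux/fs.h)
would wire it up roughly like this:

	#ifndef CONFIG_MMU
	/* tell the nommu mmap code what mappings this file supports */
	static unsigned mydev_mmap_capabilities(struct file *file)
	{
		return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE;
	}
	#endif

	static const struct file_operations mydev_fops = {
		.owner		= THIS_MODULE,
		.mmap		= mydev_mmap,
	#ifndef CONFIG_MMU
		.mmap_capabilities = mydev_mmap_capabilities,
	#endif
	};

mtd_mmap_capabilities (whose missing export Arnd's patch above fixes) is
an in-tree example of such an implementation.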

24 files changed:
fs/aio.c
fs/btrfs/file.c
fs/ext4/super.c
fs/fuse/file.c
fs/gfs2/glock.c
fs/inode.c
fs/nfs/filelayout/filelayout.c
fs/nfs/flexfilelayout/flexfilelayout.c
fs/nfs/inode.c
fs/nfs/internal.h
fs/nfs/nfs4super.c
fs/nfs/super.c
fs/nfs/write.c
fs/ocfs2/file.c
fs/xfs/xfs_file.c
include/linux/fs.h
mm/filemap.c
mm/filemap_xip.c
mm/madvise.c
mm/nommu.c
mm/page-writeback.c
mm/shmem.c
mm/swap.c
mm/vmscan.c

diff --combined fs/aio.c
index c428871f10934a87aa4b5a7038b8c0519b6bc80a,3bf8b1d250c3483ad418a00b9bea052cdc810366..118a2e0088d8fdd8391654a44edb06157dad5629
+++ b/fs/aio.c
@@@ -165,15 -165,6 +165,6 @@@ static struct vfsmount *aio_mnt
  static const struct file_operations aio_ring_fops;
  static const struct address_space_operations aio_ctx_aops;
  
- /* Backing dev info for aio fs.
-  * -no dirty page accounting or writeback happens
-  */
- static struct backing_dev_info aio_fs_backing_dev_info = {
-       .name           = "aiofs",
-       .state          = 0,
-       .capabilities   = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_MAP_COPY,
- };
  static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages)
  {
        struct qstr this = QSTR_INIT("[aio]", 5);
  
        inode->i_mapping->a_ops = &aio_ctx_aops;
        inode->i_mapping->private_data = ctx;
-       inode->i_mapping->backing_dev_info = &aio_fs_backing_dev_info;
        inode->i_size = PAGE_SIZE * nr_pages;
  
        path.dentry = d_alloc_pseudo(aio_mnt->mnt_sb, &this);
@@@ -230,9 -220,6 +220,6 @@@ static int __init aio_setup(void
        if (IS_ERR(aio_mnt))
                panic("Failed to create aio fs mount.");
  
-       if (bdi_init(&aio_fs_backing_dev_info))
-               panic("Failed to init aio fs backing dev info.");
        kiocb_cachep = KMEM_CACHE(kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
        kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC);
  
@@@ -1140,13 -1127,6 +1127,13 @@@ static long aio_read_events_ring(struc
        long ret = 0;
        int copy_ret;
  
 +      /*
 +       * The mutex can block and wake us up and that will cause
 +       * wait_event_interruptible_hrtimeout() to schedule without sleeping
 +       * and repeat. This should be rare enough that it doesn't cause
 +       * performance issues. See the comment in read_events() for more detail.
 +       */
 +      sched_annotate_sleep();
        mutex_lock(&ctx->ring_lock);
  
        /* Access to ->ring_pages here is protected by ctx->ring_lock. */
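
A note on the sched_annotate_sleep() hunk above: aio_read_events_ring()
is invoked as the condition of wait_event_interruptible_hrtimeout(), so
it runs with the task state already set to TASK_INTERRUPTIBLE, and
taking ctx->ring_lock there may legitimately block.  The annotation
tells the scheduler's might-sleep debug check that this nested sleep is
intentional.  Condensed from the surrounding 3.20-era aio code:

	/* read_events(): the condition is evaluated with task state set */
	wait_event_interruptible_hrtimeout(ctx->wait,
			aio_read_events(ctx, min_nr, nr, event, &ret),
			until);

	/* ...which reaches aio_read_events_ring(): */
	sched_annotate_sleep();		/* blocking here is expected */
	mutex_lock(&ctx->ring_lock);
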
diff --combined fs/btrfs/file.c
index a606ab551296e150f0b8045c937a8df0dab4dc58,835c04a874fd8bc65f8f1c3c3b897304d9860f59..b78bbbac900db833e54fc63bdbff8bae365225f3
@@@ -1746,7 -1746,7 +1746,7 @@@ static ssize_t btrfs_file_write_iter(st
  
        mutex_lock(&inode->i_mutex);
  
-       current->backing_dev_info = inode->i_mapping->backing_dev_info;
+       current->backing_dev_info = inode_to_bdi(inode);
        err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
        if (err) {
                mutex_unlock(&inode->i_mutex);
@@@ -2081,6 -2081,7 +2081,6 @@@ static const struct vm_operations_struc
        .fault          = filemap_fault,
        .map_pages      = filemap_map_pages,
        .page_mkwrite   = btrfs_page_mkwrite,
 -      .remap_pages    = generic_file_remap_pages,
  };
  
  static int btrfs_file_mmap(struct file        *filp, struct vm_area_struct *vma)
diff --combined fs/ext4/super.c
index ac64edbe501d9f2aea8bb450506f02b0709314bb,ad88e601a6cde34740bf519a3307432315f80423..64c39c7c594f723fdcf15a2c550609777d3b68f1
@@@ -334,7 -334,7 +334,7 @@@ static void save_error_info(struct supe
  static int block_device_ejected(struct super_block *sb)
  {
        struct inode *bd_inode = sb->s_bdev->bd_inode;
-       struct backing_dev_info *bdi = bd_inode->i_mapping->backing_dev_info;
+       struct backing_dev_info *bdi = inode_to_bdi(bd_inode);
  
        return bdi->dev == NULL;
  }
@@@ -1046,7 -1046,10 +1046,7 @@@ static int ext4_mark_dquot_dirty(struc
  static int ext4_write_info(struct super_block *sb, int type);
  static int ext4_quota_on(struct super_block *sb, int type, int format_id,
                         struct path *path);
 -static int ext4_quota_on_sysfile(struct super_block *sb, int type,
 -                               int format_id);
  static int ext4_quota_off(struct super_block *sb, int type);
 -static int ext4_quota_off_sysfile(struct super_block *sb, int type);
  static int ext4_quota_on_mount(struct super_block *sb, int type);
  static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
                               size_t len, loff_t off);
@@@ -1081,6 -1084,16 +1081,6 @@@ static const struct quotactl_ops ext4_q
        .get_dqblk      = dquot_get_dqblk,
        .set_dqblk      = dquot_set_dqblk
  };
 -
 -static const struct quotactl_ops ext4_qctl_sysfile_operations = {
 -      .quota_on_meta  = ext4_quota_on_sysfile,
 -      .quota_off      = ext4_quota_off_sysfile,
 -      .quota_sync     = dquot_quota_sync,
 -      .get_info       = dquot_get_dqinfo,
 -      .set_info       = dquot_set_dqinfo,
 -      .get_dqblk      = dquot_get_dqblk,
 -      .set_dqblk      = dquot_set_dqblk
 -};
  #endif
  
  static const struct super_operations ext4_sops = {
@@@ -3922,7 -3935,7 +3922,7 @@@ static int ext4_fill_super(struct super
  #ifdef CONFIG_QUOTA
        sb->dq_op = &ext4_quota_operations;
        if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA))
 -              sb->s_qcop = &ext4_qctl_sysfile_operations;
 +              sb->s_qcop = &dquot_quotactl_sysfile_ops;
        else
                sb->s_qcop = &ext4_qctl_operations;
        sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP;
@@@ -5275,6 -5288,21 +5275,6 @@@ static int ext4_enable_quotas(struct su
        return 0;
  }
  
 -/*
 - * quota_on function that is used when QUOTA feature is set.
 - */
 -static int ext4_quota_on_sysfile(struct super_block *sb, int type,
 -                               int format_id)
 -{
 -      if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA))
 -              return -EINVAL;
 -
 -      /*
 -       * USAGE was enabled at mount time. Only need to enable LIMITS now.
 -       */
 -      return ext4_quota_enable(sb, type, format_id, DQUOT_LIMITS_ENABLED);
 -}
 -
  static int ext4_quota_off(struct super_block *sb, int type)
  {
        struct inode *inode = sb_dqopt(sb)->files[type];
@@@ -5301,6 -5329,18 +5301,6 @@@ out
        return dquot_quota_off(sb, type);
  }
  
 -/*
 - * quota_off function that is used when QUOTA feature is set.
 - */
 -static int ext4_quota_off_sysfile(struct super_block *sb, int type)
 -{
 -      if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA))
 -              return -EINVAL;
 -
 -      /* Disable only the limits. */
 -      return dquot_disable(sb, type, DQUOT_LIMITS_ENABLED);
 -}
 -
  /* Read data from quotafile - avoid pagecache and such because we cannot afford
   * acquiring the locks... As quota files are never truncated and quota code
   * itself serializes the operations (and no one else should touch the files)
diff --combined fs/fuse/file.c
index d769e594855b0ac6192925c4a6f4f23b03db4ca7,19d80b82d344890b67644e4b716daad9e12377e1..c01ec3bdcfd81090fae2cb26ae166f351d4505eb
@@@ -1159,7 -1159,7 +1159,7 @@@ static ssize_t fuse_file_write_iter(str
        mutex_lock(&inode->i_mutex);
  
        /* We can write back this queue in page reclaim */
-       current->backing_dev_info = mapping->backing_dev_info;
+       current->backing_dev_info = inode_to_bdi(inode);
  
        err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
        if (err)
@@@ -1464,7 -1464,7 +1464,7 @@@ static void fuse_writepage_finish(struc
  {
        struct inode *inode = req->inode;
        struct fuse_inode *fi = get_fuse_inode(inode);
-       struct backing_dev_info *bdi = inode->i_mapping->backing_dev_info;
+       struct backing_dev_info *bdi = inode_to_bdi(inode);
        int i;
  
        list_del(&req->writepages_entry);
@@@ -1658,7 -1658,7 +1658,7 @@@ static int fuse_writepage_locked(struc
        req->end = fuse_writepage_end;
        req->inode = inode;
  
-       inc_bdi_stat(mapping->backing_dev_info, BDI_WRITEBACK);
+       inc_bdi_stat(inode_to_bdi(inode), BDI_WRITEBACK);
        inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP);
  
        spin_lock(&fc->lock);
@@@ -1768,7 -1768,7 +1768,7 @@@ static bool fuse_writepage_in_flight(st
  
        if (old_req->num_pages == 1 && (old_req->state == FUSE_REQ_INIT ||
                                        old_req->state == FUSE_REQ_PENDING)) {
-               struct backing_dev_info *bdi = page->mapping->backing_dev_info;
+               struct backing_dev_info *bdi = inode_to_bdi(page->mapping->host);
  
                copy_highpage(old_req->pages[0], page);
                spin_unlock(&fc->lock);
@@@ -1872,7 -1872,7 +1872,7 @@@ static int fuse_writepages_fill(struct 
        req->page_descs[req->num_pages].offset = 0;
        req->page_descs[req->num_pages].length = PAGE_SIZE;
  
-       inc_bdi_stat(page->mapping->backing_dev_info, BDI_WRITEBACK);
+       inc_bdi_stat(inode_to_bdi(inode), BDI_WRITEBACK);
        inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP);
  
        err = 0;
@@@ -2062,6 -2062,7 +2062,6 @@@ static const struct vm_operations_struc
        .fault          = filemap_fault,
        .map_pages      = filemap_map_pages,
        .page_mkwrite   = fuse_page_mkwrite,
 -      .remap_pages    = generic_file_remap_pages,
  };
  
  static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma)
diff --combined fs/gfs2/glock.c
index aeb7bc958a18a182bc7a705b878fdc64ce0240f2,08ea717981f788f7422e3b71e81eec45f6247825..f42dffba056ab0e6908788084f1c449e1c2683e6
@@@ -173,14 -173,19 +173,14 @@@ void gfs2_glock_add_to_lru(struct gfs2_
        spin_unlock(&lru_lock);
  }
  
 -static void __gfs2_glock_remove_from_lru(struct gfs2_glock *gl)
 +static void gfs2_glock_remove_from_lru(struct gfs2_glock *gl)
  {
 +      spin_lock(&lru_lock);
        if (!list_empty(&gl->gl_lru)) {
                list_del_init(&gl->gl_lru);
                atomic_dec(&lru_count);
                clear_bit(GLF_LRU, &gl->gl_flags);
        }
 -}
 -
 -static void gfs2_glock_remove_from_lru(struct gfs2_glock *gl)
 -{
 -      spin_lock(&lru_lock);
 -      __gfs2_glock_remove_from_lru(gl);
        spin_unlock(&lru_lock);
  }
  
@@@ -200,7 -205,9 +200,7 @@@ void gfs2_glock_put(struct gfs2_glock *
  
        lockref_mark_dead(&gl->gl_lockref);
  
 -      spin_lock(&lru_lock);
 -      __gfs2_glock_remove_from_lru(gl);
 -      spin_unlock(&lru_lock);
 +      gfs2_glock_remove_from_lru(gl);
        spin_unlock(&gl->gl_lockref.lock);
        spin_lock_bucket(gl->gl_hash);
        hlist_bl_del_rcu(&gl->gl_list);
@@@ -768,7 -775,6 +768,6 @@@ int gfs2_glock_get(struct gfs2_sbd *sdp
                mapping->flags = 0;
                mapping_set_gfp_mask(mapping, GFP_NOFS);
                mapping->private_data = NULL;
-               mapping->backing_dev_info = s->s_bdi;
                mapping->writeback_index = 0;
        }
  
diff --combined fs/inode.c
index 3a53b1da3fb8da0d0de438d9195bf07f2b64c909,e4e8caa7464cc78b5d0d5b97a2198edbd8a97470..b7871577571d2bbadad346b0af566ee3e8e93b81
@@@ -170,20 -170,7 +170,7 @@@ int inode_init_always(struct super_bloc
        atomic_set(&mapping->i_mmap_writable, 0);
        mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE);
        mapping->private_data = NULL;
-       mapping->backing_dev_info = &default_backing_dev_info;
        mapping->writeback_index = 0;
-       /*
-        * If the block_device provides a backing_dev_info for client
-        * inodes then use that.  Otherwise the inode share the bdev's
-        * backing_dev_info.
-        */
-       if (sb->s_bdev) {
-               struct backing_dev_info *bdi;
-               bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
-               mapping->backing_dev_info = bdi;
-       }
        inode->i_private = NULL;
        inode->i_mapping = mapping;
        INIT_HLIST_HEAD(&inode->i_dentry);      /* buggered by rcu freeing */
  #ifdef CONFIG_FSNOTIFY
        inode->i_fsnotify_mask = 0;
  #endif
 -
 +      inode->i_flctx = NULL;
        this_cpu_inc(nr_inodes);
  
        return 0;
@@@ -237,7 -224,6 +224,7 @@@ void __destroy_inode(struct inode *inod
        BUG_ON(inode_has_buffers(inode));
        security_inode_free(inode);
        fsnotify_inode_delete(inode);
 +      locks_free_lock_context(inode->i_flctx);
        if (!inode->i_nlink) {
                WARN_ON(atomic_long_read(&inode->i_sb->s_remove_count) == 0);
                atomic_long_dec(&inode->i_sb->s_remove_count);
@@@ -356,6 -342,7 +343,6 @@@ void address_space_init_once(struct add
        INIT_LIST_HEAD(&mapping->private_list);
        spin_lock_init(&mapping->private_lock);
        mapping->i_mmap = RB_ROOT;
 -      INIT_LIST_HEAD(&mapping->i_mmap_nonlinear);
  }
  EXPORT_SYMBOL(address_space_init_once);
  
diff --combined fs/nfs/filelayout/filelayout.c
index 3c9769441f3641199ac006b5c6427faf0cd56531,51aa889611cf290832745fa0094e2cab6c60423a..7ae1c263c5cf03b8d63f1fec359d794174564020
@@@ -118,6 -118,13 +118,6 @@@ static void filelayout_reset_read(struc
        }
  }
  
 -static void filelayout_fenceme(struct inode *inode, struct pnfs_layout_hdr *lo)
 -{
 -      if (!test_and_clear_bit(NFS_LAYOUT_RETURN, &lo->plh_flags))
 -              return;
 -      pnfs_return_layout(inode);
 -}
 -
  static int filelayout_async_handle_error(struct rpc_task *task,
                                         struct nfs4_state *state,
                                         struct nfs_client *clp,
                dprintk("%s DS connection error %d\n", __func__,
                        task->tk_status);
                nfs4_mark_deviceid_unavailable(devid);
 -              set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags);
 +              pnfs_error_mark_layout_for_return(inode, lseg);
                rpc_wake_up(&tbl->slot_tbl_waitq);
                /* fall through */
        default:
@@@ -332,6 -339,16 +332,6 @@@ static void filelayout_read_count_stats
        rpc_count_iostats(task, NFS_SERVER(hdr->inode)->client->cl_metrics);
  }
  
 -static void filelayout_read_release(void *data)
 -{
 -      struct nfs_pgio_header *hdr = data;
 -      struct pnfs_layout_hdr *lo = hdr->lseg->pls_layout;
 -
 -      filelayout_fenceme(lo->plh_inode, lo);
 -      nfs_put_client(hdr->ds_clp);
 -      hdr->mds_ops->rpc_release(data);
 -}
 -
  static int filelayout_write_done_cb(struct rpc_task *task,
                                struct nfs_pgio_header *hdr)
  {
        return 0;
  }
  
 -/* Fake up some data that will cause nfs_commit_release to retry the writes. */
 -static void prepare_to_resend_writes(struct nfs_commit_data *data)
 -{
 -      struct nfs_page *first = nfs_list_entry(data->pages.next);
 -
 -      data->task.tk_status = 0;
 -      memcpy(&data->verf.verifier, &first->wb_verf,
 -             sizeof(data->verf.verifier));
 -      data->verf.verifier.data[0]++; /* ensure verifier mismatch */
 -}
 -
  static int filelayout_commit_done_cb(struct rpc_task *task,
                                     struct nfs_commit_data *data)
  {
  
        switch (err) {
        case -NFS4ERR_RESET_TO_MDS:
 -              prepare_to_resend_writes(data);
 +              pnfs_generic_prepare_to_resend_writes(data);
                return -EAGAIN;
        case -EAGAIN:
                rpc_restart_call_prepare(task);
@@@ -423,6 -451,16 +423,6 @@@ static void filelayout_write_count_stat
        rpc_count_iostats(task, NFS_SERVER(hdr->inode)->client->cl_metrics);
  }
  
 -static void filelayout_write_release(void *data)
 -{
 -      struct nfs_pgio_header *hdr = data;
 -      struct pnfs_layout_hdr *lo = hdr->lseg->pls_layout;
 -
 -      filelayout_fenceme(lo->plh_inode, lo);
 -      nfs_put_client(hdr->ds_clp);
 -      hdr->mds_ops->rpc_release(data);
 -}
 -
  static void filelayout_commit_prepare(struct rpc_task *task, void *data)
  {
        struct nfs_commit_data *wdata = data;
                        task);
  }
  
 -static void filelayout_write_commit_done(struct rpc_task *task, void *data)
 -{
 -      struct nfs_commit_data *wdata = data;
 -
 -      /* Note this may cause RPC to be resent */
 -      wdata->mds_ops->rpc_call_done(task, data);
 -}
 -
  static void filelayout_commit_count_stats(struct rpc_task *task, void *data)
  {
        struct nfs_commit_data *cdata = data;
        rpc_count_iostats(task, NFS_SERVER(cdata->inode)->client->cl_metrics);
  }
  
 -static void filelayout_commit_release(void *calldata)
 -{
 -      struct nfs_commit_data *data = calldata;
 -
 -      data->completion_ops->completion(data);
 -      pnfs_put_lseg(data->lseg);
 -      nfs_put_client(data->ds_clp);
 -      nfs_commitdata_release(data);
 -}
 -
  static const struct rpc_call_ops filelayout_read_call_ops = {
        .rpc_call_prepare = filelayout_read_prepare,
        .rpc_call_done = filelayout_read_call_done,
        .rpc_count_stats = filelayout_read_count_stats,
 -      .rpc_release = filelayout_read_release,
 +      .rpc_release = pnfs_generic_rw_release,
  };
  
  static const struct rpc_call_ops filelayout_write_call_ops = {
        .rpc_call_prepare = filelayout_write_prepare,
        .rpc_call_done = filelayout_write_call_done,
        .rpc_count_stats = filelayout_write_count_stats,
 -      .rpc_release = filelayout_write_release,
 +      .rpc_release = pnfs_generic_rw_release,
  };
  
  static const struct rpc_call_ops filelayout_commit_call_ops = {
        .rpc_call_prepare = filelayout_commit_prepare,
 -      .rpc_call_done = filelayout_write_commit_done,
 +      .rpc_call_done = pnfs_generic_write_commit_done,
        .rpc_count_stats = filelayout_commit_count_stats,
 -      .rpc_release = filelayout_commit_release,
 +      .rpc_release = pnfs_generic_commit_release,
  };
  
  static enum pnfs_try_status
@@@ -492,7 -548,7 +492,7 @@@ filelayout_read_pagelist(struct nfs_pgi
        /* No multipath support. Use first DS */
        atomic_inc(&ds->ds_clp->cl_count);
        hdr->ds_clp = ds->ds_clp;
 -      hdr->ds_idx = idx;
 +      hdr->ds_commit_idx = idx;
        fh = nfs4_fl_select_ds_fh(lseg, j);
        if (fh)
                hdr->args.fh = fh;
        hdr->mds_offset = offset;
  
        /* Perform an asynchronous read to ds */
 -      nfs_initiate_pgio(ds_clnt, hdr,
 -                          &filelayout_read_call_ops, 0, RPC_TASK_SOFTCONN);
 +      nfs_initiate_pgio(ds_clnt, hdr, hdr->cred,
 +                        NFS_PROTO(hdr->inode), &filelayout_read_call_ops,
 +                        0, RPC_TASK_SOFTCONN);
        return PNFS_ATTEMPTED;
  }
  
@@@ -536,16 -591,16 +536,16 @@@ filelayout_write_pagelist(struct nfs_pg
        hdr->pgio_done_cb = filelayout_write_done_cb;
        atomic_inc(&ds->ds_clp->cl_count);
        hdr->ds_clp = ds->ds_clp;
 -      hdr->ds_idx = idx;
 +      hdr->ds_commit_idx = idx;
        fh = nfs4_fl_select_ds_fh(lseg, j);
        if (fh)
                hdr->args.fh = fh;
        hdr->args.offset = filelayout_get_dserver_offset(lseg, offset);
  
        /* Perform an asynchronous write */
 -      nfs_initiate_pgio(ds_clnt, hdr,
 -                                  &filelayout_write_call_ops, sync,
 -                                  RPC_TASK_SOFTCONN);
 +      nfs_initiate_pgio(ds_clnt, hdr, hdr->cred,
 +                        NFS_PROTO(hdr->inode), &filelayout_write_call_ops,
 +                        sync, RPC_TASK_SOFTCONN);
        return PNFS_ATTEMPTED;
  }
  
@@@ -933,14 -988,12 +933,14 @@@ static const struct nfs_pageio_ops file
        .pg_init = filelayout_pg_init_read,
        .pg_test = filelayout_pg_test,
        .pg_doio = pnfs_generic_pg_readpages,
 +      .pg_cleanup = pnfs_generic_pg_cleanup,
  };
  
  static const struct nfs_pageio_ops filelayout_pg_write_ops = {
        .pg_init = filelayout_pg_init_write,
        .pg_test = filelayout_pg_test,
        .pg_doio = pnfs_generic_pg_writepages,
 +      .pg_cleanup = pnfs_generic_pg_cleanup,
  };
  
  static u32 select_bucket_index(struct nfs4_filelayout_segment *fl, u32 j)
                return j;
  }
  
 -/* The generic layer is about to remove the req from the commit list.
 - * If this will make the bucket empty, it will need to put the lseg reference.
 - * Note this is must be called holding the inode (/cinfo) lock
 - */
 -static void
 -filelayout_clear_request_commit(struct nfs_page *req,
 -                              struct nfs_commit_info *cinfo)
 -{
 -      struct pnfs_layout_segment *freeme = NULL;
 -
 -      if (!test_and_clear_bit(PG_COMMIT_TO_DS, &req->wb_flags))
 -              goto out;
 -      cinfo->ds->nwritten--;
 -      if (list_is_singular(&req->wb_list)) {
 -              struct pnfs_commit_bucket *bucket;
 -
 -              bucket = list_first_entry(&req->wb_list,
 -                                        struct pnfs_commit_bucket,
 -                                        written);
 -              freeme = bucket->wlseg;
 -              bucket->wlseg = NULL;
 -      }
 -out:
 -      nfs_request_remove_commit_list(req, cinfo);
 -      pnfs_put_lseg_locked(freeme);
 -}
 -
  static void
  filelayout_mark_request_commit(struct nfs_page *req,
                               struct pnfs_layout_segment *lseg,
 -                             struct nfs_commit_info *cinfo)
 +                             struct nfs_commit_info *cinfo,
 +                             u32 ds_commit_idx)
  
  {
        struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
                 * is normally transferred to the COMMIT call and released
                 * there.  It could also be released if the last req is pulled
                 * off due to a rewrite, in which case it will be done in
 -               * filelayout_clear_request_commit
 +               * pnfs_generic_clear_request_commit
                 */
                buckets[i].wlseg = pnfs_get_lseg(lseg);
        }
@@@ -1002,7 -1081,7 +1002,7 @@@ mds_commit
        spin_unlock(cinfo->lock);
        if (!cinfo->dreq) {
                inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
-               inc_bdi_stat(page_file_mapping(req->wb_page)->backing_dev_info,
+               inc_bdi_stat(inode_to_bdi(page_file_mapping(req->wb_page)->host),
                             BDI_RECLAIMABLE);
                __mark_inode_dirty(req->wb_context->dentry->d_inode,
                                   I_DIRTY_DATASYNC);
@@@ -1059,15 -1138,101 +1059,15 @@@ static int filelayout_initiate_commit(s
        fh = select_ds_fh_from_commit(lseg, data->ds_commit_index);
        if (fh)
                data->args.fh = fh;
 -      return nfs_initiate_commit(ds_clnt, data,
 +      return nfs_initiate_commit(ds_clnt, data, NFS_PROTO(data->inode),
                                   &filelayout_commit_call_ops, how,
                                   RPC_TASK_SOFTCONN);
  out_err:
 -      prepare_to_resend_writes(data);
 -      filelayout_commit_release(data);
 +      pnfs_generic_prepare_to_resend_writes(data);
 +      pnfs_generic_commit_release(data);
        return -EAGAIN;
  }
  
 -static int
 -transfer_commit_list(struct list_head *src, struct list_head *dst,
 -                   struct nfs_commit_info *cinfo, int max)
 -{
 -      struct nfs_page *req, *tmp;
 -      int ret = 0;
 -
 -      list_for_each_entry_safe(req, tmp, src, wb_list) {
 -              if (!nfs_lock_request(req))
 -                      continue;
 -              kref_get(&req->wb_kref);
 -              if (cond_resched_lock(cinfo->lock))
 -                      list_safe_reset_next(req, tmp, wb_list);
 -              nfs_request_remove_commit_list(req, cinfo);
 -              clear_bit(PG_COMMIT_TO_DS, &req->wb_flags);
 -              nfs_list_add_request(req, dst);
 -              ret++;
 -              if ((ret == max) && !cinfo->dreq)
 -                      break;
 -      }
 -      return ret;
 -}
 -
 -/* Note called with cinfo->lock held. */
 -static int
 -filelayout_scan_ds_commit_list(struct pnfs_commit_bucket *bucket,
 -                             struct nfs_commit_info *cinfo,
 -                             int max)
 -{
 -      struct list_head *src = &bucket->written;
 -      struct list_head *dst = &bucket->committing;
 -      int ret;
 -
 -      ret = transfer_commit_list(src, dst, cinfo, max);
 -      if (ret) {
 -              cinfo->ds->nwritten -= ret;
 -              cinfo->ds->ncommitting += ret;
 -              bucket->clseg = bucket->wlseg;
 -              if (list_empty(src))
 -                      bucket->wlseg = NULL;
 -              else
 -                      pnfs_get_lseg(bucket->clseg);
 -      }
 -      return ret;
 -}
 -
 -/* Move reqs from written to committing lists, returning count of number moved.
 - * Note called with cinfo->lock held.
 - */
 -static int filelayout_scan_commit_lists(struct nfs_commit_info *cinfo,
 -                                      int max)
 -{
 -      int i, rv = 0, cnt;
 -
 -      for (i = 0; i < cinfo->ds->nbuckets && max != 0; i++) {
 -              cnt = filelayout_scan_ds_commit_list(&cinfo->ds->buckets[i],
 -                                                   cinfo, max);
 -              max -= cnt;
 -              rv += cnt;
 -      }
 -      return rv;
 -}
 -
 -/* Pull everything off the committing lists and dump into @dst */
 -static void filelayout_recover_commit_reqs(struct list_head *dst,
 -                                         struct nfs_commit_info *cinfo)
 -{
 -      struct pnfs_commit_bucket *b;
 -      struct pnfs_layout_segment *freeme;
 -      int i;
 -
 -restart:
 -      spin_lock(cinfo->lock);
 -      for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) {
 -              if (transfer_commit_list(&b->written, dst, cinfo, 0)) {
 -                      freeme = b->wlseg;
 -                      b->wlseg = NULL;
 -                      spin_unlock(cinfo->lock);
 -                      pnfs_put_lseg(freeme);
 -                      goto restart;
 -              }
 -      }
 -      cinfo->ds->nwritten = 0;
 -      spin_unlock(cinfo->lock);
 -}
 -
  /* filelayout_search_commit_reqs - Search lists in @cinfo for the head request
   *                               for @page
   * @cinfo - commit info for current inode
@@@ -1098,14 -1263,108 +1098,14 @@@ filelayout_search_commit_reqs(struct nf
        return NULL;
  }
  
 -static void filelayout_retry_commit(struct nfs_commit_info *cinfo, int idx)
 -{
 -      struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
 -      struct pnfs_commit_bucket *bucket;
 -      struct pnfs_layout_segment *freeme;
 -      int i;
 -
 -      for (i = idx; i < fl_cinfo->nbuckets; i++) {
 -              bucket = &fl_cinfo->buckets[i];
 -              if (list_empty(&bucket->committing))
 -                      continue;
 -              nfs_retry_commit(&bucket->committing, bucket->clseg, cinfo);
 -              spin_lock(cinfo->lock);
 -              freeme = bucket->clseg;
 -              bucket->clseg = NULL;
 -              spin_unlock(cinfo->lock);
 -              pnfs_put_lseg(freeme);
 -      }
 -}
 -
 -static unsigned int
 -alloc_ds_commits(struct nfs_commit_info *cinfo, struct list_head *list)
 -{
 -      struct pnfs_ds_commit_info *fl_cinfo;
 -      struct pnfs_commit_bucket *bucket;
 -      struct nfs_commit_data *data;
 -      int i;
 -      unsigned int nreq = 0;
 -
 -      fl_cinfo = cinfo->ds;
 -      bucket = fl_cinfo->buckets;
 -      for (i = 0; i < fl_cinfo->nbuckets; i++, bucket++) {
 -              if (list_empty(&bucket->committing))
 -                      continue;
 -              data = nfs_commitdata_alloc();
 -              if (!data)
 -                      break;
 -              data->ds_commit_index = i;
 -              spin_lock(cinfo->lock);
 -              data->lseg = bucket->clseg;
 -              bucket->clseg = NULL;
 -              spin_unlock(cinfo->lock);
 -              list_add(&data->pages, list);
 -              nreq++;
 -      }
 -
 -      /* Clean up on error */
 -      filelayout_retry_commit(cinfo, i);
 -      /* Caller will clean up entries put on list */
 -      return nreq;
 -}
 -
 -/* This follows nfs_commit_list pretty closely */
  static int
  filelayout_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
                           int how, struct nfs_commit_info *cinfo)
  {
 -      struct nfs_commit_data *data, *tmp;
 -      LIST_HEAD(list);
 -      unsigned int nreq = 0;
 -
 -      if (!list_empty(mds_pages)) {
 -              data = nfs_commitdata_alloc();
 -              if (data != NULL) {
 -                      data->lseg = NULL;
 -                      list_add(&data->pages, &list);
 -                      nreq++;
 -              } else {
 -                      nfs_retry_commit(mds_pages, NULL, cinfo);
 -                      filelayout_retry_commit(cinfo, 0);
 -                      cinfo->completion_ops->error_cleanup(NFS_I(inode));
 -                      return -ENOMEM;
 -              }
 -      }
 -
 -      nreq += alloc_ds_commits(cinfo, &list);
 -
 -      if (nreq == 0) {
 -              cinfo->completion_ops->error_cleanup(NFS_I(inode));
 -              goto out;
 -      }
 -
 -      atomic_add(nreq, &cinfo->mds->rpcs_out);
 -
 -      list_for_each_entry_safe(data, tmp, &list, pages) {
 -              list_del_init(&data->pages);
 -              if (!data->lseg) {
 -                      nfs_init_commit(data, mds_pages, NULL, cinfo);
 -                      nfs_initiate_commit(NFS_CLIENT(inode), data,
 -                                          data->mds_ops, how, 0);
 -              } else {
 -                      struct pnfs_commit_bucket *buckets;
 -
 -                      buckets = cinfo->ds->buckets;
 -                      nfs_init_commit(data, &buckets[data->ds_commit_index].committing, data->lseg, cinfo);
 -                      filelayout_initiate_commit(data, how);
 -              }
 -      }
 -out:
 -      cinfo->ds->ncommitting = 0;
 -      return PNFS_ATTEMPTED;
 +      return pnfs_generic_commit_pagelist(inode, mds_pages, how, cinfo,
 +                                          filelayout_initiate_commit);
  }
 +
  static struct nfs4_deviceid_node *
  filelayout_alloc_deviceid_node(struct nfs_server *server,
                struct pnfs_device *pdev, gfp_t gfp_flags)
@@@ -1162,9 -1421,9 +1162,9 @@@ static struct pnfs_layoutdriver_type fi
        .pg_write_ops           = &filelayout_pg_write_ops,
        .get_ds_info            = &filelayout_get_ds_info,
        .mark_request_commit    = filelayout_mark_request_commit,
 -      .clear_request_commit   = filelayout_clear_request_commit,
 -      .scan_commit_lists      = filelayout_scan_commit_lists,
 -      .recover_commit_reqs    = filelayout_recover_commit_reqs,
 +      .clear_request_commit   = pnfs_generic_clear_request_commit,
 +      .scan_commit_lists      = pnfs_generic_scan_commit_lists,
 +      .recover_commit_reqs    = pnfs_generic_recover_commit_reqs,
        .search_commit_reqs     = filelayout_search_commit_reqs,
        .commit_pagelist        = filelayout_commit_pagelist,
        .read_pagelist          = filelayout_read_pagelist,
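
With the bucket-scanning and resend machinery moved into generic pNFS
code, a layout driver now supplies only the pieces that are genuinely
layout-specific and points the rest at the pnfs_generic_* helpers, as
the filelayout ops tables above show.  The same wiring pattern, with
invented example_* names standing in for driver code:

	static const struct rpc_call_ops example_commit_call_ops = {
		.rpc_call_prepare = example_commit_prepare,	/* driver-specific */
		.rpc_call_done	  = pnfs_generic_write_commit_done,
		.rpc_count_stats  = example_commit_count_stats,	/* driver-specific */
		.rpc_release	  = pnfs_generic_commit_release,
	};

The flexfiles driver added below is the first new consumer of these
shared helpers.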
diff --combined fs/nfs/flexfilelayout/flexfilelayout.c
index f29fb7d7e8f84ab285230e5e36f10bb125156002,0000000000000000000000000000000000000000..c22ecaa86c1c27cc2138f1853c27757b11104f17
mode 100644,000000..100644
--- /dev/null
@@@ -1,1574 -1,0 +1,1574 @@@
-               inc_bdi_stat(page_file_mapping(req->wb_page)->backing_dev_info,
 +/*
 + * Module for pnfs flexfile layout driver.
 + *
 + * Copyright (c) 2014, Primary Data, Inc. All rights reserved.
 + *
 + * Tao Peng <bergwolf@primarydata.com>
 + */
 +
 +#include <linux/nfs_fs.h>
 +#include <linux/nfs_page.h>
 +#include <linux/module.h>
 +
 +#include <linux/sunrpc/metrics.h>
 +#include <linux/nfs_idmap.h>
 +
 +#include "flexfilelayout.h"
 +#include "../nfs4session.h"
 +#include "../internal.h"
 +#include "../delegation.h"
 +#include "../nfs4trace.h"
 +#include "../iostat.h"
 +#include "../nfs.h"
 +
 +#define NFSDBG_FACILITY         NFSDBG_PNFS_LD
 +
 +#define FF_LAYOUT_POLL_RETRY_MAX     (15*HZ)
 +
 +static struct pnfs_layout_hdr *
 +ff_layout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags)
 +{
 +      struct nfs4_flexfile_layout *ffl;
 +
 +      ffl = kzalloc(sizeof(*ffl), gfp_flags);
 +      if (ffl) {
 +              INIT_LIST_HEAD(&ffl->error_list);
 +              return &ffl->generic_hdr;
 +      } else
 +              return NULL;
 +}
 +
 +static void
 +ff_layout_free_layout_hdr(struct pnfs_layout_hdr *lo)
 +{
 +      struct nfs4_ff_layout_ds_err *err, *n;
 +
 +      list_for_each_entry_safe(err, n, &FF_LAYOUT_FROM_HDR(lo)->error_list,
 +                               list) {
 +              list_del(&err->list);
 +              kfree(err);
 +      }
 +      kfree(FF_LAYOUT_FROM_HDR(lo));
 +}
 +
 +static int decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
 +{
 +      __be32 *p;
 +
 +      p = xdr_inline_decode(xdr, NFS4_STATEID_SIZE);
 +      if (unlikely(p == NULL))
 +              return -ENOBUFS;
 +      memcpy(stateid, p, NFS4_STATEID_SIZE);
 +      dprintk("%s: stateid id= [%x%x%x%x]\n", __func__,
 +              p[0], p[1], p[2], p[3]);
 +      return 0;
 +}
 +
 +static int decode_deviceid(struct xdr_stream *xdr, struct nfs4_deviceid *devid)
 +{
 +      __be32 *p;
 +
 +      p = xdr_inline_decode(xdr, NFS4_DEVICEID4_SIZE);
 +      if (unlikely(!p))
 +              return -ENOBUFS;
 +      memcpy(devid, p, NFS4_DEVICEID4_SIZE);
 +      nfs4_print_deviceid(devid);
 +      return 0;
 +}
 +
 +static int decode_nfs_fh(struct xdr_stream *xdr, struct nfs_fh *fh)
 +{
 +      __be32 *p;
 +
 +      p = xdr_inline_decode(xdr, 4);
 +      if (unlikely(!p))
 +              return -ENOBUFS;
 +      fh->size = be32_to_cpup(p++);
 +      if (fh->size > sizeof(struct nfs_fh)) {
 +              printk(KERN_ERR "NFS flexfiles: Too big fh received %d\n",
 +                     fh->size);
 +              return -EOVERFLOW;
 +      }
 +      /* fh.data */
 +      p = xdr_inline_decode(xdr, fh->size);
 +      if (unlikely(!p))
 +              return -ENOBUFS;
 +      memcpy(&fh->data, p, fh->size);
 +      dprintk("%s: fh len %d\n", __func__, fh->size);
 +
 +      return 0;
 +}
 +
 +/*
 + * Currently only stringified uids and gids are accepted.
 + * I.e., Kerberos is not supported to the DSes, so no principals.
 + *
 + * That means that one common function will suffice, but when
 + * principals are added, this should be split to accommodate
 + * calls to both nfs_map_name_to_uid() and nfs_map_group_to_gid().
 + */
 +static int
 +decode_name(struct xdr_stream *xdr, u32 *id)
 +{
 +      __be32 *p;
 +      int len;
 +
 +      /* opaque_length(4)*/
 +      p = xdr_inline_decode(xdr, 4);
 +      if (unlikely(!p))
 +              return -ENOBUFS;
 +      len = be32_to_cpup(p++);
 +      if (len < 0)
 +              return -EINVAL;
 +
 +      dprintk("%s: len %u\n", __func__, len);
 +
 +      /* opaque body */
 +      p = xdr_inline_decode(xdr, len);
 +      if (unlikely(!p))
 +              return -ENOBUFS;
 +
 +      if (!nfs_map_string_to_numeric((char *)p, len, id))
 +              return -EINVAL;
 +
 +      return 0;
 +}
 +
 +static void ff_layout_free_mirror_array(struct nfs4_ff_layout_segment *fls)
 +{
 +      int i;
 +
 +      if (fls->mirror_array) {
 +              for (i = 0; i < fls->mirror_array_cnt; i++) {
 +                      /* normally mirror_ds is freed in
 +                       * .free_deviceid_node but we still do it here
 +                       * for .alloc_lseg error path */
 +                      if (fls->mirror_array[i]) {
 +                              kfree(fls->mirror_array[i]->fh_versions);
 +                              nfs4_ff_layout_put_deviceid(fls->mirror_array[i]->mirror_ds);
 +                              kfree(fls->mirror_array[i]);
 +                      }
 +              }
 +              kfree(fls->mirror_array);
 +              fls->mirror_array = NULL;
 +      }
 +}
 +
 +static int ff_layout_check_layout(struct nfs4_layoutget_res *lgr)
 +{
 +      int ret = 0;
 +
 +      dprintk("--> %s\n", __func__);
 +
 +      /* FIXME: remove this check when layout segment support is added */
 +      if (lgr->range.offset != 0 ||
 +          lgr->range.length != NFS4_MAX_UINT64) {
 +              dprintk("%s Only whole file layouts supported. Use MDS i/o\n",
 +                      __func__);
 +              ret = -EINVAL;
 +      }
 +
 +      dprintk("--> %s returns %d\n", __func__, ret);
 +      return ret;
 +}
 +
 +static void _ff_layout_free_lseg(struct nfs4_ff_layout_segment *fls)
 +{
 +      if (fls) {
 +              ff_layout_free_mirror_array(fls);
 +              kfree(fls);
 +      }
 +}
 +
 +static void ff_layout_sort_mirrors(struct nfs4_ff_layout_segment *fls)
 +{
 +      struct nfs4_ff_layout_mirror *tmp;
 +      int i, j;
 +
 +      for (i = 0; i < fls->mirror_array_cnt - 1; i++) {
 +              for (j = i + 1; j < fls->mirror_array_cnt; j++)
 +                      if (fls->mirror_array[i]->efficiency <
 +                          fls->mirror_array[j]->efficiency) {
 +                              tmp = fls->mirror_array[i];
 +                              fls->mirror_array[i] = fls->mirror_array[j];
 +                              fls->mirror_array[j] = tmp;
 +                      }
 +      }
 +}
 +
 +static struct pnfs_layout_segment *
 +ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
 +                   struct nfs4_layoutget_res *lgr,
 +                   gfp_t gfp_flags)
 +{
 +      struct pnfs_layout_segment *ret;
 +      struct nfs4_ff_layout_segment *fls = NULL;
 +      struct xdr_stream stream;
 +      struct xdr_buf buf;
 +      struct page *scratch;
 +      u64 stripe_unit;
 +      u32 mirror_array_cnt;
 +      __be32 *p;
 +      int i, rc;
 +
 +      dprintk("--> %s\n", __func__);
 +      scratch = alloc_page(gfp_flags);
 +      if (!scratch)
 +              return ERR_PTR(-ENOMEM);
 +
 +      xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages,
 +                            lgr->layoutp->len);
 +      xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
 +
 +      /* stripe unit and mirror_array_cnt */
 +      rc = -EIO;
 +      p = xdr_inline_decode(&stream, 8 + 4);
 +      if (!p)
 +              goto out_err_free;
 +
 +      p = xdr_decode_hyper(p, &stripe_unit);
 +      mirror_array_cnt = be32_to_cpup(p++);
 +      dprintk("%s: stripe_unit=%llu mirror_array_cnt=%u\n", __func__,
 +              stripe_unit, mirror_array_cnt);
 +
 +      if (mirror_array_cnt > NFS4_FLEXFILE_LAYOUT_MAX_MIRROR_CNT ||
 +          mirror_array_cnt == 0)
 +              goto out_err_free;
 +
 +      rc = -ENOMEM;
 +      fls = kzalloc(sizeof(*fls), gfp_flags);
 +      if (!fls)
 +              goto out_err_free;
 +
 +      fls->mirror_array_cnt = mirror_array_cnt;
 +      fls->stripe_unit = stripe_unit;
 +      fls->mirror_array = kcalloc(fls->mirror_array_cnt,
 +                                  sizeof(fls->mirror_array[0]), gfp_flags);
 +      if (fls->mirror_array == NULL)
 +              goto out_err_free;
 +
 +      for (i = 0; i < fls->mirror_array_cnt; i++) {
 +              struct nfs4_deviceid devid;
 +              struct nfs4_deviceid_node *idnode;
 +              u32 ds_count;
 +              u32 fh_count;
 +              int j;
 +
 +              rc = -EIO;
 +              p = xdr_inline_decode(&stream, 4);
 +              if (!p)
 +                      goto out_err_free;
 +              ds_count = be32_to_cpup(p);
 +
 +              /* FIXME: allow for striping? */
 +              if (ds_count != 1)
 +                      goto out_err_free;
 +
 +              fls->mirror_array[i] =
 +                      kzalloc(sizeof(struct nfs4_ff_layout_mirror),
 +                              gfp_flags);
 +              if (fls->mirror_array[i] == NULL) {
 +                      rc = -ENOMEM;
 +                      goto out_err_free;
 +              }
 +
 +              spin_lock_init(&fls->mirror_array[i]->lock);
 +              fls->mirror_array[i]->ds_count = ds_count;
 +
 +              /* deviceid */
 +              rc = decode_deviceid(&stream, &devid);
 +              if (rc)
 +                      goto out_err_free;
 +
 +              idnode = nfs4_find_get_deviceid(NFS_SERVER(lh->plh_inode),
 +                                              &devid, lh->plh_lc_cred,
 +                                              gfp_flags);
 +              /*
 +               * upon success, mirror_ds is allocated by previous
 +               * getdeviceinfo, or newly by .alloc_deviceid_node
 +               * nfs4_find_get_deviceid failure is indeed a getdeviceinfo failure
 +               */
 +              if (idnode)
 +                      fls->mirror_array[i]->mirror_ds =
 +                              FF_LAYOUT_MIRROR_DS(idnode);
 +              else
 +                      goto out_err_free;
 +
 +              /* efficiency */
 +              rc = -EIO;
 +              p = xdr_inline_decode(&stream, 4);
 +              if (!p)
 +                      goto out_err_free;
 +              fls->mirror_array[i]->efficiency = be32_to_cpup(p);
 +
 +              /* stateid */
 +              rc = decode_stateid(&stream, &fls->mirror_array[i]->stateid);
 +              if (rc)
 +                      goto out_err_free;
 +
 +              /* fh */
 +              p = xdr_inline_decode(&stream, 4);
 +              if (!p)
 +                      goto out_err_free;
 +              fh_count = be32_to_cpup(p);
 +
 +              fls->mirror_array[i]->fh_versions =
 +                      kzalloc(fh_count * sizeof(struct nfs_fh),
 +                              gfp_flags);
 +              if (fls->mirror_array[i]->fh_versions == NULL) {
 +                      rc = -ENOMEM;
 +                      goto out_err_free;
 +              }
 +
 +              for (j = 0; j < fh_count; j++) {
 +                      rc = decode_nfs_fh(&stream,
 +                                         &fls->mirror_array[i]->fh_versions[j]);
 +                      if (rc)
 +                              goto out_err_free;
 +              }
 +
 +              fls->mirror_array[i]->fh_versions_cnt = fh_count;
 +
 +              /* user */
 +              rc = decode_name(&stream, &fls->mirror_array[i]->uid);
 +              if (rc)
 +                      goto out_err_free;
 +
 +              /* group */
 +              rc = decode_name(&stream, &fls->mirror_array[i]->gid);
 +              if (rc)
 +                      goto out_err_free;
 +
 +              dprintk("%s: uid %d gid %d\n", __func__,
 +                      fls->mirror_array[i]->uid,
 +                      fls->mirror_array[i]->gid);
 +      }
 +
 +      ff_layout_sort_mirrors(fls);
 +      rc = ff_layout_check_layout(lgr);
 +      if (rc)
 +              goto out_err_free;
 +
 +      ret = &fls->generic_hdr;
 +      dprintk("<-- %s (success)\n", __func__);
 +out_free_page:
 +      __free_page(scratch);
 +      return ret;
 +out_err_free:
 +      _ff_layout_free_lseg(fls);
 +      ret = ERR_PTR(rc);
 +      dprintk("<-- %s (%d)\n", __func__, rc);
 +      goto out_free_page;
 +}
 +
 +static bool ff_layout_has_rw_segments(struct pnfs_layout_hdr *layout)
 +{
 +      struct pnfs_layout_segment *lseg;
 +
 +      list_for_each_entry(lseg, &layout->plh_segs, pls_list)
 +              if (lseg->pls_range.iomode == IOMODE_RW)
 +                      return true;
 +
 +      return false;
 +}
 +
 +static void
 +ff_layout_free_lseg(struct pnfs_layout_segment *lseg)
 +{
 +      struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg);
 +      int i;
 +
 +      dprintk("--> %s\n", __func__);
 +
 +      for (i = 0; i < fls->mirror_array_cnt; i++) {
 +              if (fls->mirror_array[i]) {
 +                      nfs4_ff_layout_put_deviceid(fls->mirror_array[i]->mirror_ds);
 +                      fls->mirror_array[i]->mirror_ds = NULL;
 +                      if (fls->mirror_array[i]->cred) {
 +                              put_rpccred(fls->mirror_array[i]->cred);
 +                              fls->mirror_array[i]->cred = NULL;
 +                      }
 +              }
 +      }
 +
 +      if (lseg->pls_range.iomode == IOMODE_RW) {
 +              struct nfs4_flexfile_layout *ffl;
 +              struct inode *inode;
 +
 +              ffl = FF_LAYOUT_FROM_HDR(lseg->pls_layout);
 +              inode = ffl->generic_hdr.plh_inode;
 +              spin_lock(&inode->i_lock);
 +              if (!ff_layout_has_rw_segments(lseg->pls_layout)) {
 +                      ffl->commit_info.nbuckets = 0;
 +                      kfree(ffl->commit_info.buckets);
 +                      ffl->commit_info.buckets = NULL;
 +              }
 +              spin_unlock(&inode->i_lock);
 +      }
 +      _ff_layout_free_lseg(fls);
 +}
 +
 +/* Return 1 until we have multiple lsegs support */
 +static int
 +ff_layout_get_lseg_count(struct nfs4_ff_layout_segment *fls)
 +{
 +      return 1;
 +}
 +
 +static int
 +ff_layout_alloc_commit_info(struct pnfs_layout_segment *lseg,
 +                          struct nfs_commit_info *cinfo,
 +                          gfp_t gfp_flags)
 +{
 +      struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg);
 +      struct pnfs_commit_bucket *buckets;
 +      int size;
 +
 +      if (cinfo->ds->nbuckets != 0) {
 +              /* This assumes there is only one RW lseg per file.
 +               * To support multiple lseg per file, we need to
 +               * change struct pnfs_commit_bucket to allow dynamic
 +               * increasing nbuckets.
 +               */
 +              return 0;
 +      }
 +
 +      size = ff_layout_get_lseg_count(fls) * FF_LAYOUT_MIRROR_COUNT(lseg);
 +
 +      buckets = kcalloc(size, sizeof(struct pnfs_commit_bucket),
 +                        gfp_flags);
 +      if (!buckets)
 +              return -ENOMEM;
 +      else {
 +              int i;
 +
 +              spin_lock(cinfo->lock);
 +              if (cinfo->ds->nbuckets != 0)
 +                      kfree(buckets);
 +              else {
 +                      cinfo->ds->buckets = buckets;
 +                      cinfo->ds->nbuckets = size;
 +                      for (i = 0; i < size; i++) {
 +                              INIT_LIST_HEAD(&buckets[i].written);
 +                              INIT_LIST_HEAD(&buckets[i].committing);
 +                              /* mark direct verifier as unset */
 +                              buckets[i].direct_verf.committed =
 +                                      NFS_INVALID_STABLE_HOW;
 +                      }
 +              }
 +              spin_unlock(cinfo->lock);
 +              return 0;
 +      }
 +}
 +
 +static struct nfs4_pnfs_ds *
 +ff_layout_choose_best_ds_for_read(struct nfs_pageio_descriptor *pgio,
 +                                int *best_idx)
 +{
 +      struct nfs4_ff_layout_segment *fls;
 +      struct nfs4_pnfs_ds *ds;
 +      int idx;
 +
 +      fls = FF_LAYOUT_LSEG(pgio->pg_lseg);
 +      /* mirrors are sorted by efficiency */
 +      for (idx = 0; idx < fls->mirror_array_cnt; idx++) {
 +              ds = nfs4_ff_layout_prepare_ds(pgio->pg_lseg, idx, false);
 +              if (ds) {
 +                      *best_idx = idx;
 +                      return ds;
 +              }
 +      }
 +
 +      return NULL;
 +}
 +
 +static void
 +ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio,
 +                      struct nfs_page *req)
 +{
 +      struct nfs_pgio_mirror *pgm;
 +      struct nfs4_ff_layout_mirror *mirror;
 +      struct nfs4_pnfs_ds *ds;
 +      int ds_idx;
 +
 +      /* Use full layout for now */
 +      if (!pgio->pg_lseg)
 +              pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
 +                                                 req->wb_context,
 +                                                 0,
 +                                                 NFS4_MAX_UINT64,
 +                                                 IOMODE_READ,
 +                                                 GFP_KERNEL);
 +      /* If no lseg, fall back to read through mds */
 +      if (pgio->pg_lseg == NULL)
 +              goto out_mds;
 +
 +      ds = ff_layout_choose_best_ds_for_read(pgio, &ds_idx);
 +      if (!ds)
 +              goto out_mds;
 +      mirror = FF_LAYOUT_COMP(pgio->pg_lseg, ds_idx);
 +
 +      pgio->pg_mirror_idx = ds_idx;
 +
 +      /* read always uses only one mirror - idx 0 for pgio layer */
 +      pgm = &pgio->pg_mirrors[0];
 +      pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].rsize;
 +
 +      return;
 +out_mds:
 +      pnfs_put_lseg(pgio->pg_lseg);
 +      pgio->pg_lseg = NULL;
 +      nfs_pageio_reset_read_mds(pgio);
 +}
 +
 +static void
 +ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio,
 +                      struct nfs_page *req)
 +{
 +      struct nfs4_ff_layout_mirror *mirror;
 +      struct nfs_pgio_mirror *pgm;
 +      struct nfs_commit_info cinfo;
 +      struct nfs4_pnfs_ds *ds;
 +      int i;
 +      int status;
 +
 +      if (!pgio->pg_lseg)
 +              pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
 +                                                 req->wb_context,
 +                                                 0,
 +                                                 NFS4_MAX_UINT64,
 +                                                 IOMODE_RW,
 +                                                 GFP_NOFS);
 +      /* If no lseg, fall back to write through mds */
 +      if (pgio->pg_lseg == NULL)
 +              goto out_mds;
 +
 +      nfs_init_cinfo(&cinfo, pgio->pg_inode, pgio->pg_dreq);
 +      status = ff_layout_alloc_commit_info(pgio->pg_lseg, &cinfo, GFP_NOFS);
 +      if (status < 0)
 +              goto out_mds;
 +
 +      /* Use a direct mapping of ds_idx to pgio mirror_idx */
 +      if (WARN_ON_ONCE(pgio->pg_mirror_count !=
 +          FF_LAYOUT_MIRROR_COUNT(pgio->pg_lseg)))
 +              goto out_mds;
 +
 +      for (i = 0; i < pgio->pg_mirror_count; i++) {
 +              ds = nfs4_ff_layout_prepare_ds(pgio->pg_lseg, i, true);
 +              if (!ds)
 +                      goto out_mds;
 +              pgm = &pgio->pg_mirrors[i];
 +              mirror = FF_LAYOUT_COMP(pgio->pg_lseg, i);
 +              pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].wsize;
 +      }
 +
 +      return;
 +
 +out_mds:
 +      pnfs_put_lseg(pgio->pg_lseg);
 +      pgio->pg_lseg = NULL;
 +      nfs_pageio_reset_write_mds(pgio);
 +}
 +
 +static unsigned int
 +ff_layout_pg_get_mirror_count_write(struct nfs_pageio_descriptor *pgio,
 +                                  struct nfs_page *req)
 +{
 +      if (!pgio->pg_lseg)
 +              pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
 +                                                 req->wb_context,
 +                                                 0,
 +                                                 NFS4_MAX_UINT64,
 +                                                 IOMODE_RW,
 +                                                 GFP_NOFS);
 +      if (pgio->pg_lseg)
 +              return FF_LAYOUT_MIRROR_COUNT(pgio->pg_lseg);
 +
 +      /* no lseg means that pnfs is not in use, so no mirroring here */
 +      pnfs_put_lseg(pgio->pg_lseg);
 +      pgio->pg_lseg = NULL;
 +      nfs_pageio_reset_write_mds(pgio);
 +      return 1;
 +}
 +
 +static const struct nfs_pageio_ops ff_layout_pg_read_ops = {
 +      .pg_init = ff_layout_pg_init_read,
 +      .pg_test = pnfs_generic_pg_test,
 +      .pg_doio = pnfs_generic_pg_readpages,
 +      .pg_cleanup = pnfs_generic_pg_cleanup,
 +};
 +
 +static const struct nfs_pageio_ops ff_layout_pg_write_ops = {
 +      .pg_init = ff_layout_pg_init_write,
 +      .pg_test = pnfs_generic_pg_test,
 +      .pg_doio = pnfs_generic_pg_writepages,
 +      .pg_get_mirror_count = ff_layout_pg_get_mirror_count_write,
 +      .pg_cleanup = pnfs_generic_pg_cleanup,
 +};
 +
 +static void ff_layout_reset_write(struct nfs_pgio_header *hdr, bool retry_pnfs)
 +{
 +      struct rpc_task *task = &hdr->task;
 +
 +      pnfs_layoutcommit_inode(hdr->inode, false);
 +
 +      if (retry_pnfs) {
 +              dprintk("%s Reset task %5u for i/o through pNFS "
 +                      "(req %s/%llu, %u bytes @ offset %llu)\n", __func__,
 +                      hdr->task.tk_pid,
 +                      hdr->inode->i_sb->s_id,
 +                      (unsigned long long)NFS_FILEID(hdr->inode),
 +                      hdr->args.count,
 +                      (unsigned long long)hdr->args.offset);
 +
 +              if (!hdr->dreq) {
 +                      struct nfs_open_context *ctx;
 +
 +                      ctx = nfs_list_entry(hdr->pages.next)->wb_context;
 +                      set_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags);
 +                      hdr->completion_ops->error_cleanup(&hdr->pages);
 +              } else {
 +                      nfs_direct_set_resched_writes(hdr->dreq);
 +                      /* fake unstable write to let common nfs resend pages */
 +                      hdr->verf.committed = NFS_UNSTABLE;
 +                      hdr->good_bytes = 0;
 +              }
 +              return;
 +      }
 +
 +      if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
 +              dprintk("%s Reset task %5u for i/o through MDS "
 +                      "(req %s/%llu, %u bytes @ offset %llu)\n", __func__,
 +                      hdr->task.tk_pid,
 +                      hdr->inode->i_sb->s_id,
 +                      (unsigned long long)NFS_FILEID(hdr->inode),
 +                      hdr->args.count,
 +                      (unsigned long long)hdr->args.offset);
 +
 +              task->tk_status = pnfs_write_done_resend_to_mds(hdr);
 +      }
 +}
 +
 +static void ff_layout_reset_read(struct nfs_pgio_header *hdr)
 +{
 +      struct rpc_task *task = &hdr->task;
 +
 +      pnfs_layoutcommit_inode(hdr->inode, false);
 +
 +      if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
 +              dprintk("%s Reset task %5u for i/o through MDS "
 +                      "(req %s/%llu, %u bytes @ offset %llu)\n", __func__,
 +                      hdr->task.tk_pid,
 +                      hdr->inode->i_sb->s_id,
 +                      (unsigned long long)NFS_FILEID(hdr->inode),
 +                      hdr->args.count,
 +                      (unsigned long long)hdr->args.offset);
 +
 +              task->tk_status = pnfs_read_done_resend_to_mds(hdr);
 +      }
 +}
 +
 +static int ff_layout_async_handle_error_v4(struct rpc_task *task,
 +                                         struct nfs4_state *state,
 +                                         struct nfs_client *clp,
 +                                         struct pnfs_layout_segment *lseg,
 +                                         int idx)
 +{
 +      struct pnfs_layout_hdr *lo = lseg->pls_layout;
 +      struct inode *inode = lo->plh_inode;
 +      struct nfs_server *mds_server = NFS_SERVER(inode);
 +
 +      struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx);
 +      struct nfs_client *mds_client = mds_server->nfs_client;
 +      struct nfs4_slot_table *tbl = &clp->cl_session->fc_slot_table;
 +
 +      if (task->tk_status >= 0)
 +              return 0;
 +
 +      switch (task->tk_status) {
 +      /* MDS state errors */
 +      case -NFS4ERR_DELEG_REVOKED:
 +      case -NFS4ERR_ADMIN_REVOKED:
 +      case -NFS4ERR_BAD_STATEID:
 +              if (state == NULL)
 +                      break;
 +              nfs_remove_bad_delegation(state->inode);
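 +              /* fall through */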
 +      case -NFS4ERR_OPENMODE:
 +              if (state == NULL)
 +                      break;
 +              if (nfs4_schedule_stateid_recovery(mds_server, state) < 0)
 +                      goto out_bad_stateid;
 +              goto wait_on_recovery;
 +      case -NFS4ERR_EXPIRED:
 +              if (state != NULL) {
 +                      if (nfs4_schedule_stateid_recovery(mds_server, state) < 0)
 +                              goto out_bad_stateid;
 +              }
 +              nfs4_schedule_lease_recovery(mds_client);
 +              goto wait_on_recovery;
 +      /* DS session errors */
 +      case -NFS4ERR_BADSESSION:
 +      case -NFS4ERR_BADSLOT:
 +      case -NFS4ERR_BAD_HIGH_SLOT:
 +      case -NFS4ERR_DEADSESSION:
 +      case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
 +      case -NFS4ERR_SEQ_FALSE_RETRY:
 +      case -NFS4ERR_SEQ_MISORDERED:
 +              dprintk("%s ERROR %d, Reset session. Exchangeid "
 +                      "flags 0x%x\n", __func__, task->tk_status,
 +                      clp->cl_exchange_flags);
 +              nfs4_schedule_session_recovery(clp->cl_session, task->tk_status);
 +              break;
 +      case -NFS4ERR_DELAY:
 +      case -NFS4ERR_GRACE:
 +              rpc_delay(task, FF_LAYOUT_POLL_RETRY_MAX);
 +              break;
 +      case -NFS4ERR_RETRY_UNCACHED_REP:
 +              break;
 +      /* Invalidate Layout errors */
 +      case -NFS4ERR_PNFS_NO_LAYOUT:
 +      case -ESTALE:           /* mapped NFS4ERR_STALE */
 +      case -EBADHANDLE:       /* mapped NFS4ERR_BADHANDLE */
 +      case -EISDIR:           /* mapped NFS4ERR_ISDIR */
 +      case -NFS4ERR_FHEXPIRED:
 +      case -NFS4ERR_WRONG_TYPE:
 +              dprintk("%s Invalid layout error %d\n", __func__,
 +                      task->tk_status);
 +              /*
 +               * Destroy the layout so new i/o will get a new layout.
 +               * The layout will not be destroyed until all current lseg
 +               * references are put.  Marking the layout invalid redirects
 +               * failed i/o, and all i/o waiting on the slot table, to the
 +               * MDS until a new valid layout is obtained.
 +               */
 +              pnfs_destroy_layout(NFS_I(inode));
 +              rpc_wake_up(&tbl->slot_tbl_waitq);
 +              goto reset;
 +      /* RPC connection errors */
 +      case -ECONNREFUSED:
 +      case -EHOSTDOWN:
 +      case -EHOSTUNREACH:
 +      case -ENETUNREACH:
 +      case -EIO:
 +      case -ETIMEDOUT:
 +      case -EPIPE:
 +              dprintk("%s DS connection error %d\n", __func__,
 +                      task->tk_status);
 +              nfs4_mark_deviceid_unavailable(devid);
 +              rpc_wake_up(&tbl->slot_tbl_waitq);
 +              /* fall through */
 +      default:
 +              if (ff_layout_has_available_ds(lseg))
 +                      return -NFS4ERR_RESET_TO_PNFS;
 +reset:
 +              dprintk("%s Retry through MDS. Error %d\n", __func__,
 +                      task->tk_status);
 +              return -NFS4ERR_RESET_TO_MDS;
 +      }
 +out:
 +      task->tk_status = 0;
 +      return -EAGAIN;
 +out_bad_stateid:
 +      task->tk_status = -EIO;
 +      return 0;
 +wait_on_recovery:
 +      rpc_sleep_on(&mds_client->cl_rpcwaitq, task, NULL);
 +      if (test_bit(NFS4CLNT_MANAGER_RUNNING, &mds_client->cl_state) == 0)
 +              rpc_wake_up_queued_task(&mds_client->cl_rpcwaitq, task);
 +      goto out;
 +}
 +
 +/* Retry all errors through either pNFS or MDS except for -EJUKEBOX */
 +static int ff_layout_async_handle_error_v3(struct rpc_task *task,
 +                                         struct pnfs_layout_segment *lseg,
 +                                         int idx)
 +{
 +      struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx);
 +
 +      if (task->tk_status >= 0)
 +              return 0;
 +
 +      if (task->tk_status != -EJUKEBOX) {
 +              dprintk("%s DS connection error %d\n", __func__,
 +                      task->tk_status);
 +              nfs4_mark_deviceid_unavailable(devid);
 +              if (ff_layout_has_available_ds(lseg))
 +                      return -NFS4ERR_RESET_TO_PNFS;
 +              else
 +                      return -NFS4ERR_RESET_TO_MDS;
 +      }
 +
 +      /* tk_status is -EJUKEBOX here: count the delay and retry */
 +      nfs_inc_stats(lseg->pls_layout->plh_inode, NFSIOS_DELAY);
 +      task->tk_status = 0;
 +      rpc_restart_call(task);
 +      rpc_delay(task, NFS_JUKEBOX_RETRY_TIME);
 +      return -EAGAIN;
 +}
 +
 +static int ff_layout_async_handle_error(struct rpc_task *task,
 +                                      struct nfs4_state *state,
 +                                      struct nfs_client *clp,
 +                                      struct pnfs_layout_segment *lseg,
 +                                      int idx)
 +{
 +      int vers = clp->cl_nfs_mod->rpc_vers->number;
 +
 +      switch (vers) {
 +      case 3:
 +              return ff_layout_async_handle_error_v3(task, lseg, idx);
 +      case 4:
 +              return ff_layout_async_handle_error_v4(task, state, clp,
 +                                                     lseg, idx);
 +      default:
 +              /* should never happen */
 +              WARN_ON_ONCE(1);
 +              return 0;
 +      }
 +}
 +
 +static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg,
 +                                      int idx, u64 offset, u64 length,
 +                                      u32 status, int opnum)
 +{
 +      struct nfs4_ff_layout_mirror *mirror;
 +      int err;
 +
 +      mirror = FF_LAYOUT_COMP(lseg, idx);
 +      err = ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout),
 +                                     mirror, offset, length, status, opnum,
 +                                     GFP_NOIO);
 +      dprintk("%s: err %d op %d status %u\n", __func__, err, opnum, status);
 +}
 +
 +/* NFS_PROTO call done callback routines */
 +
 +static int ff_layout_read_done_cb(struct rpc_task *task,
 +                              struct nfs_pgio_header *hdr)
 +{
 +      struct inode *inode;
 +      int err;
 +
 +      trace_nfs4_pnfs_read(hdr, task->tk_status);
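 +      /* map a DS timeout to NFS4ERR_NXIO so it is tracked as a DS error */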
 +      if (task->tk_status == -ETIMEDOUT && !hdr->res.op_status)
 +              hdr->res.op_status = NFS4ERR_NXIO;
 +      if (task->tk_status < 0 && hdr->res.op_status)
 +              ff_layout_io_track_ds_error(hdr->lseg, hdr->pgio_mirror_idx,
 +                                          hdr->args.offset, hdr->args.count,
 +                                          hdr->res.op_status, OP_READ);
 +      err = ff_layout_async_handle_error(task, hdr->args.context->state,
 +                                         hdr->ds_clp, hdr->lseg,
 +                                         hdr->pgio_mirror_idx);
 +
 +      switch (err) {
 +      case -NFS4ERR_RESET_TO_PNFS:
 +              set_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
 +                      &hdr->lseg->pls_layout->plh_flags);
 +              pnfs_read_resend_pnfs(hdr);
 +              return task->tk_status;
 +      case -NFS4ERR_RESET_TO_MDS:
 +              inode = hdr->lseg->pls_layout->plh_inode;
 +              pnfs_error_mark_layout_for_return(inode, hdr->lseg);
 +              ff_layout_reset_read(hdr);
 +              return task->tk_status;
 +      case -EAGAIN:
 +              rpc_restart_call_prepare(task);
 +              return -EAGAIN;
 +      }
 +
 +      return 0;
 +}
 +
 +/*
 + * We reference the rpc_cred of the first WRITE that triggers the need for
 + * a LAYOUTCOMMIT, and use it to send the layoutcommit compound.
 + * RFC 5661 is not clear about which credential should be used.
 + *
 + * The flexfiles client should treat a FILE_SYNC reply from the DS as
 + * DATA_SYNC, so to follow
 + * http://www.rfc-editor.org/errata_search.php?rfc=5661&eid=2751
 + * we always send a layoutcommit after DS writes.
 + */
 +static void
 +ff_layout_set_layoutcommit(struct nfs_pgio_header *hdr)
 +{
 +      pnfs_set_layoutcommit(hdr);
 +      dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino,
 +              (unsigned long) NFS_I(hdr->inode)->layout->plh_lwb);
 +}
 +
 +static bool
 +ff_layout_reset_to_mds(struct pnfs_layout_segment *lseg, int idx)
 +{
 +      /* No mirroring for now */
 +      struct nfs4_deviceid_node *node = FF_LAYOUT_DEVID_NODE(lseg, idx);
 +
 +      return ff_layout_test_devid_unavailable(node);
 +}
 +
 +static int ff_layout_read_prepare_common(struct rpc_task *task,
 +                                       struct nfs_pgio_header *hdr)
 +{
 +      if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) {
 +              rpc_exit(task, -EIO);
 +              return -EIO;
 +      }
 +      if (ff_layout_reset_to_mds(hdr->lseg, hdr->pgio_mirror_idx)) {
 +              dprintk("%s task %u reset io to MDS\n", __func__, task->tk_pid);
 +              if (ff_layout_has_available_ds(hdr->lseg))
 +                      pnfs_read_resend_pnfs(hdr);
 +              else
 +                      ff_layout_reset_read(hdr);
 +              rpc_exit(task, 0);
 +              return -EAGAIN;
 +      }
 +      hdr->pgio_done_cb = ff_layout_read_done_cb;
 +
 +      return 0;
 +}
 +
 +/*
 + * Call ops for the async read/write cases.
 + * In the case of dense layouts, the offset needs to be reset to its
 + * original value.
 + */
 +static void ff_layout_read_prepare_v3(struct rpc_task *task, void *data)
 +{
 +      struct nfs_pgio_header *hdr = data;
 +
 +      if (ff_layout_read_prepare_common(task, hdr))
 +              return;
 +
 +      rpc_call_start(task);
 +}
 +
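 +/*
 + * An NFSv4.1 data server has a session; an NFSv4.0 data server only
 + * has a slot table.  Pick the matching sequence setup.
 + */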
 +static int ff_layout_setup_sequence(struct nfs_client *ds_clp,
 +                                  struct nfs4_sequence_args *args,
 +                                  struct nfs4_sequence_res *res,
 +                                  struct rpc_task *task)
 +{
 +      if (ds_clp->cl_session)
 +              return nfs41_setup_sequence(ds_clp->cl_session,
 +                                         args,
 +                                         res,
 +                                         task);
 +      return nfs40_setup_sequence(ds_clp->cl_slot_tbl,
 +                                 args,
 +                                 res,
 +                                 task);
 +}
 +
 +static void ff_layout_read_prepare_v4(struct rpc_task *task, void *data)
 +{
 +      struct nfs_pgio_header *hdr = data;
 +
 +      if (ff_layout_read_prepare_common(task, hdr))
 +              return;
 +
 +      if (ff_layout_setup_sequence(hdr->ds_clp,
 +                                   &hdr->args.seq_args,
 +                                   &hdr->res.seq_res,
 +                                   task))
 +              return;
 +
 +      if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context,
 +                      hdr->args.lock_context, FMODE_READ) == -EIO)
 +              rpc_exit(task, -EIO); /* lost lock, terminate I/O */
 +}
 +
 +static void ff_layout_read_call_done(struct rpc_task *task, void *data)
 +{
 +      struct nfs_pgio_header *hdr = data;
 +
 +      dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status);
 +
 +      if (test_bit(NFS_IOHDR_REDO, &hdr->flags) &&
 +          task->tk_status == 0) {
 +              nfs4_sequence_done(task, &hdr->res.seq_res);
 +              return;
 +      }
 +
 +      /* Note: this may cause the RPC to be resent */
 +      hdr->mds_ops->rpc_call_done(task, hdr);
 +}
 +
 +static void ff_layout_read_count_stats(struct rpc_task *task, void *data)
 +{
 +      struct nfs_pgio_header *hdr = data;
 +
 +      rpc_count_iostats_metrics(task,
 +          &NFS_CLIENT(hdr->inode)->cl_metrics[NFSPROC4_CLNT_READ]);
 +}
 +
 +static int ff_layout_write_done_cb(struct rpc_task *task,
 +                              struct nfs_pgio_header *hdr)
 +{
 +      struct inode *inode;
 +      int err;
 +
 +      trace_nfs4_pnfs_write(hdr, task->tk_status);
 +      if (task->tk_status == -ETIMEDOUT && !hdr->res.op_status)
 +              hdr->res.op_status = NFS4ERR_NXIO;
 +      if (task->tk_status < 0 && hdr->res.op_status)
 +              ff_layout_io_track_ds_error(hdr->lseg, hdr->pgio_mirror_idx,
 +                                          hdr->args.offset, hdr->args.count,
 +                                          hdr->res.op_status, OP_WRITE);
 +      err = ff_layout_async_handle_error(task, hdr->args.context->state,
 +                                         hdr->ds_clp, hdr->lseg,
 +                                         hdr->pgio_mirror_idx);
 +
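 +      /*
 +       * RESET_TO_PNFS retries the write through pNFS against another
 +       * mirror, keeping the retry-layoutget flag; RESET_TO_MDS clears
 +       * it and falls back to writing through the MDS.
 +       */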
 +      switch (err) {
 +      case -NFS4ERR_RESET_TO_PNFS:
 +      case -NFS4ERR_RESET_TO_MDS:
 +              inode = hdr->lseg->pls_layout->plh_inode;
 +              pnfs_error_mark_layout_for_return(inode, hdr->lseg);
 +              if (err == -NFS4ERR_RESET_TO_PNFS) {
 +                      pnfs_set_retry_layoutget(hdr->lseg->pls_layout);
 +                      ff_layout_reset_write(hdr, true);
 +              } else {
 +                      pnfs_clear_retry_layoutget(hdr->lseg->pls_layout);
 +                      ff_layout_reset_write(hdr, false);
 +              }
 +              return task->tk_status;
 +      case -EAGAIN:
 +              rpc_restart_call_prepare(task);
 +              return -EAGAIN;
 +      }
 +
 +      if (hdr->res.verf->committed == NFS_FILE_SYNC ||
 +          hdr->res.verf->committed == NFS_DATA_SYNC)
 +              ff_layout_set_layoutcommit(hdr);
 +
 +      return 0;
 +}
 +
 +static int ff_layout_commit_done_cb(struct rpc_task *task,
 +                                   struct nfs_commit_data *data)
 +{
 +      struct inode *inode;
 +      int err;
 +
 +      trace_nfs4_pnfs_commit_ds(data, task->tk_status);
 +      if (task->tk_status == -ETIMEDOUT && !data->res.op_status)
 +              data->res.op_status = NFS4ERR_NXIO;
 +      if (task->tk_status < 0 && data->res.op_status)
 +              ff_layout_io_track_ds_error(data->lseg, data->ds_commit_index,
 +                                          data->args.offset, data->args.count,
 +                                          data->res.op_status, OP_COMMIT);
 +      err = ff_layout_async_handle_error(task, NULL, data->ds_clp,
 +                                         data->lseg, data->ds_commit_index);
 +
 +      switch (err) {
 +      case -NFS4ERR_RESET_TO_PNFS:
 +      case -NFS4ERR_RESET_TO_MDS:
 +              inode = data->lseg->pls_layout->plh_inode;
 +              pnfs_error_mark_layout_for_return(inode, data->lseg);
 +              if (err == -NFS4ERR_RESET_TO_PNFS)
 +                      pnfs_set_retry_layoutget(data->lseg->pls_layout);
 +              else
 +                      pnfs_clear_retry_layoutget(data->lseg->pls_layout);
 +              pnfs_generic_prepare_to_resend_writes(data);
 +              return -EAGAIN;
 +      case -EAGAIN:
 +              rpc_restart_call_prepare(task);
 +              return -EAGAIN;
 +      }
 +
 +      if (data->verf.committed == NFS_UNSTABLE)
 +              pnfs_commit_set_layoutcommit(data);
 +
 +      return 0;
 +}
 +
 +static int ff_layout_write_prepare_common(struct rpc_task *task,
 +                                        struct nfs_pgio_header *hdr)
 +{
 +      if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) {
 +              rpc_exit(task, -EIO);
 +              return -EIO;
 +      }
 +
 +      if (ff_layout_reset_to_mds(hdr->lseg, hdr->pgio_mirror_idx)) {
 +              bool retry_pnfs;
 +
 +              retry_pnfs = ff_layout_has_available_ds(hdr->lseg);
 +              dprintk("%s task %u reset io to %s\n", __func__,
 +                      task->tk_pid, retry_pnfs ? "pNFS" : "MDS");
 +              ff_layout_reset_write(hdr, retry_pnfs);
 +              rpc_exit(task, 0);
 +              return -EAGAIN;
 +      }
 +
 +      return 0;
 +}
 +
 +static void ff_layout_write_prepare_v3(struct rpc_task *task, void *data)
 +{
 +      struct nfs_pgio_header *hdr = data;
 +
 +      if (ff_layout_write_prepare_common(task, hdr))
 +              return;
 +
 +      rpc_call_start(task);
 +}
 +
 +static void ff_layout_write_prepare_v4(struct rpc_task *task, void *data)
 +{
 +      struct nfs_pgio_header *hdr = data;
 +
 +      if (ff_layout_write_prepare_common(task, hdr))
 +              return;
 +
 +      if (ff_layout_setup_sequence(hdr->ds_clp,
 +                                   &hdr->args.seq_args,
 +                                   &hdr->res.seq_res,
 +                                   task))
 +              return;
 +
 +      if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context,
 +                      hdr->args.lock_context, FMODE_WRITE) == -EIO)
 +              rpc_exit(task, -EIO); /* lost lock, terminate I/O */
 +}
 +
 +static void ff_layout_write_call_done(struct rpc_task *task, void *data)
 +{
 +      struct nfs_pgio_header *hdr = data;
 +
 +      if (test_bit(NFS_IOHDR_REDO, &hdr->flags) &&
 +          task->tk_status == 0) {
 +              nfs4_sequence_done(task, &hdr->res.seq_res);
 +              return;
 +      }
 +
 +      /* Note: this may cause the RPC to be resent */
 +      hdr->mds_ops->rpc_call_done(task, hdr);
 +}
 +
 +static void ff_layout_write_count_stats(struct rpc_task *task, void *data)
 +{
 +      struct nfs_pgio_header *hdr = data;
 +
 +      rpc_count_iostats_metrics(task,
 +          &NFS_CLIENT(hdr->inode)->cl_metrics[NFSPROC4_CLNT_WRITE]);
 +}
 +
 +static void ff_layout_commit_prepare_v3(struct rpc_task *task, void *data)
 +{
 +      rpc_call_start(task);
 +}
 +
 +static void ff_layout_commit_prepare_v4(struct rpc_task *task, void *data)
 +{
 +      struct nfs_commit_data *wdata = data;
 +
 +      ff_layout_setup_sequence(wdata->ds_clp,
 +                               &wdata->args.seq_args,
 +                               &wdata->res.seq_res,
 +                               task);
 +}
 +
 +static void ff_layout_commit_count_stats(struct rpc_task *task, void *data)
 +{
 +      struct nfs_commit_data *cdata = data;
 +
 +      rpc_count_iostats_metrics(task,
 +          &NFS_CLIENT(cdata->inode)->cl_metrics[NFSPROC4_CLNT_COMMIT]);
 +}
 +
 +static const struct rpc_call_ops ff_layout_read_call_ops_v3 = {
 +      .rpc_call_prepare = ff_layout_read_prepare_v3,
 +      .rpc_call_done = ff_layout_read_call_done,
 +      .rpc_count_stats = ff_layout_read_count_stats,
 +      .rpc_release = pnfs_generic_rw_release,
 +};
 +
 +static const struct rpc_call_ops ff_layout_read_call_ops_v4 = {
 +      .rpc_call_prepare = ff_layout_read_prepare_v4,
 +      .rpc_call_done = ff_layout_read_call_done,
 +      .rpc_count_stats = ff_layout_read_count_stats,
 +      .rpc_release = pnfs_generic_rw_release,
 +};
 +
 +static const struct rpc_call_ops ff_layout_write_call_ops_v3 = {
 +      .rpc_call_prepare = ff_layout_write_prepare_v3,
 +      .rpc_call_done = ff_layout_write_call_done,
 +      .rpc_count_stats = ff_layout_write_count_stats,
 +      .rpc_release = pnfs_generic_rw_release,
 +};
 +
 +static const struct rpc_call_ops ff_layout_write_call_ops_v4 = {
 +      .rpc_call_prepare = ff_layout_write_prepare_v4,
 +      .rpc_call_done = ff_layout_write_call_done,
 +      .rpc_count_stats = ff_layout_write_count_stats,
 +      .rpc_release = pnfs_generic_rw_release,
 +};
 +
 +static const struct rpc_call_ops ff_layout_commit_call_ops_v3 = {
 +      .rpc_call_prepare = ff_layout_commit_prepare_v3,
 +      .rpc_call_done = pnfs_generic_write_commit_done,
 +      .rpc_count_stats = ff_layout_commit_count_stats,
 +      .rpc_release = pnfs_generic_commit_release,
 +};
 +
 +static const struct rpc_call_ops ff_layout_commit_call_ops_v4 = {
 +      .rpc_call_prepare = ff_layout_commit_prepare_v4,
 +      .rpc_call_done = pnfs_generic_write_commit_done,
 +      .rpc_count_stats = ff_layout_commit_count_stats,
 +      .rpc_release = pnfs_generic_commit_release,
 +};
 +
 +static enum pnfs_try_status
 +ff_layout_read_pagelist(struct nfs_pgio_header *hdr)
 +{
 +      struct pnfs_layout_segment *lseg = hdr->lseg;
 +      struct nfs4_pnfs_ds *ds;
 +      struct rpc_clnt *ds_clnt;
 +      struct rpc_cred *ds_cred;
 +      loff_t offset = hdr->args.offset;
 +      u32 idx = hdr->pgio_mirror_idx;
 +      int vers;
 +      struct nfs_fh *fh;
 +
 +      dprintk("--> %s ino %lu pgbase %u req %zu@%llu\n",
 +              __func__, hdr->inode->i_ino,
 +              hdr->args.pgbase, (size_t)hdr->args.count, offset);
 +
 +      ds = nfs4_ff_layout_prepare_ds(lseg, idx, false);
 +      if (!ds)
 +              goto out_failed;
 +
 +      ds_clnt = nfs4_ff_find_or_create_ds_client(lseg, idx, ds->ds_clp,
 +                                                 hdr->inode);
 +      if (IS_ERR(ds_clnt))
 +              goto out_failed;
 +
 +      ds_cred = ff_layout_get_ds_cred(lseg, idx, hdr->cred);
 +      if (IS_ERR(ds_cred))
 +              goto out_failed;
 +
 +      vers = nfs4_ff_layout_ds_version(lseg, idx);
 +
 +      dprintk("%s USE DS: %s cl_count %d vers %d\n", __func__,
 +              ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count), vers);
 +
 +      atomic_inc(&ds->ds_clp->cl_count);
 +      hdr->ds_clp = ds->ds_clp;
 +      fh = nfs4_ff_layout_select_ds_fh(lseg, idx);
 +      if (fh)
 +              hdr->args.fh = fh;
 +
 +      /*
 +       * Note that if we ever decide to split across DSes,
 +       * then we may need to handle dense-like offsets.
 +       */
 +      hdr->args.offset = offset;
 +      hdr->mds_offset = offset;
 +
 +      /* Perform an asynchronous read to ds */
 +      nfs_initiate_pgio(ds_clnt, hdr, ds_cred, ds->ds_clp->rpc_ops,
 +                        vers == 3 ? &ff_layout_read_call_ops_v3 :
 +                                    &ff_layout_read_call_ops_v4,
 +                        0, RPC_TASK_SOFTCONN);
 +
 +      return PNFS_ATTEMPTED;
 +
 +out_failed:
 +      if (ff_layout_has_available_ds(lseg))
 +              return PNFS_TRY_AGAIN;
 +      return PNFS_NOT_ATTEMPTED;
 +}
 +
 +/* Perform async writes. */
 +static enum pnfs_try_status
 +ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync)
 +{
 +      struct pnfs_layout_segment *lseg = hdr->lseg;
 +      struct nfs4_pnfs_ds *ds;
 +      struct rpc_clnt *ds_clnt;
 +      struct rpc_cred *ds_cred;
 +      loff_t offset = hdr->args.offset;
 +      int vers;
 +      struct nfs_fh *fh;
 +      int idx = hdr->pgio_mirror_idx;
 +
 +      ds = nfs4_ff_layout_prepare_ds(lseg, idx, true);
 +      if (!ds)
 +              return PNFS_NOT_ATTEMPTED;
 +
 +      ds_clnt = nfs4_ff_find_or_create_ds_client(lseg, idx, ds->ds_clp,
 +                                                 hdr->inode);
 +      if (IS_ERR(ds_clnt))
 +              return PNFS_NOT_ATTEMPTED;
 +
 +      ds_cred = ff_layout_get_ds_cred(lseg, idx, hdr->cred);
 +      if (IS_ERR(ds_cred))
 +              return PNFS_NOT_ATTEMPTED;
 +
 +      vers = nfs4_ff_layout_ds_version(lseg, idx);
 +
 +      dprintk("%s ino %lu sync %d req %zu@%llu DS: %s cl_count %d vers %d\n",
 +              __func__, hdr->inode->i_ino, sync, (size_t) hdr->args.count,
 +              offset, ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count),
 +              vers);
 +
 +      hdr->pgio_done_cb = ff_layout_write_done_cb;
 +      atomic_inc(&ds->ds_clp->cl_count);
 +      hdr->ds_clp = ds->ds_clp;
 +      hdr->ds_commit_idx = idx;
 +      fh = nfs4_ff_layout_select_ds_fh(lseg, idx);
 +      if (fh)
 +              hdr->args.fh = fh;
 +
 +      /*
 +       * Note that if we ever decide to split across DSes,
 +       * then we may need to handle dense-like offsets.
 +       */
 +      hdr->args.offset = offset;
 +
 +      /* Perform an asynchronous write */
 +      nfs_initiate_pgio(ds_clnt, hdr, ds_cred, ds->ds_clp->rpc_ops,
 +                        vers == 3 ? &ff_layout_write_call_ops_v3 :
 +                                    &ff_layout_write_call_ops_v4,
 +                        sync, RPC_TASK_SOFTCONN);
 +      return PNFS_ATTEMPTED;
 +}
 +
 +static void
 +ff_layout_mark_request_commit(struct nfs_page *req,
 +                            struct pnfs_layout_segment *lseg,
 +                            struct nfs_commit_info *cinfo,
 +                            u32 ds_commit_idx)
 +{
 +      struct list_head *list;
 +      struct pnfs_commit_bucket *buckets;
 +
 +      spin_lock(cinfo->lock);
 +      buckets = cinfo->ds->buckets;
 +      list = &buckets[ds_commit_idx].written;
 +      if (list_empty(list)) {
 +              /* Non-empty buckets hold a reference on the lseg.  That ref
 +               * is normally transferred to the COMMIT call and released
 +               * there.  It could also be released if the last req is pulled
 +               * off due to a rewrite, in which case it will be done in
 +               * pnfs_common_clear_request_commit
 +               */
 +              WARN_ON_ONCE(buckets[ds_commit_idx].wlseg != NULL);
 +              buckets[ds_commit_idx].wlseg = pnfs_get_lseg(lseg);
 +      }
 +      set_bit(PG_COMMIT_TO_DS, &req->wb_flags);
 +      cinfo->ds->nwritten++;
 +
 +      /* Open-coded nfs_request_add_commit_list(): we must add req to
 +       * the list without dropping the cinfo lock.
 +       */
 +      set_bit(PG_CLEAN, &(req)->wb_flags);
 +      nfs_list_add_request(req, list);
 +      cinfo->mds->ncommit++;
 +      spin_unlock(cinfo->lock);
 +      if (!cinfo->dreq) {
 +              inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
++              inc_bdi_stat(inode_to_bdi(page_file_mapping(req->wb_page)->host),
 +                           BDI_RECLAIMABLE);
 +              __mark_inode_dirty(req->wb_context->dentry->d_inode,
 +                                 I_DIRTY_DATASYNC);
 +      }
 +}
 +
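 +/* flexfiles uses one commit bucket per mirror, so the index maps 1:1 */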
 +static u32 calc_ds_index_from_commit(struct pnfs_layout_segment *lseg, u32 i)
 +{
 +      return i;
 +}
 +
 +static struct nfs_fh *
 +select_ds_fh_from_commit(struct pnfs_layout_segment *lseg, u32 i)
 +{
 +      struct nfs4_ff_layout_segment *flseg = FF_LAYOUT_LSEG(lseg);
 +
 +      /* FIXME: Assumes that there is only one NFS version available
 +       * for the DS.
 +       */
 +      return &flseg->mirror_array[i]->fh_versions[0];
 +}
 +
 +static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how)
 +{
 +      struct pnfs_layout_segment *lseg = data->lseg;
 +      struct nfs4_pnfs_ds *ds;
 +      struct rpc_clnt *ds_clnt;
 +      struct rpc_cred *ds_cred;
 +      u32 idx;
 +      int vers;
 +      struct nfs_fh *fh;
 +
 +      idx = calc_ds_index_from_commit(lseg, data->ds_commit_index);
 +      ds = nfs4_ff_layout_prepare_ds(lseg, idx, true);
 +      if (!ds)
 +              goto out_err;
 +
 +      ds_clnt = nfs4_ff_find_or_create_ds_client(lseg, idx, ds->ds_clp,
 +                                                 data->inode);
 +      if (IS_ERR(ds_clnt))
 +              goto out_err;
 +
 +      ds_cred = ff_layout_get_ds_cred(lseg, idx, data->cred);
 +      if (IS_ERR(ds_cred))
 +              goto out_err;
 +
 +      vers = nfs4_ff_layout_ds_version(lseg, idx);
 +
 +      dprintk("%s ino %lu, how %d cl_count %d vers %d\n", __func__,
 +              data->inode->i_ino, how, atomic_read(&ds->ds_clp->cl_count),
 +              vers);
 +      data->commit_done_cb = ff_layout_commit_done_cb;
 +      data->cred = ds_cred;
 +      atomic_inc(&ds->ds_clp->cl_count);
 +      data->ds_clp = ds->ds_clp;
 +      fh = select_ds_fh_from_commit(lseg, data->ds_commit_index);
 +      if (fh)
 +              data->args.fh = fh;
 +      return nfs_initiate_commit(ds_clnt, data, ds->ds_clp->rpc_ops,
 +                                 vers == 3 ? &ff_layout_commit_call_ops_v3 :
 +                                             &ff_layout_commit_call_ops_v4,
 +                                 how, RPC_TASK_SOFTCONN);
 +out_err:
 +      pnfs_generic_prepare_to_resend_writes(data);
 +      pnfs_generic_commit_release(data);
 +      return -EAGAIN;
 +}
 +
 +static int
 +ff_layout_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
 +                         int how, struct nfs_commit_info *cinfo)
 +{
 +      return pnfs_generic_commit_pagelist(inode, mds_pages, how, cinfo,
 +                                          ff_layout_initiate_commit);
 +}
 +
 +static struct pnfs_ds_commit_info *
 +ff_layout_get_ds_info(struct inode *inode)
 +{
 +      struct pnfs_layout_hdr *layout = NFS_I(inode)->layout;
 +
 +      if (layout == NULL)
 +              return NULL;
 +
 +      return &FF_LAYOUT_FROM_HDR(layout)->commit_info;
 +}
 +
 +static void
 +ff_layout_free_deviceid_node(struct nfs4_deviceid_node *d)
 +{
 +      nfs4_ff_layout_free_deviceid(container_of(d, struct nfs4_ff_layout_ds,
 +                                                id_node));
 +}
 +
 +static int ff_layout_encode_ioerr(struct nfs4_flexfile_layout *flo,
 +                                struct xdr_stream *xdr,
 +                                const struct nfs4_layoutreturn_args *args)
 +{
 +      struct pnfs_layout_hdr *hdr = &flo->generic_hdr;
 +      __be32 *start;
 +      int count = 0, ret = 0;
 +
 +      start = xdr_reserve_space(xdr, 4);
 +      if (unlikely(!start))
 +              return -E2BIG;
 +
 +      /* This assumes we always return _ALL_ layouts */
 +      spin_lock(&hdr->plh_inode->i_lock);
 +      ret = ff_layout_encode_ds_ioerr(flo, xdr, &count, &args->range);
 +      spin_unlock(&hdr->plh_inode->i_lock);
 +
 +      *start = cpu_to_be32(count);
 +
 +      return ret;
 +}
 +
 +/* report nothing for now */
 +static void ff_layout_encode_iostats(struct nfs4_flexfile_layout *flo,
 +                                   struct xdr_stream *xdr,
 +                                   const struct nfs4_layoutreturn_args *args)
 +{
 +      __be32 *p;
 +
 +      p = xdr_reserve_space(xdr, 4);
 +      if (likely(p))
 +              *p = cpu_to_be32(0);
 +}
 +
 +static struct nfs4_deviceid_node *
 +ff_layout_alloc_deviceid_node(struct nfs_server *server,
 +                            struct pnfs_device *pdev, gfp_t gfp_flags)
 +{
 +      struct nfs4_ff_layout_ds *dsaddr;
 +
 +      dsaddr = nfs4_ff_alloc_deviceid_node(server, pdev, gfp_flags);
 +      if (!dsaddr)
 +              return NULL;
 +      return &dsaddr->id_node;
 +}
 +
 +static void
 +ff_layout_encode_layoutreturn(struct pnfs_layout_hdr *lo,
 +                            struct xdr_stream *xdr,
 +                            const struct nfs4_layoutreturn_args *args)
 +{
 +      struct nfs4_flexfile_layout *flo = FF_LAYOUT_FROM_HDR(lo);
 +      __be32 *start;
 +
 +      dprintk("%s: Begin\n", __func__);
 +      start = xdr_reserve_space(xdr, 4);
 +      BUG_ON(!start);
 +
 +      if (ff_layout_encode_ioerr(flo, xdr, args))
 +              goto out;
 +
 +      ff_layout_encode_iostats(flo, xdr, args);
 +out:
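 +      /* back-fill the 4-byte length reserved at the start of the body */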
 +      *start = cpu_to_be32((xdr->p - start - 1) * 4);
 +      dprintk("%s: Return\n", __func__);
 +}
 +
 +static struct pnfs_layoutdriver_type flexfilelayout_type = {
 +      .id                     = LAYOUT_FLEX_FILES,
 +      .name                   = "LAYOUT_FLEX_FILES",
 +      .owner                  = THIS_MODULE,
 +      .alloc_layout_hdr       = ff_layout_alloc_layout_hdr,
 +      .free_layout_hdr        = ff_layout_free_layout_hdr,
 +      .alloc_lseg             = ff_layout_alloc_lseg,
 +      .free_lseg              = ff_layout_free_lseg,
 +      .pg_read_ops            = &ff_layout_pg_read_ops,
 +      .pg_write_ops           = &ff_layout_pg_write_ops,
 +      .get_ds_info            = ff_layout_get_ds_info,
 +      .free_deviceid_node     = ff_layout_free_deviceid_node,
 +      .mark_request_commit    = ff_layout_mark_request_commit,
 +      .clear_request_commit   = pnfs_generic_clear_request_commit,
 +      .scan_commit_lists      = pnfs_generic_scan_commit_lists,
 +      .recover_commit_reqs    = pnfs_generic_recover_commit_reqs,
 +      .commit_pagelist        = ff_layout_commit_pagelist,
 +      .read_pagelist          = ff_layout_read_pagelist,
 +      .write_pagelist         = ff_layout_write_pagelist,
 +      .alloc_deviceid_node    = ff_layout_alloc_deviceid_node,
 +      .encode_layoutreturn    = ff_layout_encode_layoutreturn,
 +};
 +
 +static int __init nfs4flexfilelayout_init(void)
 +{
 +      printk(KERN_INFO "%s: NFSv4 Flexfile Layout Driver Registering...\n",
 +             __func__);
 +      return pnfs_register_layoutdriver(&flexfilelayout_type);
 +}
 +
 +static void __exit nfs4flexfilelayout_exit(void)
 +{
 +      printk(KERN_INFO "%s: NFSv4 Flexfile Layout Driver Unregistering...\n",
 +             __func__);
 +      pnfs_unregister_layoutdriver(&flexfilelayout_type);
 +}
 +
 +MODULE_ALIAS("nfs-layouttype4-4");
 +
 +MODULE_LICENSE("GPL");
 +MODULE_DESCRIPTION("The NFSv4 flexfile layout driver");
 +
 +module_init(nfs4flexfilelayout_init);
 +module_exit(nfs4flexfilelayout_exit);
diff --combined fs/nfs/inode.c
index d2398c193bdaab4ce90d27917d3408e8d610b260,24aac72420f4d4db2cffe55650b5d9621a1f52ee..e4f0dcef8f5455e60676bf70d9d0489107ad7c79
@@@ -352,9 -352,8 +352,9 @@@ nfs_fhget(struct super_block *sb, struc
  
        nfs_attr_check_mountpoint(sb, fattr);
  
 -      if (((fattr->valid & NFS_ATTR_FATTR_FILEID) == 0) &&
 -          !nfs_attr_use_mounted_on_fileid(fattr))
 +      if (nfs_attr_use_mounted_on_fileid(fattr))
 +              fattr->fileid = fattr->mounted_on_fileid;
 +      else if ((fattr->valid & NFS_ATTR_FATTR_FILEID) == 0)
                goto out_no_inode;
        if ((fattr->valid & NFS_ATTR_FATTR_TYPE) == 0)
                goto out_no_inode;
                if (S_ISREG(inode->i_mode)) {
                        inode->i_fop = NFS_SB(sb)->nfs_client->rpc_ops->file_ops;
                        inode->i_data.a_ops = &nfs_file_aops;
-                       inode->i_data.backing_dev_info = &NFS_SB(sb)->backing_dev_info;
                } else if (S_ISDIR(inode->i_mode)) {
                        inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->dir_inode_ops;
                        inode->i_fop = &nfs_dir_operations;
@@@ -507,15 -505,10 +506,15 @@@ nfs_setattr(struct dentry *dentry, stru
                attr->ia_valid &= ~ATTR_MODE;
  
        if (attr->ia_valid & ATTR_SIZE) {
 +              loff_t i_size;
 +
                BUG_ON(!S_ISREG(inode->i_mode));
  
 -              if (attr->ia_size == i_size_read(inode))
 +              i_size = i_size_read(inode);
 +              if (attr->ia_size == i_size)
                        attr->ia_valid &= ~ATTR_SIZE;
 +              else if (attr->ia_size < i_size && IS_SWAPFILE(inode))
 +                      return -ETXTBSY;
        }
  
        /* Optimization: if the end result is no change, don't RPC */
diff --combined fs/nfs/internal.h
index 21469e6e3834270a767f7e6dd3c59970efb37a56,f519d4187332f04b2e18507ebae9496e97576f0a..212b8c883d22881b4258c2c7f7e611d9d85f0427
@@@ -6,7 -6,6 +6,7 @@@
  #include <linux/mount.h>
  #include <linux/security.h>
  #include <linux/crc32.h>
 +#include <linux/nfs_page.h>
  
  #define NFS_MS_MASK (MS_RDONLY|MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_SYNCHRONOUS)
  
@@@ -32,6 -31,8 +32,6 @@@ static inline int nfs_attr_use_mounted_
            (((fattr->valid & NFS_ATTR_FATTR_MOUNTPOINT) == 0) &&
             ((fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) == 0)))
                return 0;
 -
 -      fattr->fileid = fattr->mounted_on_fileid;
        return 1;
  }
  
@@@ -188,15 -189,9 +188,15 @@@ extern struct nfs_client *nfs4_set_ds_c
                                             const struct sockaddr *ds_addr,
                                             int ds_addrlen, int ds_proto,
                                             unsigned int ds_timeo,
 -                                           unsigned int ds_retrans);
 +                                           unsigned int ds_retrans,
 +                                           u32 minor_version,
 +                                           rpc_authflavor_t au_flavor);
  extern struct rpc_clnt *nfs4_find_or_create_ds_client(struct nfs_client *,
                                                struct inode *);
 +extern struct nfs_client *nfs3_set_ds_client(struct nfs_client *mds_clp,
 +                      const struct sockaddr *ds_addr, int ds_addrlen,
 +                      int ds_proto, unsigned int ds_timeo,
 +                      unsigned int ds_retrans, rpc_authflavor_t au_flavor);
  #ifdef CONFIG_PROC_FS
  extern int __init nfs_fs_proc_init(void);
  extern void nfs_fs_proc_exit(void);
@@@ -249,12 -244,9 +249,12 @@@ struct nfs_pgio_header *nfs_pgio_header
  void nfs_pgio_header_free(struct nfs_pgio_header *);
  void nfs_pgio_data_destroy(struct nfs_pgio_header *);
  int nfs_generic_pgio(struct nfs_pageio_descriptor *, struct nfs_pgio_header *);
 -int nfs_initiate_pgio(struct rpc_clnt *, struct nfs_pgio_header *,
 -                    const struct rpc_call_ops *, int, int);
 +int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr,
 +                    struct rpc_cred *cred, const struct nfs_rpc_ops *rpc_ops,
 +                    const struct rpc_call_ops *call_ops, int how, int flags);
  void nfs_free_request(struct nfs_page *req);
 +struct nfs_pgio_mirror *
 +nfs_pgio_current_mirror(struct nfs_pageio_descriptor *desc);
  
  static inline void nfs_iocounter_init(struct nfs_io_counter *c)
  {
        atomic_set(&c->io_count, 0);
  }
  
 +static inline bool nfs_pgio_has_mirroring(struct nfs_pageio_descriptor *desc)
 +{
 +      WARN_ON_ONCE(desc->pg_mirror_count < 1);
 +      return desc->pg_mirror_count > 1;
 +}
 +
  /* nfs2xdr.c */
  extern struct rpc_procinfo nfs_procedures[];
  extern int nfs2_decode_dirent(struct xdr_stream *,
@@@ -391,7 -377,7 +391,7 @@@ extern struct rpc_stat nfs_rpcstat
  
  extern int __init register_nfs_fs(void);
  extern void __exit unregister_nfs_fs(void);
 -extern void nfs_sb_active(struct super_block *sb);
 +extern bool nfs_sb_active(struct super_block *sb);
  extern void nfs_sb_deactive(struct super_block *sb);
  
  /* namespace.c */
@@@ -430,7 -416,6 +430,6 @@@ int  nfs_show_options(struct seq_file *
  int  nfs_show_devname(struct seq_file *, struct dentry *);
  int  nfs_show_path(struct seq_file *, struct dentry *);
  int  nfs_show_stats(struct seq_file *, struct dentry *);
- void nfs_put_super(struct super_block *);
  int nfs_remount(struct super_block *sb, int *flags, char *raw_data);
  
  /* write.c */
@@@ -443,7 -428,6 +442,7 @@@ extern void nfs_write_prepare(struct rp
  extern void nfs_commit_prepare(struct rpc_task *task, void *calldata);
  extern int nfs_initiate_commit(struct rpc_clnt *clnt,
                               struct nfs_commit_data *data,
 +                             const struct nfs_rpc_ops *nfs_ops,
                               const struct rpc_call_ops *call_ops,
                               int how, int flags);
  extern void nfs_init_commit(struct nfs_commit_data *data,
@@@ -457,15 -441,13 +456,15 @@@ int nfs_scan_commit(struct inode *inode
                    struct nfs_commit_info *cinfo);
  void nfs_mark_request_commit(struct nfs_page *req,
                             struct pnfs_layout_segment *lseg,
 -                           struct nfs_commit_info *cinfo);
 +                           struct nfs_commit_info *cinfo,
 +                           u32 ds_commit_idx);
  int nfs_write_need_commit(struct nfs_pgio_header *);
  int nfs_generic_commit_list(struct inode *inode, struct list_head *head,
                            int how, struct nfs_commit_info *cinfo);
  void nfs_retry_commit(struct list_head *page_list,
                      struct pnfs_layout_segment *lseg,
 -                    struct nfs_commit_info *cinfo);
 +                    struct nfs_commit_info *cinfo,
 +                    u32 ds_commit_idx);
  void nfs_commitdata_release(struct nfs_commit_data *data);
  void nfs_request_add_commit_list(struct nfs_page *req, struct list_head *dst,
                                 struct nfs_commit_info *cinfo);
@@@ -476,7 -458,6 +475,7 @@@ void nfs_init_cinfo(struct nfs_commit_i
                    struct nfs_direct_req *dreq);
  int nfs_key_timeout_notify(struct file *filp, struct inode *inode);
  bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx);
 +void nfs_pageio_stop_mirroring(struct nfs_pageio_descriptor *pgio);
  
  #ifdef CONFIG_MIGRATION
  extern int nfs_migrate_page(struct address_space *,
@@@ -500,7 -481,6 +499,7 @@@ static inline void nfs_inode_dio_wait(s
        inode_dio_wait(inode);
  }
  extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq);
 +extern void nfs_direct_set_resched_writes(struct nfs_direct_req *dreq);
  
  /* nfs4proc.c */
  extern void __nfs4_read_done_cb(struct nfs_pgio_header *);
@@@ -514,26 -494,6 +513,26 @@@ extern int nfs41_walk_client_list(struc
                                struct nfs_client **result,
                                struct rpc_cred *cred);
  
 +static inline struct inode *nfs_igrab_and_active(struct inode *inode)
 +{
 +      inode = igrab(inode);
 +      if (inode != NULL && !nfs_sb_active(inode->i_sb)) {
 +              iput(inode);
 +              inode = NULL;
 +      }
 +      return inode;
 +}
 +
 +static inline void nfs_iput_and_deactive(struct inode *inode)
 +{
 +      if (inode != NULL) {
 +              struct super_block *sb = inode->i_sb;
 +
 +              iput(inode);
 +              nfs_sb_deactive(sb);
 +      }
 +}
 +
  /*
   * Determine the device name as a string
   */
diff --combined fs/nfs/nfs4super.c
index 48cea3c30e5de02bae68e6aa7038f0f530096dd3,ab30a3a637deb2ace9ecc142539b31410bfcc89c..75090feeafade9c790bb4bf0d84c8546a46819a0
@@@ -53,7 -53,6 +53,6 @@@ static const struct super_operations nf
        .destroy_inode  = nfs_destroy_inode,
        .write_inode    = nfs4_write_inode,
        .drop_inode     = nfs_drop_inode,
-       .put_super      = nfs_put_super,
        .statfs         = nfs_statfs,
        .evict_inode    = nfs4_evict_inode,
        .umount_begin   = nfs_umount_begin,
@@@ -346,9 -345,6 +345,9 @@@ out
  
  static void __exit exit_nfs_v4(void)
  {
 +      /* Not called in the _init(), conditionally loaded */
 +      nfs4_pnfs_v3_ds_connect_unload();
 +
        unregister_nfs_version(&nfs_v4);
        nfs4_unregister_sysctl();
        nfs_idmap_quit();
diff --combined fs/nfs/super.c
index 368d9395d2e7f064c9539ff01d9353659ad5197e,6ec4fe23b756b98115a0a6064e475fa3659635fe..322b2de02988fa6479c2d2831a5e5d70dfdad7c9
@@@ -311,7 -311,6 +311,6 @@@ const struct super_operations nfs_sops 
        .destroy_inode  = nfs_destroy_inode,
        .write_inode    = nfs_write_inode,
        .drop_inode     = nfs_drop_inode,
-       .put_super      = nfs_put_super,
        .statfs         = nfs_statfs,
        .evict_inode    = nfs_evict_inode,
        .umount_begin   = nfs_umount_begin,
@@@ -405,15 -404,12 +404,15 @@@ void __exit unregister_nfs_fs(void
        unregister_filesystem(&nfs_fs_type);
  }
  
 -void nfs_sb_active(struct super_block *sb)
 +bool nfs_sb_active(struct super_block *sb)
  {
        struct nfs_server *server = NFS_SB(sb);
  
 -      if (atomic_inc_return(&server->active) == 1)
 -              atomic_inc(&sb->s_active);
 +      if (!atomic_inc_not_zero(&sb->s_active))
 +              return false;
 +      if (atomic_inc_return(&server->active) != 1)
 +              atomic_dec(&sb->s_active);
 +      return true;
  }
  EXPORT_SYMBOL_GPL(nfs_sb_active);
  
@@@ -2572,7 -2568,7 +2571,7 @@@ struct dentry *nfs_fs_mount_common(stru
                error = nfs_bdi_register(server);
                if (error) {
                        mntroot = ERR_PTR(error);
-                       goto error_splat_bdi;
+                       goto error_splat_super;
                }
                server->super = s;
        }
@@@ -2604,9 -2600,6 +2603,6 @@@ error_splat_root
        dput(mntroot);
        mntroot = ERR_PTR(error);
  error_splat_super:
-       if (server && !s->s_root)
-               bdi_unregister(&server->backing_dev_info);
- error_splat_bdi:
        deactivate_locked_super(s);
        goto out;
  }
  }
  EXPORT_SYMBOL_GPL(nfs_fs_mount);
  
- /*
-  * Ensure that we unregister the bdi before kill_anon_super
-  * releases the device name
-  */
- void nfs_put_super(struct super_block *s)
- {
-       struct nfs_server *server = NFS_SB(s);
-       bdi_unregister(&server->backing_dev_info);
- }
- EXPORT_SYMBOL_GPL(nfs_put_super);
  /*
   * Destroy an NFS2/3 superblock
   */
  void nfs_kill_super(struct super_block *s)
  {
        struct nfs_server *server = NFS_SB(s);
+       dev_t dev = s->s_dev;
+       generic_shutdown_super(s);
  
-       kill_anon_super(s);
        nfs_fscache_release_super_cookie(s);
        nfs_free_server(server);
+       free_anon_bdev(dev);
  }
  EXPORT_SYMBOL_GPL(nfs_kill_super);
  
diff --combined fs/nfs/write.c
index bcf83e535f29a0060f4abb03f8c3f43684eb4880,298abcc5281b66c9a2266cf2ffd4228bd791f3cc..88a6d2196ece3bf5ce7a94027dd96e8f25bc9792
@@@ -473,18 -473,13 +473,18 @@@ try_again
        do {
                /*
                 * Subrequests are always contiguous, non overlapping
 -               * and in order. If not, it's a programming error.
 +               * and in order - but may be repeated (mirrored writes).
                 */
 -              WARN_ON_ONCE(subreq->wb_offset !=
 -                   (head->wb_offset + total_bytes));
 -
 -              /* keep track of how many bytes this group covers */
 -              total_bytes += subreq->wb_bytes;
 +              if (subreq->wb_offset == (head->wb_offset + total_bytes)) {
 +                      /* keep track of how many bytes this group covers */
 +                      total_bytes += subreq->wb_bytes;
 +              } else if (WARN_ON_ONCE(subreq->wb_offset < head->wb_offset ||
 +                          ((subreq->wb_offset + subreq->wb_bytes) >
 +                           (head->wb_offset + total_bytes)))) {
 +                      nfs_page_group_unlock(head);
 +                      spin_unlock(&inode->i_lock);
 +                      return ERR_PTR(-EIO);
 +              }
  
                if (!nfs_lock_request(subreq)) {
                        /* releases page group bit lock and
@@@ -791,7 -786,7 +791,7 @@@ nfs_request_add_commit_list(struct nfs_
        spin_unlock(cinfo->lock);
        if (!cinfo->dreq) {
                inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
-               inc_bdi_stat(page_file_mapping(req->wb_page)->backing_dev_info,
+               inc_bdi_stat(inode_to_bdi(page_file_mapping(req->wb_page)->host),
                             BDI_RECLAIMABLE);
                __mark_inode_dirty(req->wb_context->dentry->d_inode,
                                   I_DIRTY_DATASYNC);
@@@ -847,9 -842,9 +847,9 @@@ EXPORT_SYMBOL_GPL(nfs_init_cinfo)
   */
  void
  nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg,
 -                      struct nfs_commit_info *cinfo)
 +                      struct nfs_commit_info *cinfo, u32 ds_commit_idx)
  {
 -      if (pnfs_mark_request_commit(req, lseg, cinfo))
 +      if (pnfs_mark_request_commit(req, lseg, cinfo, ds_commit_idx))
                return;
        nfs_request_add_commit_list(req, &cinfo->mds->list, cinfo);
  }
@@@ -858,7 -853,7 +858,7 @@@ static voi
  nfs_clear_page_commit(struct page *page)
  {
        dec_zone_page_state(page, NR_UNSTABLE_NFS);
-       dec_bdi_stat(page_file_mapping(page)->backing_dev_info, BDI_RECLAIMABLE);
+       dec_bdi_stat(inode_to_bdi(page_file_mapping(page)->host), BDI_RECLAIMABLE);
  }
  
  /* Called holding inode (/cinfo) lock */
@@@ -905,8 -900,7 +905,8 @@@ static void nfs_write_completion(struc
                }
                if (nfs_write_need_commit(hdr)) {
                        memcpy(&req->wb_verf, &hdr->verf.verifier, sizeof(req->wb_verf));
 -                      nfs_mark_request_commit(req, hdr->lseg, &cinfo);
 +                      nfs_mark_request_commit(req, hdr->lseg, &cinfo,
 +                              hdr->pgio_mirror_idx);
                        goto next;
                }
  remove_req:
@@@ -1097,7 -1091,6 +1097,7 @@@ int nfs_flush_incompatible(struct file 
  {
        struct nfs_open_context *ctx = nfs_file_open_context(file);
        struct nfs_lock_context *l_ctx;
 +      struct file_lock_context *flctx = file_inode(file)->i_flctx;
        struct nfs_page *req;
        int do_flush, status;
        /*
                do_flush = req->wb_page != page || req->wb_context != ctx;
                /* for now, flush if more than 1 request in page_group */
                do_flush |= req->wb_this_page != req;
 -              if (l_ctx && ctx->dentry->d_inode->i_flock != NULL) {
 +              if (l_ctx && flctx &&
 +                  !(list_empty_careful(&flctx->flc_posix) &&
 +                    list_empty_careful(&flctx->flc_flock))) {
                        do_flush |= l_ctx->lockowner.l_owner != current->files
                                || l_ctx->lockowner.l_pid != current->tgid;
                }
        return PageUptodate(page) != 0;
  }
  
 +static bool
 +is_whole_file_wrlock(struct file_lock *fl)
 +{
 +      return fl->fl_start == 0 && fl->fl_end == OFFSET_MAX &&
 +                      fl->fl_type == F_WRLCK;
 +}
 +
  /* If we know the page is up to date, and we're not using byte range locks (or
   * if we have the whole file locked for writing), it may be more efficient to
   * extend the write to cover the entire page in order to avoid fragmentation
   */
  static int nfs_can_extend_write(struct file *file, struct page *page, struct inode *inode)
  {
 +      int ret;
 +      struct file_lock_context *flctx = inode->i_flctx;
 +      struct file_lock *fl;
 +
        if (file->f_flags & O_DSYNC)
                return 0;
        if (!nfs_write_pageuptodate(page, inode))
                return 0;
        if (NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE))
                return 1;
 -      if (inode->i_flock == NULL || (inode->i_flock->fl_start == 0 &&
 -                      inode->i_flock->fl_end == OFFSET_MAX &&
 -                      inode->i_flock->fl_type != F_RDLCK))
 -              return 1;
 -      return 0;
 +      if (!flctx || (list_empty_careful(&flctx->flc_flock) &&
 +                     list_empty_careful(&flctx->flc_posix)))
 +              return 0;
 +
 +      /* Check to see if there are whole file write locks */
 +      ret = 0;
 +      spin_lock(&flctx->flc_lock);
 +      if (!list_empty(&flctx->flc_posix)) {
 +              fl = list_first_entry(&flctx->flc_posix, struct file_lock,
 +                                      fl_list);
 +              if (is_whole_file_wrlock(fl))
 +                      ret = 1;
 +      } else if (!list_empty(&flctx->flc_flock)) {
 +              fl = list_first_entry(&flctx->flc_flock, struct file_lock,
 +                                      fl_list);
 +              if (fl->fl_type == F_WRLCK)
 +                      ret = 1;
 +      }
 +      spin_unlock(&flctx->flc_lock);
 +      return ret;
  }
  
  /*
@@@ -1275,15 -1240,15 +1275,15 @@@ static int flush_task_priority(int how
  
  static void nfs_initiate_write(struct nfs_pgio_header *hdr,
                               struct rpc_message *msg,
 +                             const struct nfs_rpc_ops *rpc_ops,
                               struct rpc_task_setup *task_setup_data, int how)
  {
 -      struct inode *inode = hdr->inode;
        int priority = flush_task_priority(how);
  
        task_setup_data->priority = priority;
 -      NFS_PROTO(inode)->write_setup(hdr, msg);
 +      rpc_ops->write_setup(hdr, msg);
  
 -      nfs4_state_protect_write(NFS_SERVER(inode)->nfs_client,
 +      nfs4_state_protect_write(NFS_SERVER(hdr->inode)->nfs_client,
                                 &task_setup_data->rpc_client, msg, hdr);
  }
  
@@@ -1333,14 -1298,8 +1333,14 @@@ EXPORT_SYMBOL_GPL(nfs_pageio_init_write
  
  void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio)
  {
 +      struct nfs_pgio_mirror *mirror;
 +
        pgio->pg_ops = &nfs_pgio_rw_ops;
 -      pgio->pg_bsize = NFS_SERVER(pgio->pg_inode)->wsize;
 +
 +      nfs_pageio_stop_mirroring(pgio);
 +
 +      mirror = &pgio->pg_mirrors[0];
 +      mirror->pg_bsize = NFS_SERVER(pgio->pg_inode)->wsize;
  }
  EXPORT_SYMBOL_GPL(nfs_pageio_reset_write_mds);
  
@@@ -1506,7 -1465,6 +1506,7 @@@ void nfs_commitdata_release(struct nfs_
  EXPORT_SYMBOL_GPL(nfs_commitdata_release);
  
  int nfs_initiate_commit(struct rpc_clnt *clnt, struct nfs_commit_data *data,
 +                      const struct nfs_rpc_ops *nfs_ops,
                        const struct rpc_call_ops *call_ops,
                        int how, int flags)
  {
                .priority = priority,
        };
        /* Set up the initial task struct.  */
 -      NFS_PROTO(data->inode)->commit_setup(data, &msg);
 +      nfs_ops->commit_setup(data, &msg);
  
        dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid);
  
@@@ -1596,18 -1554,17 +1596,18 @@@ EXPORT_SYMBOL_GPL(nfs_init_commit)
  
  void nfs_retry_commit(struct list_head *page_list,
                      struct pnfs_layout_segment *lseg,
 -                    struct nfs_commit_info *cinfo)
 +                    struct nfs_commit_info *cinfo,
 +                    u32 ds_commit_idx)
  {
        struct nfs_page *req;
  
        while (!list_empty(page_list)) {
                req = nfs_list_entry(page_list->next);
                nfs_list_remove_request(req);
 -              nfs_mark_request_commit(req, lseg, cinfo);
 +              nfs_mark_request_commit(req, lseg, cinfo, ds_commit_idx);
                if (!cinfo->dreq) {
                        dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
-                       dec_bdi_stat(page_file_mapping(req->wb_page)->backing_dev_info,
+                       dec_bdi_stat(inode_to_bdi(page_file_mapping(req->wb_page)->host),
                                     BDI_RECLAIMABLE);
                }
                nfs_unlock_and_release_request(req);
@@@ -1632,10 -1589,10 +1632,10 @@@ nfs_commit_list(struct inode *inode, st
        /* Set up the argument struct */
        nfs_init_commit(data, head, NULL, cinfo);
        atomic_inc(&cinfo->mds->rpcs_out);
 -      return nfs_initiate_commit(NFS_CLIENT(inode), data, data->mds_ops,
 -                                 how, 0);
 +      return nfs_initiate_commit(NFS_CLIENT(inode), data, NFS_PROTO(inode),
 +                                 data->mds_ops, how, 0);
   out_bad:
 -      nfs_retry_commit(head, NULL, cinfo);
 +      nfs_retry_commit(head, NULL, cinfo, 0);
        cinfo->completion_ops->error_cleanup(NFS_I(inode));
        return -ENOMEM;
  }
diff --combined fs/ocfs2/file.c
index 245db4f504dab424b6e7f7c7239790df42bf743e,abe7d98d6178683613194bd6a875f732fa56b4b8..e0f04d55fd0531e8be46260821c93b920af7bc7d
@@@ -569,7 -569,7 +569,7 @@@ static int __ocfs2_extend_allocation(st
        handle_t *handle = NULL;
        struct ocfs2_alloc_context *data_ac = NULL;
        struct ocfs2_alloc_context *meta_ac = NULL;
 -      enum ocfs2_alloc_restarted why;
 +      enum ocfs2_alloc_restarted why = RESTART_NONE;
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
        struct ocfs2_extent_tree et;
        int did_quota = 0;
@@@ -2363,7 -2363,7 +2363,7 @@@ relock
                        goto out_dio;
                }
        } else {
-               current->backing_dev_info = file->f_mapping->backing_dev_info;
+               current->backing_dev_info = inode_to_bdi(inode);
                written = generic_perform_write(file, from, *ppos);
                if (likely(written >= 0))
                        iocb->ki_pos = *ppos + written;
diff --combined fs/xfs/xfs_file.c
index f2d05a19d68cabdc09ceb61effbe02e98e3b2682,5684ac3e7d18311c7880247a08359e6029d1374b..1cdba95c78cb3e2475de29e0b6d88df3604e4cdf
@@@ -127,42 -127,6 +127,42 @@@ xfs_iozero
        return (-status);
  }
  
 +int
 +xfs_update_prealloc_flags(
 +      struct xfs_inode        *ip,
 +      enum xfs_prealloc_flags flags)
 +{
 +      struct xfs_trans        *tp;
 +      int                     error;
 +
 +      tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_WRITEID);
 +      error = xfs_trans_reserve(tp, &M_RES(ip->i_mount)->tr_writeid, 0, 0);
 +      if (error) {
 +              xfs_trans_cancel(tp, 0);
 +              return error;
 +      }
 +
 +      xfs_ilock(ip, XFS_ILOCK_EXCL);
 +      xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
 +
 +      if (!(flags & XFS_PREALLOC_INVISIBLE)) {
 +              ip->i_d.di_mode &= ~S_ISUID;
 +              if (ip->i_d.di_mode & S_IXGRP)
 +                      ip->i_d.di_mode &= ~S_ISGID;
 +              xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 +      }
 +
 +      if (flags & XFS_PREALLOC_SET)
 +              ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
 +      if (flags & XFS_PREALLOC_CLEAR)
 +              ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC;
 +
 +      xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 +      if (flags & XFS_PREALLOC_SYNC)
 +              xfs_trans_set_sync(tp);
 +      return xfs_trans_commit(tp, 0);
 +}
 +
  /*
   * Fsync operations on directories are much simpler than on regular files,
   * as there is no file data to flush, and thus also no need for explicit
@@@ -735,7 -699,7 +735,7 @@@ xfs_file_buffered_aio_write
  
        iov_iter_truncate(from, count);
        /* We can write back this queue in page reclaim */
-       current->backing_dev_info = mapping->backing_dev_info;
+       current->backing_dev_info = inode_to_bdi(inode);
  
  write_retry:
        trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0);
@@@ -820,8 -784,8 +820,8 @@@ xfs_file_fallocate
  {
        struct inode            *inode = file_inode(file);
        struct xfs_inode        *ip = XFS_I(inode);
 -      struct xfs_trans        *tp;
        long                    error;
 +      enum xfs_prealloc_flags flags = 0;
        loff_t                  new_size = 0;
  
        if (!S_ISREG(inode->i_mode))
                if (error)
                        goto out_unlock;
        } else {
 +              flags |= XFS_PREALLOC_SET;
 +
                if (!(mode & FALLOC_FL_KEEP_SIZE) &&
                    offset + len > i_size_read(inode)) {
                        new_size = offset + len;
                        goto out_unlock;
        }
  
 -      tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_WRITEID);
 -      error = xfs_trans_reserve(tp, &M_RES(ip->i_mount)->tr_writeid, 0, 0);
 -      if (error) {
 -              xfs_trans_cancel(tp, 0);
 -              goto out_unlock;
 -      }
 -
 -      xfs_ilock(ip, XFS_ILOCK_EXCL);
 -      xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
 -      ip->i_d.di_mode &= ~S_ISUID;
 -      if (ip->i_d.di_mode & S_IXGRP)
 -              ip->i_d.di_mode &= ~S_ISGID;
 -
 -      if (!(mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE)))
 -              ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
 -
 -      xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 -      xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 -
        if (file->f_flags & O_DSYNC)
 -              xfs_trans_set_sync(tp);
 -      error = xfs_trans_commit(tp, 0);
 +              flags |= XFS_PREALLOC_SYNC;
 +
 +      error = xfs_update_prealloc_flags(ip, flags);
        if (error)
                goto out_unlock;
  
@@@ -1404,4 -1384,5 +1404,4 @@@ static const struct vm_operations_struc
        .fault          = filemap_fault,
        .map_pages      = filemap_map_pages,
        .page_mkwrite   = xfs_vm_page_mkwrite,
 -      .remap_pages    = generic_file_remap_pages,
  };
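Note: xfs_update_prealloc_flags() now owns the WRITEID transaction that xfs_file_fallocate() used to open-code, so any preallocation site can flip XFS_DIFLAG_PREALLOC with a single call. A hedged usage sketch, assuming the XFS_PREALLOC_* enum declared alongside the helper; my_mark_prealloc() is hypothetical:

static int my_mark_prealloc(struct xfs_inode *ip, bool datasync)
{
	enum xfs_prealloc_flags flags = XFS_PREALLOC_SET;

	if (datasync)
		flags |= XFS_PREALLOC_SYNC;	/* commit synchronously */
	return xfs_update_prealloc_flags(ip, flags);
}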
diff --combined include/linux/fs.h
index cdcb1e9d96139b283ffe816e6db75baa2614d4f7,2f717baefdf82169a975fe802a5ccaa7957dd6ea..ec0f1dc66b9ba2f0b03d55ce8fffb837844d5a41
@@@ -34,6 -34,7 +34,7 @@@
  #include <asm/byteorder.h>
  #include <uapi/linux/fs.h>
  
+ struct backing_dev_info;
  struct export_operations;
  struct hd_geometry;
  struct iovec;
@@@ -394,13 -395,13 +395,12 @@@ int pagecache_write_end(struct file *, 
                                loff_t pos, unsigned len, unsigned copied,
                                struct page *page, void *fsdata);
  
- struct backing_dev_info;
  struct address_space {
        struct inode            *host;          /* owner: inode, block_device */
        struct radix_tree_root  page_tree;      /* radix tree of all pages */
        spinlock_t              tree_lock;      /* and lock protecting it */
        atomic_t                i_mmap_writable;/* count VM_SHARED mappings */
        struct rb_root          i_mmap;         /* tree of private and shared mappings */
 -      struct list_head        i_mmap_nonlinear;/*list VM_NONLINEAR mappings */
        struct rw_semaphore     i_mmap_rwsem;   /* protect tree, count, list */
        /* Protected by tree_lock together with the radix tree */
        unsigned long           nrpages;        /* number of total pages */
        pgoff_t                 writeback_index;/* writeback starts here */
        const struct address_space_operations *a_ops;   /* methods */
        unsigned long           flags;          /* error bits/gfp mask */
-       struct backing_dev_info *backing_dev_info; /* device readahead, etc */
        spinlock_t              private_lock;   /* for use by the address_space */
        struct list_head        private_list;   /* ditto */
        void                    *private_data;  /* ditto */
@@@ -492,7 -492,8 +491,7 @@@ static inline void i_mmap_unlock_read(s
   */
  static inline int mapping_mapped(struct address_space *mapping)
  {
 -      return  !RB_EMPTY_ROOT(&mapping->i_mmap) ||
 -              !list_empty(&mapping->i_mmap_nonlinear);
 +      return  !RB_EMPTY_ROOT(&mapping->i_mmap);
  }
  
  /*
@@@ -623,7 -624,7 +622,7 @@@ struct inode 
        atomic_t                i_readcount; /* struct files open RO */
  #endif
        const struct file_operations    *i_fop; /* former ->i_op->default_file_ops */
 -      struct file_lock        *i_flock;
 +      struct file_lock_context        *i_flctx;
        struct address_space    i_data;
        struct list_head        i_devices;
        union {
@@@ -873,7 -874,6 +872,7 @@@ static inline struct file *get_file(str
  #define FL_DOWNGRADE_PENDING  256 /* Lease is being downgraded */
  #define FL_UNLOCK_PENDING     512 /* Lease is being broken */
  #define FL_OFDLCK     1024    /* lock is "owned" by struct file */
 +#define FL_LAYOUT     2048    /* outstanding pNFS layout */
  
  /*
   * Special return value from posix_lock_file() and vfs_lock_file() for
  /* legacy typedef, should eventually be removed */
  typedef void *fl_owner_t;
  
 +struct file_lock;
 +
  struct file_lock_operations {
        void (*fl_copy_lock)(struct file_lock *, struct file_lock *);
        void (*fl_release_private)(struct file_lock *);
@@@ -899,7 -897,7 +898,7 @@@ struct lock_manager_operations 
        void (*lm_notify)(struct file_lock *);  /* unblock callback */
        int (*lm_grant)(struct file_lock *, int);
        bool (*lm_break)(struct file_lock *);
 -      int (*lm_change)(struct file_lock **, int, struct list_head *);
 +      int (*lm_change)(struct file_lock *, int, struct list_head *);
        void (*lm_setup)(struct file_lock *, void **);
  };
  
@@@ -924,17 -922,17 +923,17 @@@ int locks_in_grace(struct net *)
   * FIXME: should we create a separate "struct lock_request" to help distinguish
   * these two uses?
   *
 - * The i_flock list is ordered by:
 + * The various i_flctx lists are ordered by:
   *
 - * 1) lock type -- FL_LEASEs first, then FL_FLOCK, and finally FL_POSIX
 - * 2) lock owner
 - * 3) lock range start
 - * 4) lock range end
 + * 1) lock owner
 + * 2) lock range start
 + * 3) lock range end
   *
   * Obviously, the last two criteria only matter for POSIX locks.
   */
  struct file_lock {
        struct file_lock *fl_next;      /* singly linked list for this inode  */
 +      struct list_head fl_list;       /* link into file_lock_context */
        struct hlist_node fl_link;      /* node in global lists */
        struct list_head fl_block;      /* circular list of blocked processes */
        fl_owner_t fl_owner;
        } fl_u;
  };
  
 +struct file_lock_context {
 +      spinlock_t              flc_lock;
 +      struct list_head        flc_flock;
 +      struct list_head        flc_posix;
 +      struct list_head        flc_lease;
 +      int                     flc_flock_cnt;
 +      int                     flc_posix_cnt;
 +      int                     flc_lease_cnt;
 +};
 +
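Note: with locks split into per-type lists, walking an inode's locks means taking flc_lock and iterating the list you care about through fl_list. A hedged sketch; my_count_posix_locks() is hypothetical, and the context may legitimately be NULL on inodes that never had a lock:

static int my_count_posix_locks(struct inode *inode)
{
	struct file_lock_context *ctx = inode->i_flctx;
	struct file_lock *fl;
	int n = 0;

	if (!ctx)
		return 0;	/* no lock was ever taken */

	spin_lock(&ctx->flc_lock);
	list_for_each_entry(fl, &ctx->flc_posix, fl_list)
		n++;
	spin_unlock(&ctx->flc_lock);
	return n;
}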
  /* The following constant reflects the upper bound of the file/locking space */
  #ifndef OFFSET_MAX
  #define INT_LIMIT(x)  (~((x)1 << (sizeof(x)*8 - 1)))
@@@ -1001,7 -989,6 +1000,7 @@@ extern int fcntl_setlease(unsigned int 
  extern int fcntl_getlease(struct file *filp);
  
  /* fs/locks.c */
 +void locks_free_lock_context(struct file_lock_context *ctx);
  void locks_free_lock(struct file_lock *fl);
  extern void locks_init_lock(struct file_lock *);
  extern struct file_lock * locks_alloc_lock(void);
@@@ -1022,7 -1009,7 +1021,7 @@@ extern int __break_lease(struct inode *
  extern void lease_get_mtime(struct inode *, struct timespec *time);
  extern int generic_setlease(struct file *, long, struct file_lock **, void **priv);
  extern int vfs_setlease(struct file *, long, struct file_lock **, void **);
 -extern int lease_modify(struct file_lock **, int, struct list_head *);
 +extern int lease_modify(struct file_lock *, int, struct list_head *);
  #else /* !CONFIG_FILE_LOCKING */
  static inline int fcntl_getlk(struct file *file, unsigned int cmd,
                              struct flock __user *user)
@@@ -1059,11 -1046,6 +1058,11 @@@ static inline int fcntl_getlease(struc
        return F_UNLCK;
  }
  
 +static inline void
 +locks_free_lock_context(struct file_lock_context *ctx)
 +{
 +}
 +
  static inline void locks_init_lock(struct file_lock *fl)
  {
        return;
@@@ -1154,7 -1136,7 +1153,7 @@@ static inline int vfs_setlease(struct f
        return -EINVAL;
  }
  
 -static inline int lease_modify(struct file_lock **before, int arg,
 +static inline int lease_modify(struct file_lock *fl, int arg,
                               struct list_head *dispose)
  {
        return -EINVAL;
@@@ -1201,8 -1183,6 +1200,6 @@@ struct mm_struct
  #define UMOUNT_NOFOLLOW       0x00000008      /* Don't follow symlink on umount */
  #define UMOUNT_UNUSED 0x80000000      /* Flag guaranteed to be unused */
  
- extern struct list_head super_blocks;
- extern spinlock_t sb_lock;
  
  /* Possible states of 'frozen' field */
  enum {
@@@ -1519,6 -1499,26 +1516,26 @@@ struct block_device_operations
  #define HAVE_COMPAT_IOCTL 1
  #define HAVE_UNLOCKED_IOCTL 1
  
+ /*
+  * These flags let the !MMU mmap() path decide between mapping a device
+  * directly and copying it up front for MAP_PRIVATE, which matters most
+  * for ROM filesystems.
+  *
+  * NOMMU_MAP_COPY:    Copy can be mapped (MAP_PRIVATE)
+  * NOMMU_MAP_DIRECT:  Can be mapped directly (MAP_SHARED)
+  * NOMMU_MAP_READ:    Can be mapped for reading
+  * NOMMU_MAP_WRITE:   Can be mapped for writing
+  * NOMMU_MAP_EXEC:    Can be mapped for execution
+  */
+ #define NOMMU_MAP_COPY                0x00000001
+ #define NOMMU_MAP_DIRECT      0x00000008
+ #define NOMMU_MAP_READ                VM_MAYREAD
+ #define NOMMU_MAP_WRITE               VM_MAYWRITE
+ #define NOMMU_MAP_EXEC                VM_MAYEXEC
+ #define NOMMU_VMFLAGS \
+       (NOMMU_MAP_READ | NOMMU_MAP_WRITE | NOMMU_MAP_EXEC)
+ 
  struct iov_iter;
  
  struct file_operations {
        long (*fallocate)(struct file *file, int mode, loff_t offset,
                          loff_t len);
        void (*show_fdinfo)(struct seq_file *m, struct file *f);
+ #ifndef CONFIG_MMU
+       unsigned (*mmap_capabilities)(struct file *);
+ #endif
  };
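Note: on !MMU kernels a driver now reports what kinds of mappings it supports through this new file operation instead of through its backing_dev_info. A hedged sketch of a read-only, directly mappable character device; all my_* names are hypothetical, NOMMU_MAP_* are the flags defined above:

#ifndef CONFIG_MMU
static unsigned my_mmap_capabilities(struct file *file)
{
	/* mappable in place for read/exec, copyable for private use */
	return NOMMU_MAP_DIRECT | NOMMU_MAP_COPY |
	       NOMMU_MAP_READ | NOMMU_MAP_EXEC;
}
#endif

static const struct file_operations my_fops = {
	/* the driver's usual methods (.mmap, .read, ...) go here */
#ifndef CONFIG_MMU
	.mmap_capabilities	= my_mmap_capabilities,
#endif
};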
  
  struct inode_operations {
@@@ -1976,7 -1979,7 +1996,7 @@@ static inline int locks_verify_truncate
                                    struct file *filp,
                                    loff_t size)
  {
 -      if (inode->i_flock && mandatory_lock(inode))
 +      if (inode->i_flctx && mandatory_lock(inode))
                return locks_mandatory_area(
                        FLOCK_VERIFY_WRITE, inode, filp,
                        size < inode->i_size ? size : inode->i_size,
@@@ -1990,12 -1993,11 +2010,12 @@@ static inline int break_lease(struct in
  {
        /*
         * Since this check is lockless, we must ensure that any refcounts
 -       * taken are done before checking inode->i_flock. Otherwise, we could
 -       * end up racing with tasks trying to set a new lease on this file.
 +       * taken are done before checking i_flctx->flc_lease. Otherwise, we
 +       * could end up racing with tasks trying to set a new lease on this
 +       * file.
         */
        smp_mb();
 -      if (inode->i_flock)
 +      if (inode->i_flctx && !list_empty_careful(&inode->i_flctx->flc_lease))
                return __break_lease(inode, mode, FL_LEASE);
        return 0;
  }
@@@ -2004,12 -2006,11 +2024,12 @@@ static inline int break_deleg(struct in
  {
        /*
         * Since this check is lockless, we must ensure that any refcounts
 -       * taken are done before checking inode->i_flock. Otherwise, we could
 -       * end up racing with tasks trying to set a new lease on this file.
 +       * taken are done before checking i_flctx->flc_lease. Otherwise, we
 +       * could end up racing with tasks trying to set a new lease on this
 +       * file.
         */
        smp_mb();
 -      if (inode->i_flock)
 +      if (inode->i_flctx && !list_empty_careful(&inode->i_flctx->flc_lease))
                return __break_lease(inode, mode, FL_DELEG);
        return 0;
  }
@@@ -2036,16 -2037,6 +2056,16 @@@ static inline int break_deleg_wait(stru
        return ret;
  }
  
 +static inline int break_layout(struct inode *inode, bool wait)
 +{
 +      smp_mb();
 +      if (inode->i_flctx && !list_empty_careful(&inode->i_flctx->flc_lease))
 +              return __break_lease(inode,
 +                              wait ? O_WRONLY : O_WRONLY | O_NONBLOCK,
 +                              FL_LAYOUT);
 +      return 0;
 +}
 +
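Note: break_layout() rides on the lease machinery: outstanding pNFS layouts are recorded as FL_LAYOUT entries on flc_lease, and the helper recalls them before a conflicting local operation. A hedged sketch of a caller; my_prepare_truncate() is hypothetical:

static int my_prepare_truncate(struct inode *inode)
{
	int error;

	error = break_layout(inode, true);	/* block until recalled */
	if (error)
		return error;
	/* ...safe to change the file size now... */
	return 0;
}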
  #else /* !CONFIG_FILE_LOCKING */
  static inline int locks_mandatory_locked(struct file *file)
  {
@@@ -2101,11 -2092,6 +2121,11 @@@ static inline int break_deleg_wait(stru
        return 0;
  }
  
 +static inline int break_layout(struct inode *inode, bool wait)
 +{
 +      return 0;
 +}
 +
  #endif /* CONFIG_FILE_LOCKING */
  
  /* fs/open.c */
@@@ -2515,6 -2501,8 +2535,6 @@@ extern int sb_min_blocksize(struct supe
  
  extern int generic_file_mmap(struct file *, struct vm_area_struct *);
  extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *);
 -extern int generic_file_remap_pages(struct vm_area_struct *, unsigned long addr,
 -              unsigned long size, pgoff_t pgoff);
  int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk);
  extern ssize_t generic_file_read_iter(struct kiocb *, struct iov_iter *);
  extern ssize_t __generic_file_write_iter(struct kiocb *, struct iov_iter *);
diff --combined mm/filemap.c
index bf7a271427041342c9a2fab5ae53fc563ad52f44,5d7c23c26f81bc978124f3a7c326e12a066c3f26..d9f5336552d7b12cad62315a2c512a9ca922bf55
@@@ -211,7 -211,7 +211,7 @@@ void __delete_from_page_cache(struct pa
         */
        if (PageDirty(page) && mapping_cap_account_dirty(mapping)) {
                dec_zone_page_state(page, NR_FILE_DIRTY);
-               dec_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
+               dec_bdi_stat(inode_to_bdi(mapping->host), BDI_RECLAIMABLE);
        }
  }
  
@@@ -2087,6 -2087,7 +2087,6 @@@ const struct vm_operations_struct gener
        .fault          = filemap_fault,
        .map_pages      = filemap_map_pages,
        .page_mkwrite   = filemap_page_mkwrite,
 -      .remap_pages    = generic_file_remap_pages,
  };
  
  /* This is used for a general mmap of a disk file */
@@@ -2564,7 -2565,7 +2564,7 @@@ ssize_t __generic_file_write_iter(struc
        size_t          count = iov_iter_count(from);
  
        /* We can write back this queue in page reclaim */
-       current->backing_dev_info = mapping->backing_dev_info;
+       current->backing_dev_info = inode_to_bdi(inode);
        err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
        if (err)
                goto out;
diff --combined mm/filemap_xip.c
index 70c09da1a4198c3def28256c1b5c8cd79f16a5c6,26897fbfbe1977940e4e7a219ea32ebaad73f4ee..c175f9f25210a61896543e5eb371c90d534f8f3d
@@@ -9,6 -9,7 +9,7 @@@
   */
  
  #include <linux/fs.h>
+ #include <linux/backing-dev.h>
  #include <linux/pagemap.h>
  #include <linux/export.h>
  #include <linux/uio.h>
@@@ -301,6 -302,7 +302,6 @@@ out
  static const struct vm_operations_struct xip_file_vm_ops = {
        .fault  = xip_file_fault,
        .page_mkwrite   = filemap_page_mkwrite,
 -      .remap_pages = generic_file_remap_pages,
  };
  
  int xip_file_mmap(struct file * file, struct vm_area_struct * vma)
@@@ -409,7 -411,7 +410,7 @@@ xip_file_write(struct file *filp, cons
        count = len;
  
        /* We can write back this queue in page reclaim */
-       current->backing_dev_info = mapping->backing_dev_info;
+       current->backing_dev_info = inode_to_bdi(inode);
  
        ret = generic_write_checks(filp, &pos, &count, S_ISBLK(inode->i_mode));
        if (ret)
diff --combined mm/madvise.c
index d79fb5e8f80a0b4148e4e8bfb48bed81097d04b6,1383a8916bc35dff06614bb16ddd3ed7678a9a28..1077cbdc8b5207a6b407f3b4d97f4f5e99862065
@@@ -155,7 -155,7 +155,7 @@@ static int swapin_walk_pmd_entry(pmd_t 
                pte = *(orig_pte + ((index - start) / PAGE_SIZE));
                pte_unmap_unlock(orig_pte, ptl);
  
 -              if (pte_present(pte) || pte_none(pte) || pte_file(pte))
 +              if (pte_present(pte) || pte_none(pte))
                        continue;
                entry = pte_to_swp_entry(pte);
                if (unlikely(non_swap_entry(entry)))
@@@ -222,19 -222,22 +222,22 @@@ static long madvise_willneed(struct vm_
        struct file *file = vma->vm_file;
  
  #ifdef CONFIG_SWAP
-       if (!file || mapping_cap_swap_backed(file->f_mapping)) {
+       if (!file) {
                *prev = vma;
-               if (!file)
-                       force_swapin_readahead(vma, start, end);
-               else
-                       force_shm_swapin_readahead(vma, start, end,
-                                               file->f_mapping);
+               force_swapin_readahead(vma, start, end);
                return 0;
        }
- #endif
  
+       if (shmem_mapping(file->f_mapping)) {
+               *prev = vma;
+               force_shm_swapin_readahead(vma, start, end,
+                                       file->f_mapping);
+               return 0;
+       }
+ #else
        if (!file)
                return -EBADF;
+ #endif
  
        if (file->f_mapping->a_ops->get_xip_mem) {
                /* no bad return value, but ignore advice */
@@@ -278,7 -281,14 +281,7 @@@ static long madvise_dontneed(struct vm_
        if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP))
                return -EINVAL;
  
 -      if (unlikely(vma->vm_flags & VM_NONLINEAR)) {
 -              struct zap_details details = {
 -                      .nonlinear_vma = vma,
 -                      .last_index = ULONG_MAX,
 -              };
 -              zap_page_range(vma, start, end - start, &details);
 -      } else
 -              zap_page_range(vma, start, end - start, NULL);
 +      zap_page_range(vma, start, end - start, NULL);
        return 0;
  }
  
@@@ -296,7 -306,7 +299,7 @@@ static long madvise_remove(struct vm_ar
  
        *prev = NULL;   /* tell sys_madvise we drop mmap_sem */
  
 -      if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB))
 +      if (vma->vm_flags & (VM_LOCKED | VM_HUGETLB))
                return -EINVAL;
  
        f = vma->vm_file;
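Note: after the rewrite above, MADV_WILLNEED triggers swap readahead only for anonymous and shmem/tmpfs mappings (via the new shmem_mapping() test); regular file mappings keep the page-cache readahead path. The userspace contract is unchanged, as in this hedged userspace sketch (my_prefault() is hypothetical):

#include <sys/mman.h>

static int my_prefault(void *addr, size_t len)
{
	/* kernel picks swap readahead or page-cache readahead */
	return madvise(addr, len, MADV_WILLNEED);
}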
diff --combined mm/nommu.c
index 1a19fb3b04635549f63d90c9e62408cf55ad162c,13af96f35a4b9cafffdb9a7d42effbb36aba0451..7296360fc057e5bbf67b9904501fdbe98a97b1b2
@@@ -59,7 -59,6 +59,7 @@@
  #endif
  
  void *high_memory;
 +EXPORT_SYMBOL(high_memory);
  struct page *mem_map;
  unsigned long max_mapnr;
  unsigned long highest_memmap_pfn;
@@@ -214,39 -213,6 +214,39 @@@ long get_user_pages(struct task_struct 
  }
  EXPORT_SYMBOL(get_user_pages);
  
 +long get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm,
 +                         unsigned long start, unsigned long nr_pages,
 +                         int write, int force, struct page **pages,
 +                         int *locked)
 +{
 +      return get_user_pages(tsk, mm, start, nr_pages, write, force,
 +                            pages, NULL);
 +}
 +EXPORT_SYMBOL(get_user_pages_locked);
 +
 +long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
 +                             unsigned long start, unsigned long nr_pages,
 +                             int write, int force, struct page **pages,
 +                             unsigned int gup_flags)
 +{
 +      long ret;
 +      down_read(&mm->mmap_sem);
 +      ret = get_user_pages(tsk, mm, start, nr_pages, write, force,
 +                           pages, NULL);
 +      up_read(&mm->mmap_sem);
 +      return ret;
 +}
 +EXPORT_SYMBOL(__get_user_pages_unlocked);
 +
 +long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
 +                           unsigned long start, unsigned long nr_pages,
 +                           int write, int force, struct page **pages)
 +{
 +      return __get_user_pages_unlocked(tsk, mm, start, nr_pages, write,
 +                                       force, pages, 0);
 +}
 +EXPORT_SYMBOL(get_user_pages_unlocked);
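Note: these nommu wrappers mirror the MMU-side API added in the same window, so callers that do not already hold mmap_sem can use the *_unlocked variants on both kinds of kernels. A hedged sketch; my_pin_user_page() is hypothetical, and the signature matches the definitions above:

static int my_pin_user_page(unsigned long uaddr, struct page **page)
{
	long got = get_user_pages_unlocked(current, current->mm, uaddr,
					   1 /* nr_pages */, 1 /* write */,
					   0 /* force */, page);
	return got == 1 ? 0 : -EFAULT;
}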
 +
  /**
   * follow_pfn - look up PFN at a user virtual address
   * @vma: memory mapping
@@@ -980,9 -946,6 +980,6 @@@ static int validate_mmap_request(struc
                return -EOVERFLOW;
  
        if (file) {
-               /* validate file mapping requests */
-               struct address_space *mapping;
                /* files must support mmap */
                if (!file->f_op->mmap)
                        return -ENODEV;
                 * - we support chardevs that provide their own "memory"
                 * - we support files/blockdevs that are memory backed
                 */
-               mapping = file->f_mapping;
-               if (!mapping)
-                       mapping = file_inode(file)->i_mapping;
-               capabilities = 0;
-               if (mapping && mapping->backing_dev_info)
-                       capabilities = mapping->backing_dev_info->capabilities;
-               if (!capabilities) {
+               if (file->f_op->mmap_capabilities) {
+                       capabilities = file->f_op->mmap_capabilities(file);
+               } else {
                        /* no explicit capabilities set, so assume some
                         * defaults */
                        switch (file_inode(file)->i_mode & S_IFMT) {
                        case S_IFREG:
                        case S_IFBLK:
-                               capabilities = BDI_CAP_MAP_COPY;
+                               capabilities = NOMMU_MAP_COPY;
                                break;
  
                        case S_IFCHR:
                                capabilities =
-                                       BDI_CAP_MAP_DIRECT |
-                                       BDI_CAP_READ_MAP |
-                                       BDI_CAP_WRITE_MAP;
+                                       NOMMU_MAP_DIRECT |
+                                       NOMMU_MAP_READ |
+                                       NOMMU_MAP_WRITE;
                                break;
  
                        default:
                /* eliminate any capabilities that we can't support on this
                 * device */
                if (!file->f_op->get_unmapped_area)
-                       capabilities &= ~BDI_CAP_MAP_DIRECT;
+                       capabilities &= ~NOMMU_MAP_DIRECT;
                if (!file->f_op->read)
-                       capabilities &= ~BDI_CAP_MAP_COPY;
+                       capabilities &= ~NOMMU_MAP_COPY;
  
                /* The file shall have been opened with read permission. */
                if (!(file->f_mode & FMODE_READ))
                        if (locks_verify_locked(file))
                                return -EAGAIN;
  
-                       if (!(capabilities & BDI_CAP_MAP_DIRECT))
+                       if (!(capabilities & NOMMU_MAP_DIRECT))
                                return -ENODEV;
  
                        /* we mustn't privatise shared mappings */
-                       capabilities &= ~BDI_CAP_MAP_COPY;
+                       capabilities &= ~NOMMU_MAP_COPY;
                } else {
                        /* we're going to read the file into private memory we
                         * allocate */
-                       if (!(capabilities & BDI_CAP_MAP_COPY))
+                       if (!(capabilities & NOMMU_MAP_COPY))
                                return -ENODEV;
  
                        /* we don't permit a private writable mapping to be
                         * shared with the backing device */
                        if (prot & PROT_WRITE)
-                               capabilities &= ~BDI_CAP_MAP_DIRECT;
+                               capabilities &= ~NOMMU_MAP_DIRECT;
                }
  
-               if (capabilities & BDI_CAP_MAP_DIRECT) {
-                       if (((prot & PROT_READ)  && !(capabilities & BDI_CAP_READ_MAP))  ||
-                           ((prot & PROT_WRITE) && !(capabilities & BDI_CAP_WRITE_MAP)) ||
-                           ((prot & PROT_EXEC)  && !(capabilities & BDI_CAP_EXEC_MAP))
+               if (capabilities & NOMMU_MAP_DIRECT) {
+                       if (((prot & PROT_READ)  && !(capabilities & NOMMU_MAP_READ))  ||
+                           ((prot & PROT_WRITE) && !(capabilities & NOMMU_MAP_WRITE)) ||
+                           ((prot & PROT_EXEC)  && !(capabilities & NOMMU_MAP_EXEC))
                            ) {
-                               capabilities &= ~BDI_CAP_MAP_DIRECT;
+                               capabilities &= ~NOMMU_MAP_DIRECT;
                                if (flags & MAP_SHARED) {
                                        printk(KERN_WARNING
                                               "MAP_SHARED not completely supported on !MMU\n");
                } else if ((prot & PROT_READ) && !(prot & PROT_EXEC)) {
                        /* handle implication of PROT_EXEC by PROT_READ */
                        if (current->personality & READ_IMPLIES_EXEC) {
-                               if (capabilities & BDI_CAP_EXEC_MAP)
+                               if (capabilities & NOMMU_MAP_EXEC)
                                        prot |= PROT_EXEC;
                        }
                } else if ((prot & PROT_READ) &&
                         (prot & PROT_EXEC) &&
-                        !(capabilities & BDI_CAP_EXEC_MAP)
+                        !(capabilities & NOMMU_MAP_EXEC)
                         ) {
                        /* backing file is not executable, try to copy */
-                       capabilities &= ~BDI_CAP_MAP_DIRECT;
+                       capabilities &= ~NOMMU_MAP_DIRECT;
                }
        } else {
                /* anonymous mappings are always memory backed and can be
                 * privately mapped
                 */
-               capabilities = BDI_CAP_MAP_COPY;
+               capabilities = NOMMU_MAP_COPY;
  
                /* handle PROT_EXEC implication by PROT_READ */
                if ((prot & PROT_READ) &&
@@@ -1129,7 -1086,7 +1120,7 @@@ static unsigned long determine_vm_flags
        vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags);
        /* vm_flags |= mm->def_flags; */
  
-       if (!(capabilities & BDI_CAP_MAP_DIRECT)) {
+       if (!(capabilities & NOMMU_MAP_DIRECT)) {
                /* attempt to share read-only copies of mapped file chunks */
                vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
                if (file && !(prot & PROT_WRITE))
                /* overlay a shareable mapping on the backing device or inode
                 * if possible - used for chardevs, ramfs/tmpfs/shmfs and
                 * romfs/cramfs */
-               vm_flags |= VM_MAYSHARE | (capabilities & BDI_CAP_VMFLAGS);
+               vm_flags |= VM_MAYSHARE | (capabilities & NOMMU_VMFLAGS);
                if (flags & MAP_SHARED)
                        vm_flags |= VM_SHARED;
        }
@@@ -1191,7 -1148,7 +1182,7 @@@ static int do_mmap_private(struct vm_ar
         * shared mappings on devices or memory
         * - VM_MAYSHARE will be set if it may attempt to share
         */
-       if (capabilities & BDI_CAP_MAP_DIRECT) {
+       if (capabilities & NOMMU_MAP_DIRECT) {
                ret = vma->vm_file->f_op->mmap(vma->vm_file, vma);
                if (ret == 0) {
                        /* shouldn't return success if we're not sharing */
@@@ -1380,7 -1337,7 +1371,7 @@@ unsigned long do_mmap_pgoff(struct fil
                        if ((pregion->vm_pgoff != pgoff || rpglen != pglen) &&
                            !(pgoff >= pregion->vm_pgoff && pgend <= rpgend)) {
                                /* new mapping is not a subset of the region */
-                               if (!(capabilities & BDI_CAP_MAP_DIRECT))
+                               if (!(capabilities & NOMMU_MAP_DIRECT))
                                        goto sharing_violation;
                                continue;
                        }
                 * - this is the hook for quasi-memory character devices to
                 *   tell us the location of a shared mapping
                 */
-               if (capabilities & BDI_CAP_MAP_DIRECT) {
+               if (capabilities & NOMMU_MAP_DIRECT) {
                        addr = file->f_op->get_unmapped_area(file, addr, len,
                                                             pgoff, flags);
                        if (IS_ERR_VALUE(addr)) {
                                 * the mapping so we'll have to attempt to copy
                                 * it */
                                ret = -ENODEV;
-                               if (!(capabilities & BDI_CAP_MAP_COPY))
+                               if (!(capabilities & NOMMU_MAP_COPY))
                                        goto error_just_free;
  
-                               capabilities &= ~BDI_CAP_MAP_DIRECT;
+                               capabilities &= ~NOMMU_MAP_DIRECT;
                        } else {
                                vma->vm_start = region->vm_start = addr;
                                vma->vm_end = region->vm_end = addr + len;
        vma->vm_region = region;
  
        /* set up the mapping
-        * - the region is filled in if BDI_CAP_MAP_DIRECT is still set
+        * - the region is filled in if NOMMU_MAP_DIRECT is still set
         */
        if (file && vma->vm_flags & VM_SHARED)
                ret = do_mmap_shared_file(vma);
@@@ -1928,7 -1885,7 +1919,7 @@@ EXPORT_SYMBOL(unmap_mapping_range)
   */
  int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
  {
 -      unsigned long free, allowed, reserve;
 +      long free, allowed, reserve;
  
        vm_acct_memory(pages);
  
         */
        if (mm) {
                reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);
 -              allowed -= min(mm->total_vm / 32, reserve);
 +              allowed -= min_t(long, mm->total_vm / 32, reserve);
        }
  
        if (percpu_counter_read_positive(&vm_committed_as) < allowed)
@@@ -2017,6 -1974,14 +2008,6 @@@ void filemap_map_pages(struct vm_area_s
  }
  EXPORT_SYMBOL(filemap_map_pages);
  
 -int generic_file_remap_pages(struct vm_area_struct *vma, unsigned long addr,
 -                           unsigned long size, pgoff_t pgoff)
 -{
 -      BUG();
 -      return 0;
 -}
 -EXPORT_SYMBOL(generic_file_remap_pages);
 -
  static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
                unsigned long addr, void *buf, int len, int write)
  {
diff --combined mm/page-writeback.c
index 6a73e47e81c67fa86838941e6a16b27a956d77e7,d4cbb4bd7d1cf30c859f510dc636ef39d9e8f016..45e187b2d97183a90df9a5ee8558404f9f1bd826
@@@ -1351,7 -1351,7 +1351,7 @@@ static void balance_dirty_pages(struct 
        unsigned long task_ratelimit;
        unsigned long dirty_ratelimit;
        unsigned long pos_ratio;
-       struct backing_dev_info *bdi = mapping->backing_dev_info;
+       struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
        bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT;
        unsigned long start_time = jiffies;
  
@@@ -1574,7 -1574,7 +1574,7 @@@ DEFINE_PER_CPU(int, dirty_throttle_leak
   */
  void balance_dirty_pages_ratelimited(struct address_space *mapping)
  {
-       struct backing_dev_info *bdi = mapping->backing_dev_info;
+       struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
        int ratelimit;
        int *p;
  
@@@ -1929,7 -1929,7 +1929,7 @@@ continue_unlock
                        if (!clear_page_dirty_for_io(page))
                                goto continue_unlock;
  
-                       trace_wbc_writepage(wbc, mapping->backing_dev_info);
+                       trace_wbc_writepage(wbc, inode_to_bdi(mapping->host));
                        ret = (*writepage)(page, wbc, data);
                        if (unlikely(ret)) {
                                if (ret == AOP_WRITEPAGE_ACTIVATE) {
@@@ -2094,10 -2094,12 +2094,12 @@@ void account_page_dirtied(struct page *
        trace_writeback_dirty_page(page, mapping);
  
        if (mapping_cap_account_dirty(mapping)) {
+               struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
                __inc_zone_page_state(page, NR_FILE_DIRTY);
                __inc_zone_page_state(page, NR_DIRTIED);
-               __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
-               __inc_bdi_stat(mapping->backing_dev_info, BDI_DIRTIED);
+               __inc_bdi_stat(bdi, BDI_RECLAIMABLE);
+               __inc_bdi_stat(bdi, BDI_DIRTIED);
                task_io_account_write(PAGE_CACHE_SIZE);
                current->nr_dirtied++;
                this_cpu_inc(bdp_ratelimits);
@@@ -2156,7 -2158,7 +2158,7 @@@ void account_page_redirty(struct page *
        if (mapping && mapping_cap_account_dirty(mapping)) {
                current->nr_dirtied--;
                dec_zone_page_state(page, NR_DIRTIED);
-               dec_bdi_stat(mapping->backing_dev_info, BDI_DIRTIED);
+               dec_bdi_stat(inode_to_bdi(mapping->host), BDI_DIRTIED);
        }
  }
  EXPORT_SYMBOL(account_page_redirty);
   */
  int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page)
  {
 +      int ret;
 +
        wbc->pages_skipped++;
 +      ret = __set_page_dirty_nobuffers(page);
        account_page_redirty(page);
 -      return __set_page_dirty_nobuffers(page);
 +      return ret;
  }
  EXPORT_SYMBOL(redirty_page_for_writepage);
  
@@@ -2298,7 -2297,7 +2300,7 @@@ int clear_page_dirty_for_io(struct pag
                 */
                if (TestClearPageDirty(page)) {
                        dec_zone_page_state(page, NR_FILE_DIRTY);
-                       dec_bdi_stat(mapping->backing_dev_info,
+                       dec_bdi_stat(inode_to_bdi(mapping->host),
                                        BDI_RECLAIMABLE);
                        return 1;
                }
@@@ -2311,12 -2310,14 +2313,12 @@@ EXPORT_SYMBOL(clear_page_dirty_for_io)
  int test_clear_page_writeback(struct page *page)
  {
        struct address_space *mapping = page_mapping(page);
 -      unsigned long memcg_flags;
        struct mem_cgroup *memcg;
 -      bool locked;
        int ret;
  
 -      memcg = mem_cgroup_begin_page_stat(page, &locked, &memcg_flags);
 +      memcg = mem_cgroup_begin_page_stat(page);
        if (mapping) {
-               struct backing_dev_info *bdi = mapping->backing_dev_info;
+               struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
                unsigned long flags;
  
                spin_lock_irqsave(&mapping->tree_lock, flags);
                dec_zone_page_state(page, NR_WRITEBACK);
                inc_zone_page_state(page, NR_WRITTEN);
        }
 -      mem_cgroup_end_page_stat(memcg, &locked, &memcg_flags);
 +      mem_cgroup_end_page_stat(memcg);
        return ret;
  }
  
  int __test_set_page_writeback(struct page *page, bool keep_write)
  {
        struct address_space *mapping = page_mapping(page);
 -      unsigned long memcg_flags;
        struct mem_cgroup *memcg;
 -      bool locked;
        int ret;
  
 -      memcg = mem_cgroup_begin_page_stat(page, &locked, &memcg_flags);
 +      memcg = mem_cgroup_begin_page_stat(page);
        if (mapping) {
-               struct backing_dev_info *bdi = mapping->backing_dev_info;
+               struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
                unsigned long flags;
  
                spin_lock_irqsave(&mapping->tree_lock, flags);
                mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK);
                inc_zone_page_state(page, NR_WRITEBACK);
        }
 -      mem_cgroup_end_page_stat(memcg, &locked, &memcg_flags);
 +      mem_cgroup_end_page_stat(memcg);
        return ret;
  
  }
@@@ -2405,12 -2408,7 +2407,7 @@@ EXPORT_SYMBOL(mapping_tagged)
   */
  void wait_for_stable_page(struct page *page)
  {
-       struct address_space *mapping = page_mapping(page);
-       struct backing_dev_info *bdi = mapping->backing_dev_info;
-       if (!bdi_cap_stable_pages_required(bdi))
-               return;
-       wait_on_page_writeback(page);
+       if (bdi_cap_stable_pages_required(inode_to_bdi(page->mapping->host)))
+               wait_on_page_writeback(page);
  }
  EXPORT_SYMBOL_GPL(wait_for_stable_page);
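Note: wait_for_stable_page() now resolves the bdi through the page's host inode and only blocks when the device actually demands stable pages. A hedged sketch of the usual call site, a ->page_mkwrite method; my_page_mkwrite() is hypothetical, with the signature this kernel version uses:

static int my_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct page *page = vmf->page;

	lock_page(page);
	/* no-op unless the backing device requires stable writes */
	wait_for_stable_page(page);
	return VM_FAULT_LOCKED;
}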
diff --combined mm/shmem.c
index 864c878401e66add011beabd5931e5cd852dd6c9,4c61d3d5bfb41f5bbea9964bd49b87551ea63b45..a63031fa3e0c1e4380e6937aa711df912c9a687f
@@@ -191,11 -191,6 +191,6 @@@ static const struct inode_operations sh
  static const struct inode_operations shmem_special_inode_operations;
  static const struct vm_operations_struct shmem_vm_ops;
  
- static struct backing_dev_info shmem_backing_dev_info  __read_mostly = {
-       .ra_pages       = 0,    /* No readahead */
-       .capabilities   = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED,
- };
  static LIST_HEAD(shmem_swaplist);
  static DEFINE_MUTEX(shmem_swaplist_mutex);
  
@@@ -765,11 -760,11 +760,11 @@@ static int shmem_writepage(struct page 
                goto redirty;
  
        /*
-        * shmem_backing_dev_info's capabilities prevent regular writeback or
-        * sync from ever calling shmem_writepage; but a stacking filesystem
-        * might use ->writepage of its underlying filesystem, in which case
-        * tmpfs should write out to swap only in response to memory pressure,
-        * and not for the writeback threads or sync.
+        * Our capabilities prevent regular writeback or sync from ever calling
+        * shmem_writepage; but a stacking filesystem might use ->writepage of
+        * its underlying filesystem, in which case tmpfs should write out to
+        * swap only in response to memory pressure, and not for the writeback
+        * threads or sync.
         */
        if (!wbc->for_reclaim) {
                WARN_ON_ONCE(1);        /* Still happens? Tell us about it! */
@@@ -1013,7 -1008,7 +1008,7 @@@ static int shmem_replace_page(struct pa
                 */
                oldpage = newpage;
        } else {
 -              mem_cgroup_migrate(oldpage, newpage, false);
 +              mem_cgroup_migrate(oldpage, newpage, true);
                lru_cache_add_anon(newpage);
                *pagep = newpage;
        }
@@@ -1131,7 -1126,7 +1126,7 @@@ repeat
                         * truncated or holepunched since swap was confirmed.
                         * shmem_undo_range() will have done some of the
                         * unaccounting, now delete_from_swap_cache() will do
 -                       * the rest (including mem_cgroup_uncharge_swapcache).
 +                       * the rest.
                         * Reset swap.val? No, leave it so "failed" goes back to
                         * "repeat": reading a hole and writing should succeed.
                         */
@@@ -1415,7 -1410,6 +1410,6 @@@ static struct inode *shmem_get_inode(st
                inode->i_ino = get_next_ino();
                inode_init_owner(inode, dir, mode);
                inode->i_blocks = 0;
-               inode->i_mapping->backing_dev_info = &shmem_backing_dev_info;
                inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
                inode->i_generation = get_seconds();
                info = SHMEM_I(inode);
  
  bool shmem_mapping(struct address_space *mapping)
  {
-       return mapping->backing_dev_info == &shmem_backing_dev_info;
+       return mapping->host->i_sb->s_op == &shmem_ops;
  }
  
  #ifdef CONFIG_TMPFS
@@@ -3201,6 -3195,7 +3195,6 @@@ static const struct vm_operations_struc
        .set_policy     = shmem_set_policy,
        .get_policy     = shmem_get_policy,
  #endif
 -      .remap_pages    = generic_file_remap_pages,
  };
  
  static struct dentry *shmem_mount(struct file_system_type *fs_type,
@@@ -3225,10 -3220,6 +3219,6 @@@ int __init shmem_init(void
        if (shmem_inode_cachep)
                return 0;
  
-       error = bdi_init(&shmem_backing_dev_info);
-       if (error)
-               goto out4;
        error = shmem_init_inodecache();
        if (error)
                goto out3;
@@@ -3252,8 -3243,6 +3242,6 @@@ out1
  out2:
        shmem_destroy_inodecache();
  out3:
-       bdi_destroy(&shmem_backing_dev_info);
- out4:
        shm_mnt = ERR_PTR(error);
        return error;
  }
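Note: with shmem_backing_dev_info gone, shmem_mapping() identifies tmpfs by its superblock operations rather than by bdi pointer, so callers such as madvise_willneed() above keep working unchanged. A hedged caller-side sketch; my_is_tmpfs_file() is hypothetical:

static bool my_is_tmpfs_file(struct file *file)
{
	/* true for tmpfs/shmem-backed mappings, false otherwise */
	return file && shmem_mapping(file->f_mapping);
}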
diff --combined mm/swap.c
index 5b3087228b99c2b1ac0c5dba9378855860a34cda,4e0109a2f37b75f8afaf0d4c1d3de3f1f5687b06..cd3a5e64cea9be1f1b1759f056c35c0bf3ad2811
+++ b/mm/swap.c
@@@ -1138,10 -1138,10 +1138,8 @@@ void __init swap_setup(void
  #ifdef CONFIG_SWAP
        int i;
  
-       if (bdi_init(swapper_spaces[0].backing_dev_info))
-               panic("Failed to init swap bdi");
 -      for (i = 0; i < MAX_SWAPFILES; i++) {
 +      for (i = 0; i < MAX_SWAPFILES; i++)
                spin_lock_init(&swapper_spaces[i].tree_lock);
 -              INIT_LIST_HEAD(&swapper_spaces[i].i_mmap_nonlinear);
 -      }
  #endif
  
        /* Use a smaller cluster for small-memory machines */
diff --combined mm/vmscan.c
index 8e645ee520455432e583deb2614199d84e422066,e00a16393f21e18c3ce156dbdd2777e15ba7e7f8..224dd298fdcd342b3b26d8046bc6bcf89b60475c
@@@ -91,9 -91,6 +91,9 @@@ struct scan_control 
        /* Can pages be swapped as part of reclaim? */
        unsigned int may_swap:1;
  
 +      /* Can cgroups be reclaimed below their normal consumption range? */
 +      unsigned int may_thrash:1;
 +
        unsigned int hibernation_mode:1;
  
        /* One of the zones is ready for compaction */
@@@ -500,7 -497,7 +500,7 @@@ static pageout_t pageout(struct page *p
        }
        if (mapping->a_ops->writepage == NULL)
                return PAGE_ACTIVATE;
-       if (!may_write_to_queue(mapping->backing_dev_info, sc))
+       if (!may_write_to_queue(inode_to_bdi(mapping->host), sc))
                return PAGE_KEEP;
  
        if (clear_page_dirty_for_io(page)) {
@@@ -879,7 -876,7 +879,7 @@@ static unsigned long shrink_page_list(s
                 */
                mapping = page_mapping(page);
                if (((dirty || writeback) && mapping &&
-                    bdi_write_congested(mapping->backing_dev_info)) ||
+                    bdi_write_congested(inode_to_bdi(mapping->host))) ||
                    (writeback && PageReclaim(page)))
                        nr_congested++;
  
@@@ -1906,12 -1903,8 +1906,12 @@@ static void get_scan_count(struct lruve
         * latencies, so it's better to scan a minimum amount there as
         * well.
         */
 -      if (current_is_kswapd() && !zone_reclaimable(zone))
 -              force_scan = true;
 +      if (current_is_kswapd()) {
 +              if (!zone_reclaimable(zone))
 +                      force_scan = true;
 +              if (!mem_cgroup_lruvec_online(lruvec))
 +                      force_scan = true;
 +      }
        if (!global_reclaim(sc))
                force_scan = true;
  
@@@ -2297,12 -2290,6 +2297,12 @@@ static bool shrink_zone(struct zone *zo
                        struct lruvec *lruvec;
                        int swappiness;
  
 +                      if (mem_cgroup_low(root, memcg)) {
 +                              if (!sc->may_thrash)
 +                                      continue;
 +                              mem_cgroup_events(memcg, MEMCG_LOW, 1);
 +                      }
 +
                        lruvec = mem_cgroup_zone_lruvec(zone, memcg);
                        swappiness = mem_cgroup_swappiness(memcg);
  
                                mem_cgroup_iter_break(root, memcg);
                                break;
                        }
 -                      memcg = mem_cgroup_iter(root, memcg, &reclaim);
 -              } while (memcg);
 +              } while ((memcg = mem_cgroup_iter(root, memcg, &reclaim)));
  
                /*
                 * Shrink the slab caches in the same proportion that
@@@ -2527,11 -2515,10 +2527,11 @@@ static bool shrink_zones(struct zonelis
  static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
                                          struct scan_control *sc)
  {
 +      int initial_priority = sc->priority;
        unsigned long total_scanned = 0;
        unsigned long writeback_threshold;
        bool zones_reclaimable;
 -
 +retry:
        delayacct_freepages_start();
  
        if (global_reclaim(sc))
        if (sc->compaction_ready)
                return 1;
  
 +      /* Untapped cgroup reserves?  Don't OOM, retry. */
 +      if (!sc->may_thrash) {
 +              sc->priority = initial_priority;
 +              sc->may_thrash = 1;
 +              goto retry;
 +      }
 +
        /* Any of the zones still reclaimable?  Don't OOM. */
        if (zones_reclaimable)
                return 1;
@@@ -2676,7 -2656,7 +2676,7 @@@ static bool throttle_direct_reclaim(gfp
         * should make reasonable progress.
         */
        for_each_zone_zonelist_nodemask(zone, z, zonelist,
 -                                      gfp_mask, nodemask) {
 +                                      gfp_zone(gfp_mask), nodemask) {
                if (zone_idx(zone) > ZONE_NORMAL)
                        continue;
  
@@@ -3195,7 -3175,7 +3195,7 @@@ static unsigned long balance_pgdat(pg_d
                 */
                if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
                                pfmemalloc_watermark_ok(pgdat))
 -                      wake_up(&pgdat->pfmemalloc_wait);
 +                      wake_up_all(&pgdat->pfmemalloc_wait);
  
                /*
                 * Fragmentation may mean that the system cannot be rebalanced