Merge branch 'for-linus-4.5' of git://git.kernel.org/pub/scm/linux/kernel/git/mason...
authorLinus Torvalds <torvalds@linux-foundation.org>
Mon, 18 Jan 2016 20:44:40 +0000 (12:44 -0800)
committerLinus Torvalds <torvalds@linux-foundation.org>
Mon, 18 Jan 2016 20:44:40 +0000 (12:44 -0800)
Pull btrfs updates from Chris Mason:
 "This has our usual assortment of fixes and cleanups, but the biggest
  change included is Omar Sandoval's free space tree.  It's not the
  default yet, mounting -o space_cache=v2 enables it and sets a readonly
  compat bit.  The tree can actually be deleted and regenerated if there
  are any problems, but it has held up really well in testing so far.

  For very large filesystems (30T+) our existing free space caching code
  can end up taking a huge amount of time during commits.  The new tree
  based code is faster and less work overall to update as the commit
  progresses.

  Omar worked on this during the summer and we'll hammer on it in
  production here at FB over the next few months"

* 'for-linus-4.5' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs: (73 commits)
  Btrfs: fix fitrim discarding device area reserved for boot loader's use
  Btrfs: Check metadata redundancy on balance
  btrfs: statfs: report zero available if metadata are exhausted
  btrfs: preallocate path for snapshot creation at ioctl time
  btrfs: allocate root item at snapshot ioctl time
  btrfs: do an allocation earlier during snapshot creation
  btrfs: use smaller type for btrfs_path locks
  btrfs: use smaller type for btrfs_path lowest_level
  btrfs: use smaller type for btrfs_path reada
  btrfs: cleanup, use enum values for btrfs_path reada
  btrfs: constify static arrays
  btrfs: constify remaining structs with function pointers
  btrfs tests: replace whole ops structure for free space tests
  btrfs: use list_for_each_entry* in backref.c
  btrfs: use list_for_each_entry_safe in free-space-cache.c
  btrfs: use list_for_each_entry* in check-integrity.c
  Btrfs: use linux/sizes.h to represent constants
  btrfs: cleanup, remove stray return statements
  btrfs: zero out delayed node upon allocation
  btrfs: pass proper enum type to start_transaction()
  ...

1  2 
fs/btrfs/acl.c
fs/btrfs/ctree.h
fs/btrfs/disk-io.c
fs/btrfs/file.c
fs/btrfs/inode.c
fs/btrfs/ioctl.c
fs/btrfs/super.c
fs/btrfs/xattr.c

diff --combined fs/btrfs/acl.c
index f89db0c21b51edaadda1d1544425010d89e92ae2,dbbb8ed53a518a0ac3fa0207afe8d9041e7afba2..6d263bb1621cd92c51cc6ebd014f8ad06dcd719f
@@@ -37,10 -37,10 +37,10 @@@ struct posix_acl *btrfs_get_acl(struct 
  
        switch (type) {
        case ACL_TYPE_ACCESS:
 -              name = POSIX_ACL_XATTR_ACCESS;
 +              name = XATTR_NAME_POSIX_ACL_ACCESS;
                break;
        case ACL_TYPE_DEFAULT:
 -              name = POSIX_ACL_XATTR_DEFAULT;
 +              name = XATTR_NAME_POSIX_ACL_DEFAULT;
                break;
        default:
                BUG();
@@@ -48,7 -48,7 +48,7 @@@
  
        size = __btrfs_getxattr(inode, name, "", 0);
        if (size > 0) {
-               value = kzalloc(size, GFP_NOFS);
+               value = kzalloc(size, GFP_KERNEL);
                if (!value)
                        return ERR_PTR(-ENOMEM);
                size = __btrfs_getxattr(inode, name, value, size);
@@@ -81,7 -81,7 +81,7 @@@ static int __btrfs_set_acl(struct btrfs
  
        switch (type) {
        case ACL_TYPE_ACCESS:
 -              name = POSIX_ACL_XATTR_ACCESS;
 +              name = XATTR_NAME_POSIX_ACL_ACCESS;
                if (acl) {
                        ret = posix_acl_equiv_mode(acl, &inode->i_mode);
                        if (ret < 0)
@@@ -94,7 -94,7 +94,7 @@@
        case ACL_TYPE_DEFAULT:
                if (!S_ISDIR(inode->i_mode))
                        return acl ? -EINVAL : 0;
 -              name = POSIX_ACL_XATTR_DEFAULT;
 +              name = XATTR_NAME_POSIX_ACL_DEFAULT;
                break;
        default:
                return -EINVAL;
  
        if (acl) {
                size = posix_acl_xattr_size(acl->a_count);
-               value = kmalloc(size, GFP_NOFS);
+               value = kmalloc(size, GFP_KERNEL);
                if (!value) {
                        ret = -ENOMEM;
                        goto out;
diff --combined fs/btrfs/ctree.h
index b7e4e344e8e0a510697a56bf553d833c5bcbeb06,c5f40dc1f74fc06508ed794bde57e67b6fcd09c8..97ad9bbeb35d24ec0ad228aa6f040b61bffb33a5
@@@ -35,6 -35,7 +35,7 @@@
  #include <linux/btrfs.h>
  #include <linux/workqueue.h>
  #include <linux/security.h>
+ #include <linux/sizes.h>
  #include "extent_io.h"
  #include "extent_map.h"
  #include "async-thread.h"
@@@ -96,6 -97,9 +97,9 @@@ struct btrfs_ordered_sum
  /* for storing items that use the BTRFS_UUID_KEY* types */
  #define BTRFS_UUID_TREE_OBJECTID 9ULL
  
+ /* tracks free space in block groups. */
+ #define BTRFS_FREE_SPACE_TREE_OBJECTID 10ULL
  /* for storing balance parameters in the root tree */
  #define BTRFS_BALANCE_OBJECTID -4ULL
  
  /* csum types */
  #define BTRFS_CSUM_TYPE_CRC32 0
  
- static int btrfs_csum_sizes[] = { 4 };
+ static const int btrfs_csum_sizes[] = { 4 };
  
  /* four bytes for CRC32 */
  #define BTRFS_EMPTY_DIR_SIZE 0
  /* ioprio of readahead is set to idle */
  #define BTRFS_IOPRIO_READA (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0))
  
- #define BTRFS_DIRTY_METADATA_THRESH   (32 * 1024 * 1024)
+ #define BTRFS_DIRTY_METADATA_THRESH   SZ_32M
  
- #define BTRFS_MAX_EXTENT_SIZE (128 * 1024 * 1024)
+ #define BTRFS_MAX_EXTENT_SIZE SZ_128M
  
  /*
   * The key defines the order in the tree, and so it also defines (optimal)
@@@ -500,6 -504,8 +504,8 @@@ struct btrfs_super_block 
   * Compat flags that we support.  If any incompat flags are set other than the
   * ones specified below then we will fail to mount
   */
+ #define BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE       (1ULL << 0)
  #define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF  (1ULL << 0)
  #define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL (1ULL << 1)
  #define BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS   (1ULL << 2)
  #define BTRFS_FEATURE_COMPAT_SUPP             0ULL
  #define BTRFS_FEATURE_COMPAT_SAFE_SET         0ULL
  #define BTRFS_FEATURE_COMPAT_SAFE_CLEAR               0ULL
- #define BTRFS_FEATURE_COMPAT_RO_SUPP          0ULL
+ #define BTRFS_FEATURE_COMPAT_RO_SUPP                  \
+       (BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE)
  #define BTRFS_FEATURE_COMPAT_RO_SAFE_SET      0ULL
  #define BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR    0ULL
  
@@@ -590,14 -599,15 +599,15 @@@ struct btrfs_node 
   * The slots array records the index of the item or block pointer
   * used while walking the tree.
   */
+ enum { READA_NONE = 0, READA_BACK, READA_FORWARD };
  struct btrfs_path {
        struct extent_buffer *nodes[BTRFS_MAX_LEVEL];
        int slots[BTRFS_MAX_LEVEL];
        /* if there is real range locking, this locks field will change */
-       int locks[BTRFS_MAX_LEVEL];
-       int reada;
+       u8 locks[BTRFS_MAX_LEVEL];
+       u8 reada;
        /* keep some upper locks as we walk down */
-       int lowest_level;
+       u8 lowest_level;
  
        /*
         * set by btrfs_split_item, tells search_slot to keep all locks
@@@ -1088,6 -1098,13 +1098,13 @@@ struct btrfs_block_group_item 
        __le64 flags;
  } __attribute__ ((__packed__));
  
+ struct btrfs_free_space_info {
+       __le32 extent_count;
+       __le32 flags;
+ } __attribute__ ((__packed__));
+ #define BTRFS_FREE_SPACE_USING_BITMAPS (1ULL << 0)
  #define BTRFS_QGROUP_LEVEL_SHIFT              48
  static inline u64 btrfs_qgroup_level(u64 qgroupid)
  {
@@@ -1296,6 -1313,9 +1313,9 @@@ struct btrfs_caching_control 
        atomic_t count;
  };
  
+ /* Once caching_thread() finds this much free space, it will wake up waiters. */
+ #define CACHING_CTL_WAKE_UP (1024 * 1024 * 2)
  struct btrfs_io_ctl {
        void *cur, *orig;
        struct page *page;
@@@ -1321,8 -1341,20 +1341,20 @@@ struct btrfs_block_group_cache 
        u64 delalloc_bytes;
        u64 bytes_super;
        u64 flags;
-       u64 sectorsize;
        u64 cache_generation;
+       u32 sectorsize;
+       /*
+        * If the free space extent count exceeds this number, convert the block
+        * group to bitmaps.
+        */
+       u32 bitmap_high_thresh;
+       /*
+        * If the free space extent count drops below this number, convert the
+        * block group back to extents.
+        */
+       u32 bitmap_low_thresh;
  
        /*
         * It is just used for the delayed data space allocation because
        struct list_head io_list;
  
        struct btrfs_io_ctl io_ctl;
+       /* Lock for free space tree operations. */
+       struct mutex free_space_lock;
+       /*
+        * Does the block group need to be added to the free space tree?
+        * Protected by free_space_lock.
+        */
+       int needs_free_space;
  };
  
  /* delayed seq elem */
@@@ -1429,6 -1470,7 +1470,7 @@@ struct btrfs_fs_info 
        struct btrfs_root *csum_root;
        struct btrfs_root *quota_root;
        struct btrfs_root *uuid_root;
+       struct btrfs_root *free_space_root;
  
        /* the log root tree is a directory of all the other log roots */
        struct btrfs_root *log_root_tree;
         * and will be latter freed. Protected by fs_info->chunk_mutex.
         */
        struct list_head pinned_chunks;
+       int creating_free_space_tree;
  };
  
  struct btrfs_subvolume_writers {
@@@ -2092,6 -2136,27 +2136,27 @@@ struct btrfs_ioctl_defrag_range_args 
   */
  #define BTRFS_BLOCK_GROUP_ITEM_KEY 192
  
+ /*
+  * Every block group is represented in the free space tree by a free space info
+  * item, which stores some accounting information. It is keyed on
+  * (block_group_start, FREE_SPACE_INFO, block_group_length).
+  */
+ #define BTRFS_FREE_SPACE_INFO_KEY 198
+ /*
+  * A free space extent tracks an extent of space that is free in a block group.
+  * It is keyed on (start, FREE_SPACE_EXTENT, length).
+  */
+ #define BTRFS_FREE_SPACE_EXTENT_KEY 199
+ /*
+  * When a block group becomes very fragmented, we convert it to use bitmaps
+  * instead of extents. A free space bitmap is keyed on
+  * (start, FREE_SPACE_BITMAP, length); the corresponding item is a bitmap with
+  * (length / sectorsize) bits.
+  */
+ #define BTRFS_FREE_SPACE_BITMAP_KEY 200
  #define BTRFS_DEV_EXTENT_KEY  204
  #define BTRFS_DEV_ITEM_KEY    216
  #define BTRFS_CHUNK_ITEM_KEY  228
  #define BTRFS_MOUNT_RESCAN_UUID_TREE  (1 << 23)
  #define BTRFS_MOUNT_FRAGMENT_DATA     (1 << 24)
  #define BTRFS_MOUNT_FRAGMENT_METADATA (1 << 25)
+ #define BTRFS_MOUNT_FREE_SPACE_TREE   (1 << 26)
  
  #define BTRFS_DEFAULT_COMMIT_INTERVAL (30)
  #define BTRFS_DEFAULT_MAX_INLINE      (8192)
@@@ -2506,6 -2572,11 +2572,11 @@@ BTRFS_SETGET_FUNCS(disk_block_group_fla
  BTRFS_SETGET_STACK_FUNCS(block_group_flags,
                        struct btrfs_block_group_item, flags, 64);
  
+ /* struct btrfs_free_space_info */
+ BTRFS_SETGET_FUNCS(free_space_extent_count, struct btrfs_free_space_info,
+                  extent_count, 32);
+ BTRFS_SETGET_FUNCS(free_space_flags, struct btrfs_free_space_info, flags, 32);
  /* struct btrfs_inode_ref */
  BTRFS_SETGET_FUNCS(inode_ref_name_len, struct btrfs_inode_ref, name_len, 16);
  BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64);
@@@ -3573,6 -3644,9 +3644,9 @@@ void btrfs_end_write_no_snapshoting(str
  void check_system_chunk(struct btrfs_trans_handle *trans,
                        struct btrfs_root *root,
                        const u64 type);
+ u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
+                      struct btrfs_fs_info *info, u64 start, u64 end);
  /* ctree.c */
  int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
                     int level, int *slot);
@@@ -3737,6 -3811,7 +3811,7 @@@ static inline void free_fs_info(struct 
        kfree(fs_info->csum_root);
        kfree(fs_info->quota_root);
        kfree(fs_info->uuid_root);
+       kfree(fs_info->free_space_root);
        kfree(fs_info->super_copy);
        kfree(fs_info->super_for_commit);
        security_free_mnt_opts(&fs_info->security_opts);
@@@ -3906,7 -3981,6 +3981,6 @@@ void btrfs_extent_item_to_extent_map(st
  /* inode.c */
  struct btrfs_delalloc_work {
        struct inode *inode;
-       int wait;
        int delay_iput;
        struct completion completion;
        struct list_head list;
  };
  
  struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
-                                                   int wait, int delay_iput);
+                                                   int delay_iput);
  void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work);
  
  struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page,
@@@ -4024,8 -4098,7 +4098,8 @@@ void btrfs_get_block_group_info(struct 
                                struct btrfs_ioctl_space_info *space);
  void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
                               struct btrfs_ioctl_balance_args *bargs);
 -
 +ssize_t btrfs_dedupe_file_range(struct file *src_file, u64 loff, u64 olen,
 +                         struct file *dst_file, u64 dst_loff);
  
  /* file.c */
  int btrfs_auto_defrag_init(void);
@@@ -4056,11 -4129,6 +4130,11 @@@ int btrfs_dirty_pages(struct btrfs_roo
                      loff_t pos, size_t write_bytes,
                      struct extent_state **cached);
  int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end);
 +ssize_t btrfs_copy_file_range(struct file *file_in, loff_t pos_in,
 +                            struct file *file_out, loff_t pos_out,
 +                            size_t len, unsigned int flags);
 +int btrfs_clone_file_range(struct file *file_in, loff_t pos_in,
 +                         struct file *file_out, loff_t pos_out, u64 len);
  
  /* tree-defrag.c */
  int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
@@@ -4253,16 -4321,98 +4327,98 @@@ static inline void __btrfs_set_fs_incom
        }
  }
  
+ #define btrfs_clear_fs_incompat(__fs_info, opt) \
+       __btrfs_clear_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt)
+ static inline void __btrfs_clear_fs_incompat(struct btrfs_fs_info *fs_info,
+                                            u64 flag)
+ {
+       struct btrfs_super_block *disk_super;
+       u64 features;
+       disk_super = fs_info->super_copy;
+       features = btrfs_super_incompat_flags(disk_super);
+       if (features & flag) {
+               spin_lock(&fs_info->super_lock);
+               features = btrfs_super_incompat_flags(disk_super);
+               if (features & flag) {
+                       features &= ~flag;
+                       btrfs_set_super_incompat_flags(disk_super, features);
+                       btrfs_info(fs_info, "clearing %llu feature flag",
+                                        flag);
+               }
+               spin_unlock(&fs_info->super_lock);
+       }
+ }
  #define btrfs_fs_incompat(fs_info, opt) \
        __btrfs_fs_incompat((fs_info), BTRFS_FEATURE_INCOMPAT_##opt)
  
- static inline int __btrfs_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag)
+ static inline bool __btrfs_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag)
  {
        struct btrfs_super_block *disk_super;
        disk_super = fs_info->super_copy;
        return !!(btrfs_super_incompat_flags(disk_super) & flag);
  }
  
+ #define btrfs_set_fs_compat_ro(__fs_info, opt) \
+       __btrfs_set_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt)
+ static inline void __btrfs_set_fs_compat_ro(struct btrfs_fs_info *fs_info,
+                                           u64 flag)
+ {
+       struct btrfs_super_block *disk_super;
+       u64 features;
+       disk_super = fs_info->super_copy;
+       features = btrfs_super_compat_ro_flags(disk_super);
+       if (!(features & flag)) {
+               spin_lock(&fs_info->super_lock);
+               features = btrfs_super_compat_ro_flags(disk_super);
+               if (!(features & flag)) {
+                       features |= flag;
+                       btrfs_set_super_compat_ro_flags(disk_super, features);
+                       btrfs_info(fs_info, "setting %llu ro feature flag",
+                                  flag);
+               }
+               spin_unlock(&fs_info->super_lock);
+       }
+ }
+ #define btrfs_clear_fs_compat_ro(__fs_info, opt) \
+       __btrfs_clear_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt)
+ static inline void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info,
+                                             u64 flag)
+ {
+       struct btrfs_super_block *disk_super;
+       u64 features;
+       disk_super = fs_info->super_copy;
+       features = btrfs_super_compat_ro_flags(disk_super);
+       if (features & flag) {
+               spin_lock(&fs_info->super_lock);
+               features = btrfs_super_compat_ro_flags(disk_super);
+               if (features & flag) {
+                       features &= ~flag;
+                       btrfs_set_super_compat_ro_flags(disk_super, features);
+                       btrfs_info(fs_info, "clearing %llu ro feature flag",
+                                  flag);
+               }
+               spin_unlock(&fs_info->super_lock);
+       }
+ }
+ #define btrfs_fs_compat_ro(fs_info, opt) \
+       __btrfs_fs_compat_ro((fs_info), BTRFS_FEATURE_COMPAT_RO_##opt)
+ static inline int __btrfs_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag)
+ {
+       struct btrfs_super_block *disk_super;
+       disk_super = fs_info->super_copy;
+       return !!(btrfs_super_compat_ro_flags(disk_super) & flag);
+ }
  /*
   * Call btrfs_abort_transaction as early as possible when an error condition is
   * detected, that way the exact line number is reported.
diff --combined fs/btrfs/disk-io.c
index 42a378a4eefb4cd198c0d328eecaec293735f3af,c67c129fe99a537aa1ec72d76ef111109078c776..e99ccd6ffb2c14f58bf38f548e202f0e0f86e9ea
@@@ -42,6 -42,7 +42,7 @@@
  #include "locking.h"
  #include "tree-log.h"
  #include "free-space-cache.h"
+ #include "free-space-tree.h"
  #include "inode-map.h"
  #include "check-integrity.h"
  #include "rcu-string.h"
@@@ -362,7 -363,7 +363,7 @@@ static int verify_parent_transid(struc
        }
  
        lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1,
-                        0, &cached_state);
+                        &cached_state);
        if (extent_buffer_uptodate(eb) &&
            btrfs_header_generation(eb) == parent_transid) {
                ret = 0;
@@@ -923,7 -924,7 +924,7 @@@ static int check_async_write(struct ino
        if (bio_flags & EXTENT_BIO_TREE_LOG)
                return 0;
  #ifdef CONFIG_X86
 -      if (cpu_has_xmm4_2)
 +      if (static_cpu_has_safe(X86_FEATURE_XMM4_2))
                return 0;
  #endif
        return 1;
@@@ -1650,6 -1651,9 +1651,9 @@@ struct btrfs_root *btrfs_get_fs_root(st
        if (location->objectid == BTRFS_UUID_TREE_OBJECTID)
                return fs_info->uuid_root ? fs_info->uuid_root :
                                            ERR_PTR(-ENOENT);
+       if (location->objectid == BTRFS_FREE_SPACE_TREE_OBJECTID)
+               return fs_info->free_space_root ? fs_info->free_space_root :
+                                                 ERR_PTR(-ENOENT);
  again:
        root = btrfs_lookup_fs_root(fs_info, location->objectid);
        if (root) {
@@@ -2148,6 -2152,7 +2152,7 @@@ static void free_root_pointers(struct b
        free_root_extent_buffers(info->uuid_root);
        if (chunk_root)
                free_root_extent_buffers(info->chunk_root);
+       free_root_extent_buffers(info->free_space_root);
  }
  
  void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info)
@@@ -2448,6 -2453,15 +2453,15 @@@ static int btrfs_read_roots(struct btrf
                fs_info->uuid_root = root;
        }
  
+       if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
+               location.objectid = BTRFS_FREE_SPACE_TREE_OBJECTID;
+               root = btrfs_read_tree_root(tree_root, &location);
+               if (IS_ERR(root))
+                       return PTR_ERR(root);
+               set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+               fs_info->free_space_root = root;
+       }
        return 0;
  }
  
@@@ -2668,6 -2682,7 +2682,7 @@@ int open_ctree(struct super_block *sb
        if (btrfs_check_super_csum(bh->b_data)) {
                printk(KERN_ERR "BTRFS: superblock checksum mismatch\n");
                err = -EINVAL;
+               brelse(bh);
                goto fail_alloc;
        }
  
  
        fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
        fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
-                                   4 * 1024 * 1024 / PAGE_CACHE_SIZE);
+                                   SZ_4M / PAGE_CACHE_SIZE);
  
        tree_root->nodesize = nodesize;
        tree_root->sectorsize = sectorsize;
@@@ -3051,6 -3066,18 +3066,18 @@@ retry_root_backup
        if (sb->s_flags & MS_RDONLY)
                return 0;
  
+       if (btrfs_test_opt(tree_root, FREE_SPACE_TREE) &&
+           !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
+               pr_info("BTRFS: creating free space tree\n");
+               ret = btrfs_create_free_space_tree(fs_info);
+               if (ret) {
+                       pr_warn("BTRFS: failed to create free space tree %d\n",
+                               ret);
+                       close_ctree(tree_root);
+                       return ret;
+               }
+       }
        down_read(&fs_info->cleanup_work_sem);
        if ((ret = btrfs_orphan_cleanup(fs_info->fs_root)) ||
            (ret = btrfs_orphan_cleanup(fs_info->tree_root))) {
  
        btrfs_qgroup_rescan_resume(fs_info);
  
+       if (btrfs_test_opt(tree_root, CLEAR_CACHE) &&
+           btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
+               pr_info("BTRFS: clearing free space tree\n");
+               ret = btrfs_clear_free_space_tree(fs_info);
+               if (ret) {
+                       pr_warn("BTRFS: failed to clear free space tree %d\n",
+                               ret);
+                       close_ctree(tree_root);
+                       return ret;
+               }
+       }
        if (!fs_info->uuid_root) {
                pr_info("BTRFS: creating UUID tree\n");
                ret = btrfs_create_uuid_tree(fs_info);
@@@ -3902,11 -3941,6 +3941,6 @@@ int btrfs_buffer_uptodate(struct extent
        return !ret;
  }
  
- int btrfs_set_buffer_uptodate(struct extent_buffer *buf)
- {
-       return set_extent_buffer_uptodate(buf);
- }
  void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
  {
        struct btrfs_root *root;
@@@ -3962,7 -3996,6 +3996,6 @@@ static void __btrfs_btree_balance_dirty
                balance_dirty_pages_ratelimited(
                                   root->fs_info->btree_inode->i_mapping);
        }
-       return;
  }
  
  void btrfs_btree_balance_dirty(struct btrfs_root *root)
diff --combined fs/btrfs/file.c
index e3d9022bfd4e3c2861008104d13e55050aa91f13,364e0f1f61f68ab4e32b83bff9e11465295d0d05..83d7859d76199d96ec882c35c87dea98526d229e
@@@ -1394,7 -1394,7 +1394,7 @@@ lock_and_cleanup_extent_if_need(struct 
        if (start_pos < inode->i_size) {
                struct btrfs_ordered_extent *ordered;
                lock_extent_bits(&BTRFS_I(inode)->io_tree,
-                                start_pos, last_pos, 0, cached_state);
+                                start_pos, last_pos, cached_state);
                ordered = btrfs_lookup_ordered_range(inode, start_pos,
                                                     last_pos - start_pos + 1);
                if (ordered &&
@@@ -2398,7 -2398,7 +2398,7 @@@ static int btrfs_punch_hole(struct inod
                truncate_pagecache_range(inode, lockstart, lockend);
  
                lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
-                                0, &cached_state);
+                                &cached_state);
                ordered = btrfs_lookup_first_ordered_extent(inode, lockend);
  
                /*
@@@ -2705,7 -2705,7 +2705,7 @@@ static long btrfs_fallocate(struct fil
                 * transaction
                 */
                lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
-                                locked_end, 0, &cached_state);
+                                locked_end, &cached_state);
                ordered = btrfs_lookup_first_ordered_extent(inode,
                                                            alloc_end - 1);
                if (ordered &&
@@@ -2852,7 -2852,7 +2852,7 @@@ static int find_desired_extent(struct i
        lockend--;
        len = lockend - lockstart + 1;
  
-       lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, 0,
+       lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
                         &cached_state);
  
        while (start < inode->i_size) {
@@@ -2934,9 -2934,6 +2934,9 @@@ const struct file_operations btrfs_file
  #ifdef CONFIG_COMPAT
        .compat_ioctl   = btrfs_ioctl,
  #endif
 +      .copy_file_range = btrfs_copy_file_range,
 +      .clone_file_range = btrfs_clone_file_range,
 +      .dedupe_file_range = btrfs_dedupe_file_range,
  };
  
  void btrfs_auto_defrag_exit(void)
diff --combined fs/btrfs/inode.c
index 394017831692beb5a59ff7b2c747c0a824e188ce,85afe66955cf395611892e31a31ca05245146f69..24783010768680bea28f4f0e5e9aaa2c6a62a41c
@@@ -66,6 -66,13 +66,13 @@@ struct btrfs_iget_args 
        struct btrfs_root *root;
  };
  
+ struct btrfs_dio_data {
+       u64 outstanding_extents;
+       u64 reserve;
+       u64 unsubmitted_oe_range_start;
+       u64 unsubmitted_oe_range_end;
+ };
  static const struct inode_operations btrfs_dir_inode_operations;
  static const struct inode_operations btrfs_symlink_inode_operations;
  static const struct inode_operations btrfs_dir_ro_inode_operations;
@@@ -74,17 -81,16 +81,16 @@@ static const struct inode_operations bt
  static const struct address_space_operations btrfs_aops;
  static const struct address_space_operations btrfs_symlink_aops;
  static const struct file_operations btrfs_dir_file_operations;
- static struct extent_io_ops btrfs_extent_io_ops;
+ static const struct extent_io_ops btrfs_extent_io_ops;
  
  static struct kmem_cache *btrfs_inode_cachep;
- static struct kmem_cache *btrfs_delalloc_work_cachep;
  struct kmem_cache *btrfs_trans_handle_cachep;
  struct kmem_cache *btrfs_transaction_cachep;
  struct kmem_cache *btrfs_path_cachep;
  struct kmem_cache *btrfs_free_space_cachep;
  
  #define S_SHIFT 12
- static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
+ static const unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
        [S_IFREG >> S_SHIFT]    = BTRFS_FT_REG_FILE,
        [S_IFDIR >> S_SHIFT]    = BTRFS_FT_DIR,
        [S_IFCHR >> S_SHIFT]    = BTRFS_FT_CHRDEV,
@@@ -414,15 -420,15 +420,15 @@@ static noinline void compress_file_rang
        unsigned long nr_pages_ret = 0;
        unsigned long total_compressed = 0;
        unsigned long total_in = 0;
-       unsigned long max_compressed = 128 * 1024;
-       unsigned long max_uncompressed = 128 * 1024;
+       unsigned long max_compressed = SZ_128K;
+       unsigned long max_uncompressed = SZ_128K;
        int i;
        int will_compress;
        int compress_type = root->fs_info->compress_type;
        int redirty = 0;
  
        /* if this is a small write inside eof, kick off a defrag */
-       if ((end - start + 1) < 16 * 1024 &&
+       if ((end - start + 1) < SZ_16K &&
            (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
                btrfs_add_inode_defrag(NULL, inode);
  
  again:
        will_compress = 0;
        nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1;
-       nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE);
+       nr_pages = min_t(unsigned long, nr_pages, SZ_128K / PAGE_CACHE_SIZE);
  
        /*
         * we don't want to send crud past the end of i_size through
@@@ -944,7 -950,7 +950,7 @@@ static noinline int cow_file_range(stru
        disk_num_bytes = num_bytes;
  
        /* if this is a small write inside eof, kick off defrag */
-       if (num_bytes < 64 * 1024 &&
+       if (num_bytes < SZ_64K &&
            (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
                btrfs_add_inode_defrag(NULL, inode);
  
@@@ -1107,7 -1113,7 +1113,7 @@@ static noinline void async_cow_submit(s
         * atomic_sub_return implies a barrier for waitqueue_active
         */
        if (atomic_sub_return(nr_pages, &root->fs_info->async_delalloc_pages) <
-           5 * 1024 * 1024 &&
+           5 * SZ_1M &&
            waitqueue_active(&root->fs_info->async_submit_wait))
                wake_up(&root->fs_info->async_submit_wait);
  
@@@ -1132,7 -1138,7 +1138,7 @@@ static int cow_file_range_async(struct 
        struct btrfs_root *root = BTRFS_I(inode)->root;
        unsigned long nr_pages;
        u64 cur_end;
-       int limit = 10 * 1024 * 1024;
+       int limit = 10 * SZ_1M;
  
        clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED,
                         1, 0, NULL, GFP_NOFS);
                    !btrfs_test_opt(root, FORCE_COMPRESS))
                        cur_end = end;
                else
-                       cur_end = min(end, start + 512 * 1024 - 1);
+                       cur_end = min(end, start + SZ_512K - 1);
  
                async_cow->end = cur_end;
                INIT_LIST_HEAD(&async_cow->extents);
@@@ -1989,7 -1995,7 +1995,7 @@@ again
        page_start = page_offset(page);
        page_end = page_offset(page) + PAGE_CACHE_SIZE - 1;
  
-       lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, 0,
+       lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end,
                         &cached_state);
  
        /* already ordered? We're done */
@@@ -2482,7 -2488,7 +2488,7 @@@ static noinline int relink_extent_backr
        lock_start = backref->file_pos;
        lock_end = backref->file_pos + backref->num_bytes - 1;
        lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
-                        0, &cached);
+                        &cached);
  
        ordered = btrfs_lookup_first_ordered_extent(inode, lock_end);
        if (ordered) {
@@@ -2874,7 -2880,7 +2880,7 @@@ static int btrfs_finish_ordered_io(stru
  
        lock_extent_bits(io_tree, ordered_extent->file_offset,
                         ordered_extent->file_offset + ordered_extent->len - 1,
-                        0, &cached_state);
+                        &cached_state);
  
        ret = test_range_bit(io_tree, ordered_extent->file_offset,
                        ordered_extent->file_offset + ordered_extent->len - 1,
@@@ -3106,55 -3112,47 +3112,47 @@@ static int btrfs_readpage_end_io_hook(s
                                      start, (size_t)(end - start + 1));
  }
  
- struct delayed_iput {
-       struct list_head list;
-       struct inode *inode;
- };
- /* JDM: If this is fs-wide, why can't we add a pointer to
-  * btrfs_inode instead and avoid the allocation? */
  void btrfs_add_delayed_iput(struct inode *inode)
  {
        struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
-       struct delayed_iput *delayed;
+       struct btrfs_inode *binode = BTRFS_I(inode);
  
        if (atomic_add_unless(&inode->i_count, -1, 1))
                return;
  
-       delayed = kmalloc(sizeof(*delayed), GFP_NOFS | __GFP_NOFAIL);
-       delayed->inode = inode;
        spin_lock(&fs_info->delayed_iput_lock);
-       list_add_tail(&delayed->list, &fs_info->delayed_iputs);
+       if (binode->delayed_iput_count == 0) {
+               ASSERT(list_empty(&binode->delayed_iput));
+               list_add_tail(&binode->delayed_iput, &fs_info->delayed_iputs);
+       } else {
+               binode->delayed_iput_count++;
+       }
        spin_unlock(&fs_info->delayed_iput_lock);
  }
  
  void btrfs_run_delayed_iputs(struct btrfs_root *root)
  {
-       LIST_HEAD(list);
        struct btrfs_fs_info *fs_info = root->fs_info;
-       struct delayed_iput *delayed;
-       int empty;
-       spin_lock(&fs_info->delayed_iput_lock);
-       empty = list_empty(&fs_info->delayed_iputs);
-       spin_unlock(&fs_info->delayed_iput_lock);
-       if (empty)
-               return;
  
        down_read(&fs_info->delayed_iput_sem);
        spin_lock(&fs_info->delayed_iput_lock);
-       list_splice_init(&fs_info->delayed_iputs, &list);
-       spin_unlock(&fs_info->delayed_iput_lock);
-       while (!list_empty(&list)) {
-               delayed = list_entry(list.next, struct delayed_iput, list);
-               list_del(&delayed->list);
-               iput(delayed->inode);
-               kfree(delayed);
+       while (!list_empty(&fs_info->delayed_iputs)) {
+               struct btrfs_inode *inode;
+               inode = list_first_entry(&fs_info->delayed_iputs,
+                               struct btrfs_inode, delayed_iput);
+               if (inode->delayed_iput_count) {
+                       inode->delayed_iput_count--;
+                       list_move_tail(&inode->delayed_iput,
+                                       &fs_info->delayed_iputs);
+               } else {
+                       list_del_init(&inode->delayed_iput);
+               }
+               spin_unlock(&fs_info->delayed_iput_lock);
+               iput(&inode->vfs_inode);
+               spin_lock(&fs_info->delayed_iput_lock);
        }
+       spin_unlock(&fs_info->delayed_iput_lock);
        up_read(&root->fs_info->delayed_iput_sem);
  }
  
@@@ -3351,7 -3349,7 +3349,7 @@@ int btrfs_orphan_cleanup(struct btrfs_r
                ret = -ENOMEM;
                goto out;
        }
-       path->reada = -1;
+       path->reada = READA_BACK;
  
        key.objectid = BTRFS_ORPHAN_OBJECTID;
        key.type = BTRFS_ORPHAN_ITEM_KEY;
@@@ -3550,10 -3548,10 +3548,10 @@@ static noinline int acls_after_inode_it
        int scanned = 0;
  
        if (!xattr_access) {
 -              xattr_access = btrfs_name_hash(POSIX_ACL_XATTR_ACCESS,
 -                                      strlen(POSIX_ACL_XATTR_ACCESS));
 -              xattr_default = btrfs_name_hash(POSIX_ACL_XATTR_DEFAULT,
 -                                      strlen(POSIX_ACL_XATTR_DEFAULT));
 +              xattr_access = btrfs_name_hash(XATTR_NAME_POSIX_ACL_ACCESS,
 +                                      strlen(XATTR_NAME_POSIX_ACL_ACCESS));
 +              xattr_default = btrfs_name_hash(XATTR_NAME_POSIX_ACL_DEFAULT,
 +                                      strlen(XATTR_NAME_POSIX_ACL_DEFAULT));
        }
  
        slot++;
@@@ -3774,7 -3772,6 +3772,7 @@@ cache_acl
                break;
        case S_IFLNK:
                inode->i_op = &btrfs_symlink_inode_operations;
 +              inode_nohighmem(inode);
                inode->i_mapping->a_ops = &btrfs_symlink_aops;
                break;
        default:
@@@ -4318,7 -4315,7 +4316,7 @@@ int btrfs_truncate_inode_items(struct b
        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
-       path->reada = -1;
+       path->reada = READA_BACK;
  
        /*
         * We want to drop from the next block forward in case this new size is
@@@ -4349,7 -4346,7 +4347,7 @@@ search_again
         * up a huge file in a single leaf.  Most of the time that
         * bytes_deleted is > 0, it will be huge by the time we get here
         */
-       if (be_nice && bytes_deleted > 32 * 1024 * 1024) {
+       if (be_nice && bytes_deleted > SZ_32M) {
                if (btrfs_should_end_transaction(trans, root)) {
                        err = -EAGAIN;
                        goto error;
@@@ -4592,7 -4589,7 +4590,7 @@@ error
  
        btrfs_free_path(path);
  
-       if (be_nice && bytes_deleted > 32 * 1024 * 1024) {
+       if (be_nice && bytes_deleted > SZ_32M) {
                unsigned long updates = trans->delayed_ref_updates;
                if (updates) {
                        trans->delayed_ref_updates = 0;
@@@ -4669,7 -4666,7 +4667,7 @@@ again
        }
        wait_on_page_writeback(page);
  
-       lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state);
+       lock_extent_bits(io_tree, page_start, page_end, &cached_state);
        set_page_extent_mapped(page);
  
        ordered = btrfs_lookup_ordered_extent(inode, page_start);
@@@ -4800,7 -4797,7 +4798,7 @@@ int btrfs_cont_expand(struct inode *ino
        while (1) {
                struct btrfs_ordered_extent *ordered;
  
-               lock_extent_bits(io_tree, hole_start, block_end - 1, 0,
+               lock_extent_bits(io_tree, hole_start, block_end - 1,
                                 &cached_state);
                ordered = btrfs_lookup_ordered_range(inode, hole_start,
                                                     block_end - hole_start);
@@@ -5112,7 -5109,7 +5110,7 @@@ static void evict_inode_truncate_pages(
                end = state->end;
                spin_unlock(&io_tree->lock);
  
-               lock_extent_bits(io_tree, start, end, 0, &cached_state);
+               lock_extent_bits(io_tree, start, end, &cached_state);
  
                /*
                 * If still has DELALLOC flag, the extent didn't reach disk,
@@@ -5305,7 -5302,6 +5303,6 @@@ void btrfs_evict_inode(struct inode *in
  no_delete:
        btrfs_remove_delayed_node(inode);
        clear_inode(inode);
-       return;
  }
  
  /*
@@@ -5754,7 -5750,7 +5751,7 @@@ static int btrfs_real_readdir(struct fi
        if (!path)
                return -ENOMEM;
  
-       path->reada = 1;
+       path->reada = READA_FORWARD;
  
        if (key_type == BTRFS_DIR_INDEX_KEY) {
                INIT_LIST_HEAD(&ins_list);
@@@ -6482,7 -6478,7 +6479,7 @@@ out_unlock_inode
  static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
                      struct dentry *dentry)
  {
-       struct btrfs_trans_handle *trans;
+       struct btrfs_trans_handle *trans = NULL;
        struct btrfs_root *root = BTRFS_I(dir)->root;
        struct inode *inode = d_inode(old_dentry);
        u64 index;
        trans = btrfs_start_transaction(root, 5);
        if (IS_ERR(trans)) {
                err = PTR_ERR(trans);
+               trans = NULL;
                goto fail;
        }
  
                btrfs_log_new_name(trans, inode, NULL, parent);
        }
  
-       btrfs_end_transaction(trans, root);
        btrfs_balance_delayed_items(root);
  fail:
+       if (trans)
+               btrfs_end_transaction(trans, root);
        if (drop_inode) {
                inode_dec_link_count(inode);
                iput(inode);
@@@ -6688,7 -6686,7 +6687,7 @@@ static int merge_extent_mapping(struct 
  }
  
  static noinline int uncompress_inline(struct btrfs_path *path,
-                                     struct inode *inode, struct page *page,
+                                     struct page *page,
                                      size_t pg_offset, u64 extent_offset,
                                      struct btrfs_file_extent_item *item)
  {
@@@ -6785,7 -6783,7 +6784,7 @@@ again
                 * Chances are we'll be called again, so go ahead and do
                 * readahead
                 */
-               path->reada = 1;
+               path->reada = READA_FORWARD;
        }
  
        ret = btrfs_lookup_file_extent(trans, root, path,
@@@ -6884,8 -6882,7 +6883,7 @@@ next
                if (create == 0 && !PageUptodate(page)) {
                        if (btrfs_file_extent_compression(leaf, item) !=
                            BTRFS_COMPRESS_NONE) {
-                               ret = uncompress_inline(path, inode, page,
-                                                       pg_offset,
+                               ret = uncompress_inline(path, page, pg_offset,
                                                        extent_offset, item);
                                if (ret) {
                                        err = ret;
@@@ -7381,7 -7378,7 +7379,7 @@@ static int lock_extent_direct(struct in
  
        while (1) {
                lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
-                                0, cached_state);
+                                cached_state);
                /*
                 * We're concerned with the entire range that we're going to be
                 * doing DIO to, so we need to make sure theres no ordered
                        btrfs_start_ordered_extent(inode, ordered, 1);
                        btrfs_put_ordered_extent(ordered);
                } else {
-                       /* Screw you mmap */
-                       ret = btrfs_fdatawrite_range(inode, lockstart, lockend);
-                       if (ret)
-                               break;
-                       ret = filemap_fdatawait_range(inode->i_mapping,
-                                                     lockstart,
-                                                     lockend);
-                       if (ret)
-                               break;
                        /*
-                        * If we found a page that couldn't be invalidated just
-                        * fall back to buffered.
+                        * We could trigger writeback for this range (and wait
+                        * for it to complete) and then invalidate the pages for
+                        * this range (through invalidate_inode_pages2_range()),
+                        * but that can lead us to a deadlock with a concurrent
+                        * call to readpages() (a buffered read or a defrag call
+                        * triggered a readahead) on a page lock due to an
+                        * ordered dio extent we created before but did not have
+                        * yet a corresponding bio submitted (hence it cannot
+                        * complete), which makes readpages() wait for that
+                        * ordered extent to complete while holding a lock on
+                        * that page.
                         */
-                       ret = invalidate_inode_pages2_range(inode->i_mapping,
-                                       lockstart >> PAGE_CACHE_SHIFT,
-                                       lockend >> PAGE_CACHE_SHIFT);
-                       if (ret)
-                               break;
+                       ret = -ENOTBLK;
+                       break;
                }
  
                cond_resched();
@@@ -7483,11 -7476,6 +7477,6 @@@ static struct extent_map *create_pinned
        return em;
  }
  
- struct btrfs_dio_data {
-       u64 outstanding_extents;
-       u64 reserve;
- };
  static void adjust_dio_outstanding_extents(struct inode *inode,
                                           struct btrfs_dio_data *dio_data,
                                           const u64 len)
@@@ -7671,6 -7659,7 +7660,7 @@@ unlock
                btrfs_free_reserved_data_space(inode, start, len);
                WARN_ON(dio_data->reserve < len);
                dio_data->reserve -= len;
+               dio_data->unsubmitted_oe_range_end = start + len;
                current->journal_info = dio_data;
        }
  
@@@ -7993,22 -7982,22 +7983,22 @@@ static void btrfs_endio_direct_read(str
        bio_put(bio);
  }
  
- static void btrfs_endio_direct_write(struct bio *bio)
+ static void btrfs_endio_direct_write_update_ordered(struct inode *inode,
+                                                   const u64 offset,
+                                                   const u64 bytes,
+                                                   const int uptodate)
  {
-       struct btrfs_dio_private *dip = bio->bi_private;
-       struct inode *inode = dip->inode;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_ordered_extent *ordered = NULL;
-       u64 ordered_offset = dip->logical_offset;
-       u64 ordered_bytes = dip->bytes;
-       struct bio *dio_bio;
+       u64 ordered_offset = offset;
+       u64 ordered_bytes = bytes;
        int ret;
  
  again:
        ret = btrfs_dec_test_first_ordered_pending(inode, &ordered,
                                                   &ordered_offset,
                                                   ordered_bytes,
-                                                  !bio->bi_error);
+                                                  uptodate);
        if (!ret)
                goto out_test;
  
@@@ -8021,13 -8010,22 +8011,22 @@@ out_test
         * our bio might span multiple ordered extents.  If we haven't
         * completed the accounting for the whole dio, go back and try again
         */
-       if (ordered_offset < dip->logical_offset + dip->bytes) {
-               ordered_bytes = dip->logical_offset + dip->bytes -
-                       ordered_offset;
+       if (ordered_offset < offset + bytes) {
+               ordered_bytes = offset + bytes - ordered_offset;
                ordered = NULL;
                goto again;
        }
-       dio_bio = dip->dio_bio;
+ }
+ static void btrfs_endio_direct_write(struct bio *bio)
+ {
+       struct btrfs_dio_private *dip = bio->bi_private;
+       struct bio *dio_bio = dip->dio_bio;
+       btrfs_endio_direct_write_update_ordered(dip->inode,
+                                               dip->logical_offset,
+                                               dip->bytes,
+                                               !bio->bi_error);
  
        kfree(dip);
  
@@@ -8335,6 -8333,21 +8334,21 @@@ static void btrfs_submit_direct(int rw
                dip->subio_endio = btrfs_subio_endio_read;
        }
  
+       /*
+        * Reset the range for unsubmitted ordered extents (to a 0 length range)
+        * even if we fail to submit a bio, because in such case we do the
+        * corresponding error handling below and it must not be done a second
+        * time by btrfs_direct_IO().
+        */
+       if (write) {
+               struct btrfs_dio_data *dio_data = current->journal_info;
+               dio_data->unsubmitted_oe_range_end = dip->logical_offset +
+                       dip->bytes;
+               dio_data->unsubmitted_oe_range_start =
+                       dio_data->unsubmitted_oe_range_end;
+       }
        ret = btrfs_submit_direct_hook(rw, dip, skip_sum);
        if (!ret)
                return;
@@@ -8363,24 -8376,15 +8377,15 @@@ free_ordered
                dip = NULL;
                io_bio = NULL;
        } else {
-               if (write) {
-                       struct btrfs_ordered_extent *ordered;
-                       ordered = btrfs_lookup_ordered_extent(inode,
-                                                             file_offset);
-                       set_bit(BTRFS_ORDERED_IOERR, &ordered->flags);
-                       /*
-                        * Decrements our ref on the ordered extent and removes
-                        * the ordered extent from the inode's ordered tree,
-                        * doing all the proper resource cleanup such as for the
-                        * reserved space and waking up any waiters for this
-                        * ordered extent (through btrfs_remove_ordered_extent).
-                        */
-                       btrfs_finish_ordered_io(ordered);
-               } else {
+               if (write)
+                       btrfs_endio_direct_write_update_ordered(inode,
+                                               file_offset,
+                                               dio_bio->bi_iter.bi_size,
+                                               0);
+               else
                        unlock_extent(&BTRFS_I(inode)->io_tree, file_offset,
                              file_offset + dio_bio->bi_iter.bi_size - 1);
-               }
                dio_bio->bi_error = -EIO;
                /*
                 * Releases and cleans up our dio_bio, no need to bio_put()
@@@ -8480,6 -8484,8 +8485,8 @@@ static ssize_t btrfs_direct_IO(struct k
                 * originally calculated.  Abuse current->journal_info for this.
                 */
                dio_data.reserve = round_up(count, root->sectorsize);
+               dio_data.unsubmitted_oe_range_start = (u64)offset;
+               dio_data.unsubmitted_oe_range_end = (u64)offset;
                current->journal_info = &dio_data;
        } else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
                                     &BTRFS_I(inode)->runtime_flags)) {
                        if (dio_data.reserve)
                                btrfs_delalloc_release_space(inode, offset,
                                                             dio_data.reserve);
+                       /*
+                        * On error we might have left some ordered extents
+                        * without submitting corresponding bios for them, so
+                        * clean them up to avoid other tasks getting them
+                        * and waiting for them to complete forever.
+                        */
+                       if (dio_data.unsubmitted_oe_range_start <
+                           dio_data.unsubmitted_oe_range_end)
+                               btrfs_endio_direct_write_update_ordered(inode,
+                                       dio_data.unsubmitted_oe_range_start,
+                                       dio_data.unsubmitted_oe_range_end -
+                                       dio_data.unsubmitted_oe_range_start,
+                                       0);
                } else if (ret >= 0 && (size_t)ret < count)
                        btrfs_delalloc_release_space(inode, offset,
                                                     count - (size_t)ret);
@@@ -8535,15 -8554,28 +8555,28 @@@ int btrfs_readpage(struct file *file, s
  static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
  {
        struct extent_io_tree *tree;
+       struct inode *inode = page->mapping->host;
+       int ret;
  
        if (current->flags & PF_MEMALLOC) {
                redirty_page_for_writepage(wbc, page);
                unlock_page(page);
                return 0;
        }
+       /*
+        * If we are under memory pressure we will call this directly from the
+        * VM, we need to make sure we have the inode referenced for the ordered
+        * extent.  If not just return like we didn't do anything.
+        */
+       if (!igrab(inode)) {
+               redirty_page_for_writepage(wbc, page);
+               return AOP_WRITEPAGE_ACTIVATE;
+       }
        tree = &BTRFS_I(page->mapping->host)->io_tree;
-       return extent_write_full_page(tree, page, btrfs_get_extent, wbc);
+       ret = extent_write_full_page(tree, page, btrfs_get_extent, wbc);
+       btrfs_add_delayed_iput(inode);
+       return ret;
  }
  
  static int btrfs_writepages(struct address_space *mapping,
@@@ -8615,7 -8647,7 +8648,7 @@@ static void btrfs_invalidatepage(struc
        }
  
        if (!inode_evicting)
-               lock_extent_bits(tree, page_start, page_end, 0, &cached_state);
+               lock_extent_bits(tree, page_start, page_end, &cached_state);
        ordered = btrfs_lookup_ordered_extent(inode, page_start);
        if (ordered) {
                /*
                btrfs_put_ordered_extent(ordered);
                if (!inode_evicting) {
                        cached_state = NULL;
-                       lock_extent_bits(tree, page_start, page_end, 0,
+                       lock_extent_bits(tree, page_start, page_end,
                                         &cached_state);
                }
        }
@@@ -8751,7 -8783,7 +8784,7 @@@ again
        }
        wait_on_page_writeback(page);
  
-       lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state);
+       lock_extent_bits(io_tree, page_start, page_end, &cached_state);
        set_page_extent_mapped(page);
  
        /*
@@@ -9025,6 -9057,7 +9058,7 @@@ struct inode *btrfs_alloc_inode(struct 
        ei->dir_index = 0;
        ei->last_unlink_trans = 0;
        ei->last_log_commit = 0;
+       ei->delayed_iput_count = 0;
  
        spin_lock_init(&ei->lock);
        ei->outstanding_extents = 0;
        mutex_init(&ei->delalloc_mutex);
        btrfs_ordered_inode_tree_init(&ei->ordered_tree);
        INIT_LIST_HEAD(&ei->delalloc_inodes);
+       INIT_LIST_HEAD(&ei->delayed_iput);
        RB_CLEAR_NODE(&ei->rb_node);
  
        return inode;
@@@ -9153,16 -9187,13 +9188,14 @@@ void btrfs_destroy_cachep(void
                kmem_cache_destroy(btrfs_path_cachep);
        if (btrfs_free_space_cachep)
                kmem_cache_destroy(btrfs_free_space_cachep);
-       if (btrfs_delalloc_work_cachep)
-               kmem_cache_destroy(btrfs_delalloc_work_cachep);
  }
  
  int btrfs_init_cachep(void)
  {
        btrfs_inode_cachep = kmem_cache_create("btrfs_inode",
                        sizeof(struct btrfs_inode), 0,
 -                      SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, init_once);
 +                      SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT,
 +                      init_once);
        if (!btrfs_inode_cachep)
                goto fail;
  
        if (!btrfs_free_space_cachep)
                goto fail;
  
-       btrfs_delalloc_work_cachep = kmem_cache_create("btrfs_delalloc_work",
-                       sizeof(struct btrfs_delalloc_work), 0,
-                       SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
-                       NULL);
-       if (!btrfs_delalloc_work_cachep)
-               goto fail;
        return 0;
  fail:
        btrfs_destroy_cachep();
@@@ -9420,14 -9444,10 +9446,10 @@@ static void btrfs_run_delalloc_work(str
        delalloc_work = container_of(work, struct btrfs_delalloc_work,
                                     work);
        inode = delalloc_work->inode;
-       if (delalloc_work->wait) {
-               btrfs_wait_ordered_range(inode, 0, (u64)-1);
-       } else {
+       filemap_flush(inode->i_mapping);
+       if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+                               &BTRFS_I(inode)->runtime_flags))
                filemap_flush(inode->i_mapping);
-               if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
-                            &BTRFS_I(inode)->runtime_flags))
-                       filemap_flush(inode->i_mapping);
-       }
  
        if (delalloc_work->delay_iput)
                btrfs_add_delayed_iput(inode);
  }
  
  struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
-                                                   int wait, int delay_iput)
+                                                   int delay_iput)
  {
        struct btrfs_delalloc_work *work;
  
-       work = kmem_cache_zalloc(btrfs_delalloc_work_cachep, GFP_NOFS);
+       work = kmalloc(sizeof(*work), GFP_NOFS);
        if (!work)
                return NULL;
  
        init_completion(&work->completion);
        INIT_LIST_HEAD(&work->list);
        work->inode = inode;
-       work->wait = wait;
        work->delay_iput = delay_iput;
        WARN_ON_ONCE(!inode);
        btrfs_init_work(&work->work, btrfs_flush_delalloc_helper,
  void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work)
  {
        wait_for_completion(&work->completion);
-       kmem_cache_free(btrfs_delalloc_work_cachep, work);
+       kfree(work);
  }
  
  /*
@@@ -9496,7 -9515,7 +9517,7 @@@ static int __start_delalloc_inodes(stru
                }
                spin_unlock(&root->delalloc_lock);
  
-               work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
+               work = btrfs_alloc_delalloc_work(inode, delay_iput);
                if (!work) {
                        if (delay_iput)
                                btrfs_add_delayed_iput(inode);
@@@ -9638,9 -9657,11 +9659,11 @@@ static int btrfs_symlink(struct inode *
        /*
         * 2 items for inode item and ref
         * 2 items for dir items
+        * 1 item for updating parent inode item
+        * 1 item for the inline extent item
         * 1 item for xattr if selinux is on
         */
-       trans = btrfs_start_transaction(root, 5);
+       trans = btrfs_start_transaction(root, 7);
        if (IS_ERR(trans))
                return PTR_ERR(trans);
  
        if (err)
                goto out_unlock_inode;
  
-       err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
-       if (err)
-               goto out_unlock_inode;
        path = btrfs_alloc_path();
        if (!path) {
                err = -ENOMEM;
        btrfs_free_path(path);
  
        inode->i_op = &btrfs_symlink_inode_operations;
 +      inode_nohighmem(inode);
        inode->i_mapping->a_ops = &btrfs_symlink_aops;
        inode_set_bytes(inode, name_len);
        btrfs_i_size_write(inode, name_len);
        err = btrfs_update_inode(trans, root, inode);
+       /*
+        * Last step, add directory indexes for our symlink inode. This is the
+        * last step to avoid extra cleanup of these indexes if an error happens
+        * elsewhere above.
+        */
+       if (!err)
+               err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
        if (err) {
                drop_inode = 1;
                goto out_unlock_inode;
@@@ -9762,7 -9785,7 +9788,7 @@@ static int __btrfs_prealloc_file_range(
                        }
                }
  
-               cur_bytes = min(num_bytes, 256ULL * 1024 * 1024);
+               cur_bytes = min_t(u64, num_bytes, SZ_256M);
                cur_bytes = max(cur_bytes, min_size);
                /*
                 * If we are severely fragmented we could end up with really
@@@ -9997,7 -10020,7 +10023,7 @@@ static const struct inode_operations bt
        .setattr        = btrfs_setattr,
        .mknod          = btrfs_mknod,
        .setxattr       = btrfs_setxattr,
 -      .getxattr       = btrfs_getxattr,
 +      .getxattr       = generic_getxattr,
        .listxattr      = btrfs_listxattr,
        .removexattr    = btrfs_removexattr,
        .permission     = btrfs_permission,
@@@ -10026,7 -10049,7 +10052,7 @@@ static const struct file_operations btr
        .fsync          = btrfs_sync_file,
  };
  
- static struct extent_io_ops btrfs_extent_io_ops = {
+ static const struct extent_io_ops btrfs_extent_io_ops = {
        .fill_delalloc = run_delalloc_range,
        .submit_bio_hook = btrfs_submit_bio_hook,
        .merge_bio_hook = btrfs_merge_bio_hook,
@@@ -10074,7 -10097,7 +10100,7 @@@ static const struct inode_operations bt
        .getattr        = btrfs_getattr,
        .setattr        = btrfs_setattr,
        .setxattr       = btrfs_setxattr,
 -      .getxattr       = btrfs_getxattr,
 +      .getxattr       = generic_getxattr,
        .listxattr      = btrfs_listxattr,
        .removexattr    = btrfs_removexattr,
        .permission     = btrfs_permission,
@@@ -10088,7 -10111,7 +10114,7 @@@ static const struct inode_operations bt
        .setattr        = btrfs_setattr,
        .permission     = btrfs_permission,
        .setxattr       = btrfs_setxattr,
 -      .getxattr       = btrfs_getxattr,
 +      .getxattr       = generic_getxattr,
        .listxattr      = btrfs_listxattr,
        .removexattr    = btrfs_removexattr,
        .get_acl        = btrfs_get_acl,
  };
  static const struct inode_operations btrfs_symlink_inode_operations = {
        .readlink       = generic_readlink,
 -      .follow_link    = page_follow_link_light,
 -      .put_link       = page_put_link,
 +      .get_link       = page_get_link,
        .getattr        = btrfs_getattr,
        .setattr        = btrfs_setattr,
        .permission     = btrfs_permission,
        .setxattr       = btrfs_setxattr,
 -      .getxattr       = btrfs_getxattr,
 +      .getxattr       = generic_getxattr,
        .listxattr      = btrfs_listxattr,
        .removexattr    = btrfs_removexattr,
        .update_time    = btrfs_update_time,
diff --combined fs/btrfs/ioctl.c
index e21997385d148c7ede78fd5874e594577ed8a9e9,e392dd67f0ba07dd943dcc650175a0ab8a79cceb..2a47a3148ec80df57150e3f5aa7d321abb8d1ccd
@@@ -655,22 -655,28 +655,28 @@@ static int create_snapshot(struct btrfs
        if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
                return -EINVAL;
  
+       pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
+       if (!pending_snapshot)
+               return -ENOMEM;
+       pending_snapshot->root_item = kzalloc(sizeof(struct btrfs_root_item),
+                       GFP_NOFS);
+       pending_snapshot->path = btrfs_alloc_path();
+       if (!pending_snapshot->root_item || !pending_snapshot->path) {
+               ret = -ENOMEM;
+               goto free_pending;
+       }
        atomic_inc(&root->will_be_snapshoted);
        smp_mb__after_atomic();
        btrfs_wait_for_no_snapshoting_writes(root);
  
        ret = btrfs_start_delalloc_inodes(root, 0);
        if (ret)
-               goto out;
+               goto dec_and_free;
  
        btrfs_wait_ordered_extents(root, -1);
  
-       pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
-       if (!pending_snapshot) {
-               ret = -ENOMEM;
-               goto out;
-       }
        btrfs_init_block_rsv(&pending_snapshot->block_rsv,
                             BTRFS_BLOCK_RSV_TEMP);
        /*
                                        &pending_snapshot->qgroup_reserved,
                                        false);
        if (ret)
-               goto free;
+               goto dec_and_free;
  
        pending_snapshot->dentry = dentry;
        pending_snapshot->root = root;
@@@ -737,11 -743,14 +743,14 @@@ fail
        btrfs_subvolume_release_metadata(BTRFS_I(dir)->root,
                                         &pending_snapshot->block_rsv,
                                         pending_snapshot->qgroup_reserved);
- free:
-       kfree(pending_snapshot);
- out:
+ dec_and_free:
        if (atomic_dec_and_test(&root->will_be_snapshoted))
                wake_up_atomic_t(&root->will_be_snapshoted);
+ free_pending:
+       kfree(pending_snapshot->root_item);
+       btrfs_free_path(pending_snapshot->path);
+       kfree(pending_snapshot);
        return ret;
  }
  
@@@ -992,7 -1001,7 +1001,7 @@@ static struct extent_map *defrag_lookup
                u64 end = start + len - 1;
  
                /* get the big lock and read metadata off disk */
-               lock_extent_bits(io_tree, start, end, 0, &cached);
+               lock_extent_bits(io_tree, start, end, &cached);
                em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
                unlock_extent_cached(io_tree, start, end, &cached, GFP_NOFS);
  
@@@ -1016,7 -1025,7 +1025,7 @@@ static bool defrag_check_next_extent(st
        if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE)
                ret = false;
        else if ((em->block_start + em->block_len == next->block_start) &&
-                (em->block_len > 128 * 1024 && next->block_len > 128 * 1024))
+                (em->block_len > SZ_128K && next->block_len > SZ_128K))
                ret = false;
  
        free_extent_map(next);
@@@ -1140,7 -1149,7 +1149,7 @@@ again
                page_end = page_start + PAGE_CACHE_SIZE - 1;
                while (1) {
                        lock_extent_bits(tree, page_start, page_end,
-                                        0, &cached_state);
+                                        &cached_state);
                        ordered = btrfs_lookup_ordered_extent(inode,
                                                              page_start);
                        unlock_extent_cached(tree, page_start, page_end,
        page_end = page_offset(pages[i_done - 1]) + PAGE_CACHE_SIZE;
  
        lock_extent_bits(&BTRFS_I(inode)->io_tree,
-                        page_start, page_end - 1, 0, &cached_state);
+                        page_start, page_end - 1, &cached_state);
        clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start,
                          page_end - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
                          EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0,
@@@ -1262,9 -1271,9 +1271,9 @@@ int btrfs_defrag_file(struct inode *ino
        int defrag_count = 0;
        int compress_type = BTRFS_COMPRESS_ZLIB;
        u32 extent_thresh = range->extent_thresh;
-       unsigned long max_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT;
+       unsigned long max_cluster = SZ_256K >> PAGE_CACHE_SHIFT;
        unsigned long cluster = max_cluster;
-       u64 new_align = ~((u64)128 * 1024 - 1);
+       u64 new_align = ~((u64)SZ_128K - 1);
        struct page **pages = NULL;
  
        if (isize == 0)
        }
  
        if (extent_thresh == 0)
-               extent_thresh = 256 * 1024;
+               extent_thresh = SZ_256K;
  
        /*
         * if we were not given a file, allocate a readahead
  
        if (newer_than) {
                ret = find_new_extents(root, inode, newer_than,
-                                      &newer_off, 64 * 1024);
+                                      &newer_off, SZ_64K);
                if (!ret) {
                        range->start = newer_off;
                        /*
                        newer_off = max(newer_off + 1,
                                        (u64)i << PAGE_CACHE_SHIFT);
  
-                       ret = find_new_extents(root, inode,
-                                              newer_than, &newer_off,
-                                              64 * 1024);
+                       ret = find_new_extents(root, inode, newer_than,
+                                              &newer_off, SZ_64K);
                        if (!ret) {
                                range->start = newer_off;
                                i = (newer_off & new_align) >> PAGE_CACHE_SHIFT;
@@@ -1571,7 -1579,7 +1579,7 @@@ static noinline int btrfs_ioctl_resize(
                new_size = old_size + new_size;
        }
  
-       if (new_size < 256 * 1024 * 1024) {
+       if (new_size < SZ_256M) {
                ret = -EINVAL;
                goto out_free;
        }
@@@ -2160,7 -2168,7 +2168,7 @@@ static noinline int btrfs_ioctl_tree_se
        struct inode *inode;
        int ret;
        size_t buf_size;
-       const size_t buf_limit = 16 * 1024 * 1024;
+       const size_t buf_limit = SZ_16M;
  
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
@@@ -2962,7 -2970,7 +2970,7 @@@ static int btrfs_cmp_data(struct inode 
                flush_dcache_page(dst_page);
  
                if (memcmp(addr, dst_addr, cmp_len))
 -                      ret = BTRFS_SAME_DATA_DIFFERS;
 +                      ret = -EBADE;
  
                kunmap_atomic(addr);
                kunmap_atomic(dst_addr);
@@@ -3096,18 -3104,55 +3104,18 @@@ out_unlock
        return ret;
  }
  
- #define BTRFS_MAX_DEDUPE_LEN  (16 * 1024 * 1024)
+ #define BTRFS_MAX_DEDUPE_LEN  SZ_16M
  
 -static long btrfs_ioctl_file_extent_same(struct file *file,
 -                      struct btrfs_ioctl_same_args __user *argp)
 +ssize_t btrfs_dedupe_file_range(struct file *src_file, u64 loff, u64 olen,
 +                              struct file *dst_file, u64 dst_loff)
  {
 -      struct btrfs_ioctl_same_args *same = NULL;
 -      struct btrfs_ioctl_same_extent_info *info;
 -      struct inode *src = file_inode(file);
 -      u64 off;
 -      u64 len;
 -      int i;
 -      int ret;
 -      unsigned long size;
 +      struct inode *src = file_inode(src_file);
 +      struct inode *dst = file_inode(dst_file);
        u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize;
 -      bool is_admin = capable(CAP_SYS_ADMIN);
 -      u16 count;
 -
 -      if (!(file->f_mode & FMODE_READ))
 -              return -EINVAL;
 -
 -      ret = mnt_want_write_file(file);
 -      if (ret)
 -              return ret;
 -
 -      if (get_user(count, &argp->dest_count)) {
 -              ret = -EFAULT;
 -              goto out;
 -      }
 -
 -      size = offsetof(struct btrfs_ioctl_same_args __user, info[count]);
 -
 -      same = memdup_user(argp, size);
 -
 -      if (IS_ERR(same)) {
 -              ret = PTR_ERR(same);
 -              same = NULL;
 -              goto out;
 -      }
 +      ssize_t res;
  
 -      off = same->logical_offset;
 -      len = same->length;
 -
 -      /*
 -       * Limit the total length we will dedupe for each operation.
 -       * This is intended to bound the total time spent in this
 -       * ioctl to something sane.
 -       */
 -      if (len > BTRFS_MAX_DEDUPE_LEN)
 -              len = BTRFS_MAX_DEDUPE_LEN;
 +      if (olen > BTRFS_MAX_DEDUPE_LEN)
 +              olen = BTRFS_MAX_DEDUPE_LEN;
  
        if (WARN_ON_ONCE(bs < PAGE_CACHE_SIZE)) {
                /*
                 * result, btrfs_cmp_data() won't correctly handle
                 * this situation without an update.
                 */
 -              ret = -EINVAL;
 -              goto out;
 -      }
 -
 -      ret = -EISDIR;
 -      if (S_ISDIR(src->i_mode))
 -              goto out;
 -
 -      ret = -EACCES;
 -      if (!S_ISREG(src->i_mode))
 -              goto out;
 -
 -      /* pre-format output fields to sane values */
 -      for (i = 0; i < count; i++) {
 -              same->info[i].bytes_deduped = 0ULL;
 -              same->info[i].status = 0;
 -      }
 -
 -      for (i = 0, info = same->info; i < count; i++, info++) {
 -              struct inode *dst;
 -              struct fd dst_file = fdget(info->fd);
 -              if (!dst_file.file) {
 -                      info->status = -EBADF;
 -                      continue;
 -              }
 -              dst = file_inode(dst_file.file);
 -
 -              if (!(is_admin || (dst_file.file->f_mode & FMODE_WRITE))) {
 -                      info->status = -EINVAL;
 -              } else if (file->f_path.mnt != dst_file.file->f_path.mnt) {
 -                      info->status = -EXDEV;
 -              } else if (S_ISDIR(dst->i_mode)) {
 -                      info->status = -EISDIR;
 -              } else if (!S_ISREG(dst->i_mode)) {
 -                      info->status = -EACCES;
 -              } else {
 -                      info->status = btrfs_extent_same(src, off, len, dst,
 -                                                      info->logical_offset);
 -                      if (info->status == 0)
 -                              info->bytes_deduped += len;
 -              }
 -              fdput(dst_file);
 +              return -EINVAL;
        }
  
 -      ret = copy_to_user(argp, same, size);
 -      if (ret)
 -              ret = -EFAULT;
 -
 -out:
 -      mnt_drop_write_file(file);
 -      kfree(same);
 -      return ret;
 +      res = btrfs_extent_same(src, loff, olen, dst, dst_loff);
 +      if (res)
 +              return res;
 +      return olen;
  }
  
  static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
@@@ -3396,7 -3486,7 +3404,7 @@@ static int btrfs_clone(struct inode *sr
                return ret;
        }
  
-       path->reada = 2;
+       path->reada = READA_FORWARD;
        /* clone data */
        key.objectid = btrfs_ino(src);
        key.type = BTRFS_EXTENT_DATA_KEY;
        return ret;
  }
  
 -static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 -                                     u64 off, u64 olen, u64 destoff)
 +static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
 +                                      u64 off, u64 olen, u64 destoff)
  {
        struct inode *inode = file_inode(file);
 +      struct inode *src = file_inode(file_src);
        struct btrfs_root *root = BTRFS_I(inode)->root;
 -      struct fd src_file;
 -      struct inode *src;
        int ret;
        u64 len = olen;
        u64 bs = root->fs_info->sb->s_blocksize;
 -      int same_inode = 0;
 +      int same_inode = src == inode;
  
        /*
         * TODO:
         *   be either compressed or non-compressed.
         */
  
 -      /* the destination must be opened for writing */
 -      if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND))
 -              return -EINVAL;
 -
        if (btrfs_root_readonly(root))
                return -EROFS;
  
 -      ret = mnt_want_write_file(file);
 -      if (ret)
 -              return ret;
 -
 -      src_file = fdget(srcfd);
 -      if (!src_file.file) {
 -              ret = -EBADF;
 -              goto out_drop_write;
 -      }
 -
 -      ret = -EXDEV;
 -      if (src_file.file->f_path.mnt != file->f_path.mnt)
 -              goto out_fput;
 -
 -      src = file_inode(src_file.file);
 -
 -      ret = -EINVAL;
 -      if (src == inode)
 -              same_inode = 1;
 -
 -      /* the src must be open for reading */
 -      if (!(src_file.file->f_mode & FMODE_READ))
 -              goto out_fput;
 +      if (file_src->f_path.mnt != file->f_path.mnt ||
 +          src->i_sb != inode->i_sb)
 +              return -EXDEV;
  
        /* don't make the dst file partly checksummed */
        if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) !=
            (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM))
 -              goto out_fput;
 +              return -EINVAL;
  
 -      ret = -EISDIR;
        if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode))
 -              goto out_fput;
 -
 -      ret = -EXDEV;
 -      if (src->i_sb != inode->i_sb)
 -              goto out_fput;
 +              return -EISDIR;
  
        if (!same_inode) {
                btrfs_double_inode_lock(src, inode);
@@@ -3809,25 -3929,21 +3817,25 @@@ out_unlock
                btrfs_double_inode_unlock(src, inode);
        else
                mutex_unlock(&src->i_mutex);
 -out_fput:
 -      fdput(src_file);
 -out_drop_write:
 -      mnt_drop_write_file(file);
        return ret;
  }
  
 -static long btrfs_ioctl_clone_range(struct file *file, void __user *argp)
 +ssize_t btrfs_copy_file_range(struct file *file_in, loff_t pos_in,
 +                            struct file *file_out, loff_t pos_out,
 +                            size_t len, unsigned int flags)
  {
 -      struct btrfs_ioctl_clone_range_args args;
 +      ssize_t ret;
  
 -      if (copy_from_user(&args, argp, sizeof(args)))
 -              return -EFAULT;
 -      return btrfs_ioctl_clone(file, args.src_fd, args.src_offset,
 -                               args.src_length, args.dest_offset);
 +      ret = btrfs_clone_files(file_out, file_in, pos_in, len, pos_out);
 +      if (ret == 0)
 +              ret = len;
 +      return ret;
 +}
 +
 +int btrfs_clone_file_range(struct file *src_file, loff_t off,
 +              struct file *dst_file, loff_t destoff, u64 len)
 +{
 +      return btrfs_clone_files(dst_file, src_file, off, len, destoff);
  }
  
  /*
@@@ -4039,7 -4155,7 +4047,7 @@@ static long btrfs_ioctl_space_info(stru
                return -ENOMEM;
  
        space_args.total_spaces = 0;
-       dest = kmalloc(alloc_size, GFP_NOFS);
+       dest = kmalloc(alloc_size, GFP_KERNEL);
        if (!dest)
                return -ENOMEM;
        dest_orig = dest;
@@@ -4416,7 -4532,7 +4424,7 @@@ static long btrfs_ioctl_logical_to_ino(
                goto out;
        }
  
-       size = min_t(u32, loi->size, 64 * 1024);
+       size = min_t(u32, loi->size, SZ_64K);
        inodes = init_data_container(size);
        if (IS_ERR(inodes)) {
                ret = PTR_ERR(inodes);
@@@ -4565,7 -4681,7 +4573,7 @@@ locked
                goto out_bargs;
        }
  
-       bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
+       bctl = kzalloc(sizeof(*bctl), GFP_KERNEL);
        if (!bctl) {
                ret = -ENOMEM;
                goto out_bargs;
@@@ -4651,7 -4767,7 +4659,7 @@@ static long btrfs_ioctl_balance_progres
                goto out;
        }
  
-       bargs = kzalloc(sizeof(*bargs), GFP_NOFS);
+       bargs = kzalloc(sizeof(*bargs), GFP_KERNEL);
        if (!bargs) {
                ret = -ENOMEM;
                goto out;
@@@ -4911,7 -5027,7 +4919,7 @@@ static long btrfs_ioctl_quota_rescan_st
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
  
-       qsa = kzalloc(sizeof(*qsa), GFP_NOFS);
+       qsa = kzalloc(sizeof(*qsa), GFP_KERNEL);
        if (!qsa)
                return -ENOMEM;
  
@@@ -5041,7 -5157,7 +5049,7 @@@ static long btrfs_ioctl_set_received_su
                goto out;
        }
  
-       args64 = kmalloc(sizeof(*args64), GFP_NOFS);
+       args64 = kmalloc(sizeof(*args64), GFP_KERNEL);
        if (!args64) {
                ret = -ENOMEM;
                goto out;
@@@ -5178,7 -5294,7 +5186,7 @@@ out_unlock
  static int btrfs_ioctl_get_supported_features(struct file *file,
                                              void __user *arg)
  {
-       static struct btrfs_ioctl_feature_flags features[3] = {
+       static const struct btrfs_ioctl_feature_flags features[3] = {
                INIT_FEATURE_FLAGS(SUPP),
                INIT_FEATURE_FLAGS(SAFE_SET),
                INIT_FEATURE_FLAGS(SAFE_CLEAR)
@@@ -5377,6 -5493,10 +5385,6 @@@ long btrfs_ioctl(struct file *file, uns
                return btrfs_ioctl_dev_info(root, argp);
        case BTRFS_IOC_BALANCE:
                return btrfs_ioctl_balance(file, NULL);
 -      case BTRFS_IOC_CLONE:
 -              return btrfs_ioctl_clone(file, arg, 0, 0, 0);
 -      case BTRFS_IOC_CLONE_RANGE:
 -              return btrfs_ioctl_clone_range(file, argp);
        case BTRFS_IOC_TRANS_START:
                return btrfs_ioctl_trans_start(file);
        case BTRFS_IOC_TRANS_END:
                return btrfs_ioctl_get_fslabel(file, argp);
        case BTRFS_IOC_SET_FSLABEL:
                return btrfs_ioctl_set_fslabel(file, argp);
 -      case BTRFS_IOC_FILE_EXTENT_SAME:
 -              return btrfs_ioctl_file_extent_same(file, argp);
        case BTRFS_IOC_GET_SUPPORTED_FEATURES:
                return btrfs_ioctl_get_supported_features(file, argp);
        case BTRFS_IOC_GET_FEATURES:
diff --combined fs/btrfs/super.c
index a0434c179ea96b9f1308e7034066202b30cb4267,86f7fdc0563388b9122bf6abdb88b68ebec2ac85..9b9eab6d048e93d32963b0c66a6d9ab6c022ee63
@@@ -295,10 -295,11 +295,11 @@@ enum 
        Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress,
        Opt_compress_type, Opt_compress_force, Opt_compress_force_type,
        Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard,
-       Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed,
-       Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, Opt_inode_cache,
-       Opt_no_space_cache, Opt_recovery, Opt_skip_balance,
-       Opt_check_integrity, Opt_check_integrity_including_extent_data,
+       Opt_space_cache, Opt_space_cache_version, Opt_clear_cache,
+       Opt_user_subvol_rm_allowed, Opt_enospc_debug, Opt_subvolrootid,
+       Opt_defrag, Opt_inode_cache, Opt_no_space_cache, Opt_recovery,
+       Opt_skip_balance, Opt_check_integrity,
+       Opt_check_integrity_including_extent_data,
        Opt_check_integrity_print_mask, Opt_fatal_errors, Opt_rescan_uuid_tree,
        Opt_commit_interval, Opt_barrier, Opt_nodefrag, Opt_nodiscard,
        Opt_noenospc_debug, Opt_noflushoncommit, Opt_acl, Opt_datacow,
        Opt_err,
  };
  
- static match_table_t tokens = {
+ static const match_table_t tokens = {
        {Opt_degraded, "degraded"},
        {Opt_subvol, "subvol=%s"},
        {Opt_subvolid, "subvolid=%s"},
        {Opt_discard, "discard"},
        {Opt_nodiscard, "nodiscard"},
        {Opt_space_cache, "space_cache"},
+       {Opt_space_cache_version, "space_cache=%s"},
        {Opt_clear_cache, "clear_cache"},
        {Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"},
        {Opt_enospc_debug, "enospc_debug"},
@@@ -383,7 -385,9 +385,9 @@@ int btrfs_parse_options(struct btrfs_ro
        bool compress_force = false;
  
        cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
-       if (cache_gen)
+       if (btrfs_fs_compat_ro(root->fs_info, FREE_SPACE_TREE))
+               btrfs_set_opt(info->mount_opt, FREE_SPACE_TREE);
+       else if (cache_gen)
                btrfs_set_opt(info->mount_opt, SPACE_CACHE);
  
        if (!options)
                                             "turning off discard");
                        break;
                case Opt_space_cache:
-                       btrfs_set_and_info(root, SPACE_CACHE,
-                                          "enabling disk space caching");
+               case Opt_space_cache_version:
+                       if (token == Opt_space_cache ||
+                           strcmp(args[0].from, "v1") == 0) {
+                               btrfs_clear_opt(root->fs_info->mount_opt,
+                                               FREE_SPACE_TREE);
+                               btrfs_set_and_info(root, SPACE_CACHE,
+                                                  "enabling disk space caching");
+                       } else if (strcmp(args[0].from, "v2") == 0) {
+                               btrfs_clear_opt(root->fs_info->mount_opt,
+                                               SPACE_CACHE);
+                               btrfs_set_and_info(root, FREE_SPACE_TREE,
+                                                  "enabling free space tree");
+                       } else {
+                               ret = -EINVAL;
+                               goto out;
+                       }
                        break;
                case Opt_rescan_uuid_tree:
                        btrfs_set_opt(info->mount_opt, RESCAN_UUID_TREE);
                        break;
                case Opt_no_space_cache:
-                       btrfs_clear_and_info(root, SPACE_CACHE,
-                                            "disabling disk space caching");
+                       if (btrfs_test_opt(root, SPACE_CACHE)) {
+                               btrfs_clear_and_info(root, SPACE_CACHE,
+                                                    "disabling disk space caching");
+                       }
+                       if (btrfs_test_opt(root, FREE_SPACE_TREE)) {
+                               btrfs_clear_and_info(root, FREE_SPACE_TREE,
+                                                    "disabling free space tree");
+                       }
                        break;
                case Opt_inode_cache:
                        btrfs_set_pending_and_info(info, INODE_MAP_CACHE,
                }
        }
  out:
+       if (btrfs_fs_compat_ro(root->fs_info, FREE_SPACE_TREE) &&
+           !btrfs_test_opt(root, FREE_SPACE_TREE) &&
+           !btrfs_test_opt(root, CLEAR_CACHE)) {
+               btrfs_err(root->fs_info, "cannot disable free space tree");
+               ret = -EINVAL;
+       }
        if (!ret && btrfs_test_opt(root, SPACE_CACHE))
                btrfs_info(root->fs_info, "disk space caching is enabled");
+       if (!ret && btrfs_test_opt(root, FREE_SPACE_TREE))
+               btrfs_info(root->fs_info, "using free space tree");
        kfree(orig);
        return ret;
  }
@@@ -1162,6 -1195,8 +1195,8 @@@ static int btrfs_show_options(struct se
                seq_puts(seq, ",noacl");
        if (btrfs_test_opt(root, SPACE_CACHE))
                seq_puts(seq, ",space_cache");
+       else if (btrfs_test_opt(root, FREE_SPACE_TREE))
+               seq_puts(seq, ",space_cache=v2");
        else
                seq_puts(seq, ",nospace_cache");
        if (btrfs_test_opt(root, RESCAN_UUID_TREE))
@@@ -1514,7 -1549,9 +1549,7 @@@ static struct dentry *btrfs_mount(struc
                if ((flags ^ s->s_flags) & MS_RDONLY)
                        error = -EBUSY;
        } else {
 -              char b[BDEVNAME_SIZE];
 -
 -              strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
 +              snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev);
                btrfs_sb(s)->bdev_holder = fs_type;
                error = btrfs_fill_super(s, fs_devices, data,
                                         flags & MS_SILENT ? 1 : 0);
@@@ -1863,7 -1900,7 +1898,7 @@@ static int btrfs_calc_avail_data_space(
                 * btrfs starts at an offset of at least 1MB when doing chunk
                 * allocation.
                 */
-               skip_space = 1024 * 1024;
+               skip_space = SZ_1M;
  
                /* user can set the offset in fs_info->alloc_start. */
                if (fs_info->alloc_start &&
   * there are other factors that may change the result (like a new metadata
   * chunk).
   *
+  * If metadata is exhausted, f_bavail will be 0.
+  *
   * FIXME: not accurate for mixed block groups, total and free/used are ok,
   * available appears slightly larger.
   */
@@@ -1965,11 -2004,13 +2002,13 @@@ static int btrfs_statfs(struct dentry *
        struct btrfs_space_info *found;
        u64 total_used = 0;
        u64 total_free_data = 0;
+       u64 total_free_meta = 0;
        int bits = dentry->d_sb->s_blocksize_bits;
        __be32 *fsid = (__be32 *)fs_info->fsid;
        unsigned factor = 1;
        struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
        int ret;
+       u64 thresh = 0;
  
        /*
         * holding chunk_muext to avoid allocating new chunks, holding
                                }
                        }
                }
+               if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
+                       total_free_meta += found->disk_total - found->disk_used;
  
                total_used += found->disk_used;
        }
        buf->f_bavail += div_u64(total_free_data, factor);
        buf->f_bavail = buf->f_bavail >> bits;
  
+       /*
+        * We calculate the remaining metadata space minus global reserve. If
+        * this is (supposedly) smaller than zero, there's no space. But this
+        * does not hold in practice, the exhausted state happens where's still
+        * some positive delta. So we apply some guesswork and compare the
+        * delta to a 4M threshold.  (Practically observed delta was ~2M.)
+        *
+        * We probably cannot calculate the exact threshold value because this
+        * depends on the internal reservations requested by various
+        * operations, so some operations that consume a few metadata will
+        * succeed even if the Avail is zero. But this is better than the other
+        * way around.
+        */
+       thresh = 4 * 1024 * 1024;
+       if (total_free_meta - thresh < block_rsv->size)
+               buf->f_bavail = 0;
        buf->f_type = BTRFS_SUPER_MAGIC;
        buf->f_bsize = dentry->d_sb->s_blocksize;
        buf->f_namelen = BTRFS_NAME_LEN;
@@@ -2223,6 -2284,9 +2282,9 @@@ static int btrfs_run_sanity_tests(void
        if (ret)
                goto out;
        ret = btrfs_test_qgroups();
+       if (ret)
+               goto out;
+       ret = btrfs_test_free_space_tree();
  out:
        btrfs_destroy_test_fs();
        return ret;
diff --combined fs/btrfs/xattr.c
index 7cbef1a14fe1b13bc3af4c63f47efb3a6f83dbad,608552ed89c078fc953679d84b0016a9a1db77af..fd953c361a43c7c7f0faf3e100df12b052c06552
@@@ -283,7 -283,7 +283,7 @@@ ssize_t btrfs_listxattr(struct dentry *
        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
-       path->reada = 2;
+       path->reada = READA_FORWARD;
  
        /* search for our xattrs */
        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
@@@ -351,89 -351,137 +351,89 @@@ err
        return ret;
  }
  
 -/*
 - * List of handlers for synthetic system.* attributes.  All real ondisk
 - * attributes are handled directly.
 - */
 -const struct xattr_handler *btrfs_xattr_handlers[] = {
 -#ifdef CONFIG_BTRFS_FS_POSIX_ACL
 -      &posix_acl_access_xattr_handler,
 -      &posix_acl_default_xattr_handler,
 -#endif
 -      NULL,
 -};
 -
 -/*
 - * Check if the attribute is in a supported namespace.
 - *
 - * This is applied after the check for the synthetic attributes in the system
 - * namespace.
 - */
 -static int btrfs_is_valid_xattr(const char *name)
 +static int btrfs_xattr_handler_get(const struct xattr_handler *handler,
 +                                 struct dentry *dentry, const char *name,
 +                                 void *buffer, size_t size)
  {
 -      int len = strlen(name);
 -      int prefixlen = 0;
 -
 -      if (!strncmp(name, XATTR_SECURITY_PREFIX,
 -                      XATTR_SECURITY_PREFIX_LEN))
 -              prefixlen = XATTR_SECURITY_PREFIX_LEN;
 -      else if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
 -              prefixlen = XATTR_SYSTEM_PREFIX_LEN;
 -      else if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN))
 -              prefixlen = XATTR_TRUSTED_PREFIX_LEN;
 -      else if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN))
 -              prefixlen = XATTR_USER_PREFIX_LEN;
 -      else if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN))
 -              prefixlen = XATTR_BTRFS_PREFIX_LEN;
 -      else
 -              return -EOPNOTSUPP;
 -
 -      /*
 -       * The name cannot consist of just prefix
 -       */
 -      if (len <= prefixlen)
 -              return -EINVAL;
 +      struct inode *inode = d_inode(dentry);
  
 -      return 0;
 +      name = xattr_full_name(handler, name);
 +      return __btrfs_getxattr(inode, name, buffer, size);
  }
  
 -ssize_t btrfs_getxattr(struct dentry *dentry, const char *name,
 -                     void *buffer, size_t size)
 +static int btrfs_xattr_handler_set(const struct xattr_handler *handler,
 +                                 struct dentry *dentry, const char *name,
 +                                 const void *buffer, size_t size,
 +                                 int flags)
  {
 -      int ret;
 +      struct inode *inode = d_inode(dentry);
  
 -      /*
 -       * If this is a request for a synthetic attribute in the system.*
 -       * namespace use the generic infrastructure to resolve a handler
 -       * for it via sb->s_xattr.
 -       */
 -      if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
 -              return generic_getxattr(dentry, name, buffer, size);
 +      name = xattr_full_name(handler, name);
 +      return __btrfs_setxattr(NULL, inode, name, buffer, size, flags);
 +}
  
 -      ret = btrfs_is_valid_xattr(name);
 -      if (ret)
 -              return ret;
 -      return __btrfs_getxattr(d_inode(dentry), name, buffer, size);
 +static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler,
 +                                      struct dentry *dentry,
 +                                      const char *name, const void *value,
 +                                      size_t size, int flags)
 +{
 +      name = xattr_full_name(handler, name);
 +      return btrfs_set_prop(d_inode(dentry), name, value, size, flags);
  }
  
 +static const struct xattr_handler btrfs_security_xattr_handler = {
 +      .prefix = XATTR_SECURITY_PREFIX,
 +      .get = btrfs_xattr_handler_get,
 +      .set = btrfs_xattr_handler_set,
 +};
 +
 +static const struct xattr_handler btrfs_trusted_xattr_handler = {
 +      .prefix = XATTR_TRUSTED_PREFIX,
 +      .get = btrfs_xattr_handler_get,
 +      .set = btrfs_xattr_handler_set,
 +};
 +
 +static const struct xattr_handler btrfs_user_xattr_handler = {
 +      .prefix = XATTR_USER_PREFIX,
 +      .get = btrfs_xattr_handler_get,
 +      .set = btrfs_xattr_handler_set,
 +};
 +
 +static const struct xattr_handler btrfs_btrfs_xattr_handler = {
 +      .prefix = XATTR_BTRFS_PREFIX,
 +      .get = btrfs_xattr_handler_get,
 +      .set = btrfs_xattr_handler_set_prop,
 +};
 +
 +const struct xattr_handler *btrfs_xattr_handlers[] = {
 +      &btrfs_security_xattr_handler,
 +#ifdef CONFIG_BTRFS_FS_POSIX_ACL
 +      &posix_acl_access_xattr_handler,
 +      &posix_acl_default_xattr_handler,
 +#endif
 +      &btrfs_trusted_xattr_handler,
 +      &btrfs_user_xattr_handler,
 +      &btrfs_btrfs_xattr_handler,
 +      NULL,
 +};
 +
  int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value,
                   size_t size, int flags)
  {
        struct btrfs_root *root = BTRFS_I(d_inode(dentry))->root;
 -      int ret;
  
 -      /*
 -       * The permission on security.* and system.* is not checked
 -       * in permission().
 -       */
        if (btrfs_root_readonly(root))
                return -EROFS;
 -
 -      /*
 -       * If this is a request for a synthetic attribute in the system.*
 -       * namespace use the generic infrastructure to resolve a handler
 -       * for it via sb->s_xattr.
 -       */
 -      if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
 -              return generic_setxattr(dentry, name, value, size, flags);
 -
 -      ret = btrfs_is_valid_xattr(name);
 -      if (ret)
 -              return ret;
 -
 -      if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN))
 -              return btrfs_set_prop(d_inode(dentry), name,
 -                                    value, size, flags);
 -
 -      if (size == 0)
 -              value = "";  /* empty EA, do not remove */
 -
 -      return __btrfs_setxattr(NULL, d_inode(dentry), name, value, size,
 -                              flags);
 +      return generic_setxattr(dentry, name, value, size, flags);
  }
  
  int btrfs_removexattr(struct dentry *dentry, const char *name)
  {
        struct btrfs_root *root = BTRFS_I(d_inode(dentry))->root;
 -      int ret;
  
 -      /*
 -       * The permission on security.* and system.* is not checked
 -       * in permission().
 -       */
        if (btrfs_root_readonly(root))
                return -EROFS;
 -
 -      /*
 -       * If this is a request for a synthetic attribute in the system.*
 -       * namespace use the generic infrastructure to resolve a handler
 -       * for it via sb->s_xattr.
 -       */
 -      if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
 -              return generic_removexattr(dentry, name);
 -
 -      ret = btrfs_is_valid_xattr(name);
 -      if (ret)
 -              return ret;
 -
 -      if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN))
 -              return btrfs_set_prop(d_inode(dentry), name,
 -                                    NULL, 0, XATTR_REPLACE);
 -
 -      return __btrfs_setxattr(NULL, d_inode(dentry), name, NULL, 0,
 -                              XATTR_REPLACE);
 +      return generic_removexattr(dentry, name);
  }
  
  static int btrfs_initxattrs(struct inode *inode,
  
        for (xattr = xattr_array; xattr->name != NULL; xattr++) {
                name = kmalloc(XATTR_SECURITY_PREFIX_LEN +
-                              strlen(xattr->name) + 1, GFP_NOFS);
+                              strlen(xattr->name) + 1, GFP_KERNEL);
                if (!name) {
                        err = -ENOMEM;
                        break;