switch (type) {
case ACL_TYPE_ACCESS:
- name = POSIX_ACL_XATTR_ACCESS;
+ name = XATTR_NAME_POSIX_ACL_ACCESS;
break;
case ACL_TYPE_DEFAULT:
- name = POSIX_ACL_XATTR_DEFAULT;
+ name = XATTR_NAME_POSIX_ACL_DEFAULT;
break;
default:
BUG();
size = __btrfs_getxattr(inode, name, "", 0);
if (size > 0) {
- value = kzalloc(size, GFP_NOFS);
+ value = kzalloc(size, GFP_KERNEL);
if (!value)
return ERR_PTR(-ENOMEM);
size = __btrfs_getxattr(inode, name, value, size);
switch (type) {
case ACL_TYPE_ACCESS:
- name = POSIX_ACL_XATTR_ACCESS;
+ name = XATTR_NAME_POSIX_ACL_ACCESS;
if (acl) {
ret = posix_acl_equiv_mode(acl, &inode->i_mode);
if (ret < 0)
case ACL_TYPE_DEFAULT:
if (!S_ISDIR(inode->i_mode))
return acl ? -EINVAL : 0;
- name = POSIX_ACL_XATTR_DEFAULT;
+ name = XATTR_NAME_POSIX_ACL_DEFAULT;
break;
default:
return -EINVAL;
if (acl) {
size = posix_acl_xattr_size(acl->a_count);
- value = kmalloc(size, GFP_NOFS);
+ value = kmalloc(size, GFP_KERNEL);
if (!value) {
ret = -ENOMEM;
goto out;
#include <linux/btrfs.h>
#include <linux/workqueue.h>
#include <linux/security.h>
+ #include <linux/sizes.h>
#include "extent_io.h"
#include "extent_map.h"
#include "async-thread.h"
/* for storing items that use the BTRFS_UUID_KEY* types */
#define BTRFS_UUID_TREE_OBJECTID 9ULL
+ /* tracks free space in block groups. */
+ #define BTRFS_FREE_SPACE_TREE_OBJECTID 10ULL
+
/* for storing balance parameters in the root tree */
#define BTRFS_BALANCE_OBJECTID -4ULL
/* csum types */
#define BTRFS_CSUM_TYPE_CRC32 0
- static int btrfs_csum_sizes[] = { 4 };
+ static const int btrfs_csum_sizes[] = { 4 };
/* four bytes for CRC32 */
#define BTRFS_EMPTY_DIR_SIZE 0
/* ioprio of readahead is set to idle */
#define BTRFS_IOPRIO_READA (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0))
- #define BTRFS_DIRTY_METADATA_THRESH (32 * 1024 * 1024)
+ #define BTRFS_DIRTY_METADATA_THRESH SZ_32M
- #define BTRFS_MAX_EXTENT_SIZE (128 * 1024 * 1024)
+ #define BTRFS_MAX_EXTENT_SIZE SZ_128M
/*
* The key defines the order in the tree, and so it also defines (optimal)
* Compat flags that we support. If any incompat flags are set other than the
* ones specified below then we will fail to mount
*/
+ #define BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE (1ULL << 0)
+
#define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF (1ULL << 0)
#define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL (1ULL << 1)
#define BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS (1ULL << 2)
#define BTRFS_FEATURE_COMPAT_SUPP 0ULL
#define BTRFS_FEATURE_COMPAT_SAFE_SET 0ULL
#define BTRFS_FEATURE_COMPAT_SAFE_CLEAR 0ULL
- #define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL
+
+ #define BTRFS_FEATURE_COMPAT_RO_SUPP \
+ (BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE)
+
#define BTRFS_FEATURE_COMPAT_RO_SAFE_SET 0ULL
#define BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR 0ULL
* The slots array records the index of the item or block pointer
* used while walking the tree.
*/
+ enum { READA_NONE = 0, READA_BACK, READA_FORWARD };
struct btrfs_path {
struct extent_buffer *nodes[BTRFS_MAX_LEVEL];
int slots[BTRFS_MAX_LEVEL];
/* if there is real range locking, this locks field will change */
- int locks[BTRFS_MAX_LEVEL];
- int reada;
+ u8 locks[BTRFS_MAX_LEVEL];
+ u8 reada;
/* keep some upper locks as we walk down */
- int lowest_level;
+ u8 lowest_level;
/*
* set by btrfs_split_item, tells search_slot to keep all locks
__le64 flags;
} __attribute__ ((__packed__));
+ struct btrfs_free_space_info {
+ __le32 extent_count;
+ __le32 flags;
+ } __attribute__ ((__packed__));
+
+ #define BTRFS_FREE_SPACE_USING_BITMAPS (1ULL << 0)
+
#define BTRFS_QGROUP_LEVEL_SHIFT 48
static inline u64 btrfs_qgroup_level(u64 qgroupid)
{
atomic_t count;
};
+ /* Once caching_thread() finds this much free space, it will wake up waiters. */
+ #define CACHING_CTL_WAKE_UP (1024 * 1024 * 2)
+
struct btrfs_io_ctl {
void *cur, *orig;
struct page *page;
u64 delalloc_bytes;
u64 bytes_super;
u64 flags;
- u64 sectorsize;
u64 cache_generation;
+ u32 sectorsize;
+
+ /*
+ * If the free space extent count exceeds this number, convert the block
+ * group to bitmaps.
+ */
+ u32 bitmap_high_thresh;
+
+ /*
+ * If the free space extent count drops below this number, convert the
+ * block group back to extents.
+ */
+ u32 bitmap_low_thresh;
/*
* It is just used for the delayed data space allocation because
struct list_head io_list;
struct btrfs_io_ctl io_ctl;
+
+ /* Lock for free space tree operations. */
+ struct mutex free_space_lock;
+
+ /*
+ * Does the block group need to be added to the free space tree?
+ * Protected by free_space_lock.
+ */
+ int needs_free_space;
};
/* delayed seq elem */
struct btrfs_root *csum_root;
struct btrfs_root *quota_root;
struct btrfs_root *uuid_root;
+ struct btrfs_root *free_space_root;
/* the log root tree is a directory of all the other log roots */
struct btrfs_root *log_root_tree;
* and will be latter freed. Protected by fs_info->chunk_mutex.
*/
struct list_head pinned_chunks;
+
+ int creating_free_space_tree;
};
struct btrfs_subvolume_writers {
*/
#define BTRFS_BLOCK_GROUP_ITEM_KEY 192
+ /*
+ * Every block group is represented in the free space tree by a free space info
+ * item, which stores some accounting information. It is keyed on
+ * (block_group_start, FREE_SPACE_INFO, block_group_length).
+ */
+ #define BTRFS_FREE_SPACE_INFO_KEY 198
+
+ /*
+ * A free space extent tracks an extent of space that is free in a block group.
+ * It is keyed on (start, FREE_SPACE_EXTENT, length).
+ */
+ #define BTRFS_FREE_SPACE_EXTENT_KEY 199
+
+ /*
+ * When a block group becomes very fragmented, we convert it to use bitmaps
+ * instead of extents. A free space bitmap is keyed on
+ * (start, FREE_SPACE_BITMAP, length); the corresponding item is a bitmap with
+ * (length / sectorsize) bits.
+ */
+ #define BTRFS_FREE_SPACE_BITMAP_KEY 200
+
#define BTRFS_DEV_EXTENT_KEY 204
#define BTRFS_DEV_ITEM_KEY 216
#define BTRFS_CHUNK_ITEM_KEY 228
#define BTRFS_MOUNT_RESCAN_UUID_TREE (1 << 23)
#define BTRFS_MOUNT_FRAGMENT_DATA (1 << 24)
#define BTRFS_MOUNT_FRAGMENT_METADATA (1 << 25)
+ #define BTRFS_MOUNT_FREE_SPACE_TREE (1 << 26)
#define BTRFS_DEFAULT_COMMIT_INTERVAL (30)
#define BTRFS_DEFAULT_MAX_INLINE (8192)
BTRFS_SETGET_STACK_FUNCS(block_group_flags,
struct btrfs_block_group_item, flags, 64);
+ /* struct btrfs_free_space_info */
+ BTRFS_SETGET_FUNCS(free_space_extent_count, struct btrfs_free_space_info,
+ extent_count, 32);
+ BTRFS_SETGET_FUNCS(free_space_flags, struct btrfs_free_space_info, flags, 32);
+
/* struct btrfs_inode_ref */
BTRFS_SETGET_FUNCS(inode_ref_name_len, struct btrfs_inode_ref, name_len, 16);
BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64);
void check_system_chunk(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
const u64 type);
+ u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
+ struct btrfs_fs_info *info, u64 start, u64 end);
+
/* ctree.c */
int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
int level, int *slot);
kfree(fs_info->csum_root);
kfree(fs_info->quota_root);
kfree(fs_info->uuid_root);
+ kfree(fs_info->free_space_root);
kfree(fs_info->super_copy);
kfree(fs_info->super_for_commit);
security_free_mnt_opts(&fs_info->security_opts);
/* inode.c */
struct btrfs_delalloc_work {
struct inode *inode;
- int wait;
int delay_iput;
struct completion completion;
struct list_head list;
};
struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
- int wait, int delay_iput);
+ int delay_iput);
void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work);
struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page,
struct btrfs_ioctl_space_info *space);
void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
struct btrfs_ioctl_balance_args *bargs);
-
+ssize_t btrfs_dedupe_file_range(struct file *src_file, u64 loff, u64 olen,
+ struct file *dst_file, u64 dst_loff);
/* file.c */
int btrfs_auto_defrag_init(void);
loff_t pos, size_t write_bytes,
struct extent_state **cached);
int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end);
+ssize_t btrfs_copy_file_range(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ size_t len, unsigned int flags);
+int btrfs_clone_file_range(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out, u64 len);
/* tree-defrag.c */
int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
}
}
+ #define btrfs_clear_fs_incompat(__fs_info, opt) \
+ __btrfs_clear_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt)
+
+ static inline void __btrfs_clear_fs_incompat(struct btrfs_fs_info *fs_info,
+ u64 flag)
+ {
+ struct btrfs_super_block *disk_super;
+ u64 features;
+
+ disk_super = fs_info->super_copy;
+ features = btrfs_super_incompat_flags(disk_super);
+ if (features & flag) {
+ spin_lock(&fs_info->super_lock);
+ features = btrfs_super_incompat_flags(disk_super);
+ if (features & flag) {
+ features &= ~flag;
+ btrfs_set_super_incompat_flags(disk_super, features);
+ btrfs_info(fs_info, "clearing %llu feature flag",
+ flag);
+ }
+ spin_unlock(&fs_info->super_lock);
+ }
+ }
+
#define btrfs_fs_incompat(fs_info, opt) \
__btrfs_fs_incompat((fs_info), BTRFS_FEATURE_INCOMPAT_##opt)
- static inline int __btrfs_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag)
+ static inline bool __btrfs_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag)
{
struct btrfs_super_block *disk_super;
disk_super = fs_info->super_copy;
return !!(btrfs_super_incompat_flags(disk_super) & flag);
}
+ #define btrfs_set_fs_compat_ro(__fs_info, opt) \
+ __btrfs_set_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt)
+
+ static inline void __btrfs_set_fs_compat_ro(struct btrfs_fs_info *fs_info,
+ u64 flag)
+ {
+ struct btrfs_super_block *disk_super;
+ u64 features;
+
+ disk_super = fs_info->super_copy;
+ features = btrfs_super_compat_ro_flags(disk_super);
+ if (!(features & flag)) {
+ spin_lock(&fs_info->super_lock);
+ features = btrfs_super_compat_ro_flags(disk_super);
+ if (!(features & flag)) {
+ features |= flag;
+ btrfs_set_super_compat_ro_flags(disk_super, features);
+ btrfs_info(fs_info, "setting %llu ro feature flag",
+ flag);
+ }
+ spin_unlock(&fs_info->super_lock);
+ }
+ }
+
+ #define btrfs_clear_fs_compat_ro(__fs_info, opt) \
+ __btrfs_clear_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt)
+
+ static inline void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info,
+ u64 flag)
+ {
+ struct btrfs_super_block *disk_super;
+ u64 features;
+
+ disk_super = fs_info->super_copy;
+ features = btrfs_super_compat_ro_flags(disk_super);
+ if (features & flag) {
+ spin_lock(&fs_info->super_lock);
+ features = btrfs_super_compat_ro_flags(disk_super);
+ if (features & flag) {
+ features &= ~flag;
+ btrfs_set_super_compat_ro_flags(disk_super, features);
+ btrfs_info(fs_info, "clearing %llu ro feature flag",
+ flag);
+ }
+ spin_unlock(&fs_info->super_lock);
+ }
+ }
+
+ #define btrfs_fs_compat_ro(fs_info, opt) \
+ __btrfs_fs_compat_ro((fs_info), BTRFS_FEATURE_COMPAT_RO_##opt)
+
+ static inline int __btrfs_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag)
+ {
+ struct btrfs_super_block *disk_super;
+ disk_super = fs_info->super_copy;
+ return !!(btrfs_super_compat_ro_flags(disk_super) & flag);
+ }
+
/*
* Call btrfs_abort_transaction as early as possible when an error condition is
* detected, that way the exact line number is reported.
#include "locking.h"
#include "tree-log.h"
#include "free-space-cache.h"
+ #include "free-space-tree.h"
#include "inode-map.h"
#include "check-integrity.h"
#include "rcu-string.h"
}
lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1,
- 0, &cached_state);
+ &cached_state);
if (extent_buffer_uptodate(eb) &&
btrfs_header_generation(eb) == parent_transid) {
ret = 0;
if (bio_flags & EXTENT_BIO_TREE_LOG)
return 0;
#ifdef CONFIG_X86
- if (cpu_has_xmm4_2)
+ if (static_cpu_has_safe(X86_FEATURE_XMM4_2))
return 0;
#endif
return 1;
if (location->objectid == BTRFS_UUID_TREE_OBJECTID)
return fs_info->uuid_root ? fs_info->uuid_root :
ERR_PTR(-ENOENT);
+ if (location->objectid == BTRFS_FREE_SPACE_TREE_OBJECTID)
+ return fs_info->free_space_root ? fs_info->free_space_root :
+ ERR_PTR(-ENOENT);
again:
root = btrfs_lookup_fs_root(fs_info, location->objectid);
if (root) {
free_root_extent_buffers(info->uuid_root);
if (chunk_root)
free_root_extent_buffers(info->chunk_root);
+ free_root_extent_buffers(info->free_space_root);
}
void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info)
fs_info->uuid_root = root;
}
+ if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
+ location.objectid = BTRFS_FREE_SPACE_TREE_OBJECTID;
+ root = btrfs_read_tree_root(tree_root, &location);
+ if (IS_ERR(root))
+ return PTR_ERR(root);
+ set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+ fs_info->free_space_root = root;
+ }
+
return 0;
}
if (btrfs_check_super_csum(bh->b_data)) {
printk(KERN_ERR "BTRFS: superblock checksum mismatch\n");
err = -EINVAL;
+ brelse(bh);
goto fail_alloc;
}
fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
- 4 * 1024 * 1024 / PAGE_CACHE_SIZE);
+ SZ_4M / PAGE_CACHE_SIZE);
tree_root->nodesize = nodesize;
tree_root->sectorsize = sectorsize;
if (sb->s_flags & MS_RDONLY)
return 0;
+ if (btrfs_test_opt(tree_root, FREE_SPACE_TREE) &&
+ !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
+ pr_info("BTRFS: creating free space tree\n");
+ ret = btrfs_create_free_space_tree(fs_info);
+ if (ret) {
+ pr_warn("BTRFS: failed to create free space tree %d\n",
+ ret);
+ close_ctree(tree_root);
+ return ret;
+ }
+ }
+
down_read(&fs_info->cleanup_work_sem);
if ((ret = btrfs_orphan_cleanup(fs_info->fs_root)) ||
(ret = btrfs_orphan_cleanup(fs_info->tree_root))) {
btrfs_qgroup_rescan_resume(fs_info);
+ if (btrfs_test_opt(tree_root, CLEAR_CACHE) &&
+ btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
+ pr_info("BTRFS: clearing free space tree\n");
+ ret = btrfs_clear_free_space_tree(fs_info);
+ if (ret) {
+ pr_warn("BTRFS: failed to clear free space tree %d\n",
+ ret);
+ close_ctree(tree_root);
+ return ret;
+ }
+ }
+
if (!fs_info->uuid_root) {
pr_info("BTRFS: creating UUID tree\n");
ret = btrfs_create_uuid_tree(fs_info);
return !ret;
}
- int btrfs_set_buffer_uptodate(struct extent_buffer *buf)
- {
- return set_extent_buffer_uptodate(buf);
- }
-
void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
{
struct btrfs_root *root;
balance_dirty_pages_ratelimited(
root->fs_info->btree_inode->i_mapping);
}
- return;
}
void btrfs_btree_balance_dirty(struct btrfs_root *root)
if (start_pos < inode->i_size) {
struct btrfs_ordered_extent *ordered;
lock_extent_bits(&BTRFS_I(inode)->io_tree,
- start_pos, last_pos, 0, cached_state);
+ start_pos, last_pos, cached_state);
ordered = btrfs_lookup_ordered_range(inode, start_pos,
last_pos - start_pos + 1);
if (ordered &&
truncate_pagecache_range(inode, lockstart, lockend);
lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
- 0, &cached_state);
+ &cached_state);
ordered = btrfs_lookup_first_ordered_extent(inode, lockend);
/*
* transaction
*/
lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
- locked_end, 0, &cached_state);
+ locked_end, &cached_state);
ordered = btrfs_lookup_first_ordered_extent(inode,
alloc_end - 1);
if (ordered &&
lockend--;
len = lockend - lockstart + 1;
- lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, 0,
+ lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
&cached_state);
while (start < inode->i_size) {
#ifdef CONFIG_COMPAT
.compat_ioctl = btrfs_ioctl,
#endif
+ .copy_file_range = btrfs_copy_file_range,
+ .clone_file_range = btrfs_clone_file_range,
+ .dedupe_file_range = btrfs_dedupe_file_range,
};
void btrfs_auto_defrag_exit(void)
struct btrfs_root *root;
};
+ struct btrfs_dio_data {
+ u64 outstanding_extents;
+ u64 reserve;
+ u64 unsubmitted_oe_range_start;
+ u64 unsubmitted_oe_range_end;
+ };
+
static const struct inode_operations btrfs_dir_inode_operations;
static const struct inode_operations btrfs_symlink_inode_operations;
static const struct inode_operations btrfs_dir_ro_inode_operations;
static const struct address_space_operations btrfs_aops;
static const struct address_space_operations btrfs_symlink_aops;
static const struct file_operations btrfs_dir_file_operations;
- static struct extent_io_ops btrfs_extent_io_ops;
+ static const struct extent_io_ops btrfs_extent_io_ops;
static struct kmem_cache *btrfs_inode_cachep;
- static struct kmem_cache *btrfs_delalloc_work_cachep;
struct kmem_cache *btrfs_trans_handle_cachep;
struct kmem_cache *btrfs_transaction_cachep;
struct kmem_cache *btrfs_path_cachep;
struct kmem_cache *btrfs_free_space_cachep;
#define S_SHIFT 12
- static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
+ static const unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
[S_IFREG >> S_SHIFT] = BTRFS_FT_REG_FILE,
[S_IFDIR >> S_SHIFT] = BTRFS_FT_DIR,
[S_IFCHR >> S_SHIFT] = BTRFS_FT_CHRDEV,
unsigned long nr_pages_ret = 0;
unsigned long total_compressed = 0;
unsigned long total_in = 0;
- unsigned long max_compressed = 128 * 1024;
- unsigned long max_uncompressed = 128 * 1024;
+ unsigned long max_compressed = SZ_128K;
+ unsigned long max_uncompressed = SZ_128K;
int i;
int will_compress;
int compress_type = root->fs_info->compress_type;
int redirty = 0;
/* if this is a small write inside eof, kick off a defrag */
- if ((end - start + 1) < 16 * 1024 &&
+ if ((end - start + 1) < SZ_16K &&
(start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
btrfs_add_inode_defrag(NULL, inode);
again:
will_compress = 0;
nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1;
- nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE);
+ nr_pages = min_t(unsigned long, nr_pages, SZ_128K / PAGE_CACHE_SIZE);
/*
* we don't want to send crud past the end of i_size through
disk_num_bytes = num_bytes;
/* if this is a small write inside eof, kick off defrag */
- if (num_bytes < 64 * 1024 &&
+ if (num_bytes < SZ_64K &&
(start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
btrfs_add_inode_defrag(NULL, inode);
* atomic_sub_return implies a barrier for waitqueue_active
*/
if (atomic_sub_return(nr_pages, &root->fs_info->async_delalloc_pages) <
- 5 * 1024 * 1024 &&
+ 5 * SZ_1M &&
waitqueue_active(&root->fs_info->async_submit_wait))
wake_up(&root->fs_info->async_submit_wait);
struct btrfs_root *root = BTRFS_I(inode)->root;
unsigned long nr_pages;
u64 cur_end;
- int limit = 10 * 1024 * 1024;
+ int limit = 10 * SZ_1M;
clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED,
1, 0, NULL, GFP_NOFS);
!btrfs_test_opt(root, FORCE_COMPRESS))
cur_end = end;
else
- cur_end = min(end, start + 512 * 1024 - 1);
+ cur_end = min(end, start + SZ_512K - 1);
async_cow->end = cur_end;
INIT_LIST_HEAD(&async_cow->extents);
page_start = page_offset(page);
page_end = page_offset(page) + PAGE_CACHE_SIZE - 1;
- lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, 0,
+ lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end,
&cached_state);
/* already ordered? We're done */
lock_start = backref->file_pos;
lock_end = backref->file_pos + backref->num_bytes - 1;
lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
- 0, &cached);
+ &cached);
ordered = btrfs_lookup_first_ordered_extent(inode, lock_end);
if (ordered) {
lock_extent_bits(io_tree, ordered_extent->file_offset,
ordered_extent->file_offset + ordered_extent->len - 1,
- 0, &cached_state);
+ &cached_state);
ret = test_range_bit(io_tree, ordered_extent->file_offset,
ordered_extent->file_offset + ordered_extent->len - 1,
start, (size_t)(end - start + 1));
}
- struct delayed_iput {
- struct list_head list;
- struct inode *inode;
- };
-
- /* JDM: If this is fs-wide, why can't we add a pointer to
- * btrfs_inode instead and avoid the allocation? */
void btrfs_add_delayed_iput(struct inode *inode)
{
struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
- struct delayed_iput *delayed;
+ struct btrfs_inode *binode = BTRFS_I(inode);
if (atomic_add_unless(&inode->i_count, -1, 1))
return;
- delayed = kmalloc(sizeof(*delayed), GFP_NOFS | __GFP_NOFAIL);
- delayed->inode = inode;
-
spin_lock(&fs_info->delayed_iput_lock);
- list_add_tail(&delayed->list, &fs_info->delayed_iputs);
+ if (binode->delayed_iput_count == 0) {
+ ASSERT(list_empty(&binode->delayed_iput));
+ list_add_tail(&binode->delayed_iput, &fs_info->delayed_iputs);
+ } else {
+ binode->delayed_iput_count++;
+ }
spin_unlock(&fs_info->delayed_iput_lock);
}
void btrfs_run_delayed_iputs(struct btrfs_root *root)
{
- LIST_HEAD(list);
struct btrfs_fs_info *fs_info = root->fs_info;
- struct delayed_iput *delayed;
- int empty;
-
- spin_lock(&fs_info->delayed_iput_lock);
- empty = list_empty(&fs_info->delayed_iputs);
- spin_unlock(&fs_info->delayed_iput_lock);
- if (empty)
- return;
down_read(&fs_info->delayed_iput_sem);
-
spin_lock(&fs_info->delayed_iput_lock);
- list_splice_init(&fs_info->delayed_iputs, &list);
- spin_unlock(&fs_info->delayed_iput_lock);
-
- while (!list_empty(&list)) {
- delayed = list_entry(list.next, struct delayed_iput, list);
- list_del(&delayed->list);
- iput(delayed->inode);
- kfree(delayed);
+ while (!list_empty(&fs_info->delayed_iputs)) {
+ struct btrfs_inode *inode;
+
+ inode = list_first_entry(&fs_info->delayed_iputs,
+ struct btrfs_inode, delayed_iput);
+ if (inode->delayed_iput_count) {
+ inode->delayed_iput_count--;
+ list_move_tail(&inode->delayed_iput,
+ &fs_info->delayed_iputs);
+ } else {
+ list_del_init(&inode->delayed_iput);
+ }
+ spin_unlock(&fs_info->delayed_iput_lock);
+ iput(&inode->vfs_inode);
+ spin_lock(&fs_info->delayed_iput_lock);
}
-
+ spin_unlock(&fs_info->delayed_iput_lock);
up_read(&root->fs_info->delayed_iput_sem);
}
ret = -ENOMEM;
goto out;
}
- path->reada = -1;
+ path->reada = READA_BACK;
key.objectid = BTRFS_ORPHAN_OBJECTID;
key.type = BTRFS_ORPHAN_ITEM_KEY;
int scanned = 0;
if (!xattr_access) {
- xattr_access = btrfs_name_hash(POSIX_ACL_XATTR_ACCESS,
- strlen(POSIX_ACL_XATTR_ACCESS));
- xattr_default = btrfs_name_hash(POSIX_ACL_XATTR_DEFAULT,
- strlen(POSIX_ACL_XATTR_DEFAULT));
+ xattr_access = btrfs_name_hash(XATTR_NAME_POSIX_ACL_ACCESS,
+ strlen(XATTR_NAME_POSIX_ACL_ACCESS));
+ xattr_default = btrfs_name_hash(XATTR_NAME_POSIX_ACL_DEFAULT,
+ strlen(XATTR_NAME_POSIX_ACL_DEFAULT));
}
slot++;
break;
case S_IFLNK:
inode->i_op = &btrfs_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_mapping->a_ops = &btrfs_symlink_aops;
break;
default:
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- path->reada = -1;
+ path->reada = READA_BACK;
/*
* We want to drop from the next block forward in case this new size is
* up a huge file in a single leaf. Most of the time that
* bytes_deleted is > 0, it will be huge by the time we get here
*/
- if (be_nice && bytes_deleted > 32 * 1024 * 1024) {
+ if (be_nice && bytes_deleted > SZ_32M) {
if (btrfs_should_end_transaction(trans, root)) {
err = -EAGAIN;
goto error;
btrfs_free_path(path);
- if (be_nice && bytes_deleted > 32 * 1024 * 1024) {
+ if (be_nice && bytes_deleted > SZ_32M) {
unsigned long updates = trans->delayed_ref_updates;
if (updates) {
trans->delayed_ref_updates = 0;
}
wait_on_page_writeback(page);
- lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state);
+ lock_extent_bits(io_tree, page_start, page_end, &cached_state);
set_page_extent_mapped(page);
ordered = btrfs_lookup_ordered_extent(inode, page_start);
while (1) {
struct btrfs_ordered_extent *ordered;
- lock_extent_bits(io_tree, hole_start, block_end - 1, 0,
+ lock_extent_bits(io_tree, hole_start, block_end - 1,
&cached_state);
ordered = btrfs_lookup_ordered_range(inode, hole_start,
block_end - hole_start);
end = state->end;
spin_unlock(&io_tree->lock);
- lock_extent_bits(io_tree, start, end, 0, &cached_state);
+ lock_extent_bits(io_tree, start, end, &cached_state);
/*
* If still has DELALLOC flag, the extent didn't reach disk,
no_delete:
btrfs_remove_delayed_node(inode);
clear_inode(inode);
- return;
}
/*
if (!path)
return -ENOMEM;
- path->reada = 1;
+ path->reada = READA_FORWARD;
if (key_type == BTRFS_DIR_INDEX_KEY) {
INIT_LIST_HEAD(&ins_list);
static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
struct dentry *dentry)
{
- struct btrfs_trans_handle *trans;
+ struct btrfs_trans_handle *trans = NULL;
struct btrfs_root *root = BTRFS_I(dir)->root;
struct inode *inode = d_inode(old_dentry);
u64 index;
trans = btrfs_start_transaction(root, 5);
if (IS_ERR(trans)) {
err = PTR_ERR(trans);
+ trans = NULL;
goto fail;
}
btrfs_log_new_name(trans, inode, NULL, parent);
}
- btrfs_end_transaction(trans, root);
btrfs_balance_delayed_items(root);
fail:
+ if (trans)
+ btrfs_end_transaction(trans, root);
if (drop_inode) {
inode_dec_link_count(inode);
iput(inode);
}
static noinline int uncompress_inline(struct btrfs_path *path,
- struct inode *inode, struct page *page,
+ struct page *page,
size_t pg_offset, u64 extent_offset,
struct btrfs_file_extent_item *item)
{
* Chances are we'll be called again, so go ahead and do
* readahead
*/
- path->reada = 1;
+ path->reada = READA_FORWARD;
}
ret = btrfs_lookup_file_extent(trans, root, path,
if (create == 0 && !PageUptodate(page)) {
if (btrfs_file_extent_compression(leaf, item) !=
BTRFS_COMPRESS_NONE) {
- ret = uncompress_inline(path, inode, page,
- pg_offset,
+ ret = uncompress_inline(path, page, pg_offset,
extent_offset, item);
if (ret) {
err = ret;
while (1) {
lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
- 0, cached_state);
+ cached_state);
/*
* We're concerned with the entire range that we're going to be
* doing DIO to, so we need to make sure theres no ordered
btrfs_start_ordered_extent(inode, ordered, 1);
btrfs_put_ordered_extent(ordered);
} else {
- /* Screw you mmap */
- ret = btrfs_fdatawrite_range(inode, lockstart, lockend);
- if (ret)
- break;
- ret = filemap_fdatawait_range(inode->i_mapping,
- lockstart,
- lockend);
- if (ret)
- break;
-
/*
- * If we found a page that couldn't be invalidated just
- * fall back to buffered.
+ * We could trigger writeback for this range (and wait
+ * for it to complete) and then invalidate the pages for
+ * this range (through invalidate_inode_pages2_range()),
+ * but that can lead us to a deadlock with a concurrent
+ * call to readpages() (a buffered read or a defrag call
+ * triggered a readahead) on a page lock due to an
+ * ordered dio extent we created before but did not have
+ * yet a corresponding bio submitted (whence it can not
+ * complete), which makes readpages() wait for that
+ * ordered extent to complete while holding a lock on
+ * that page.
*/
- ret = invalidate_inode_pages2_range(inode->i_mapping,
- lockstart >> PAGE_CACHE_SHIFT,
- lockend >> PAGE_CACHE_SHIFT);
- if (ret)
- break;
+ ret = -ENOTBLK;
+ break;
}
cond_resched();
return em;
}
- struct btrfs_dio_data {
- u64 outstanding_extents;
- u64 reserve;
- };
-
static void adjust_dio_outstanding_extents(struct inode *inode,
struct btrfs_dio_data *dio_data,
const u64 len)
btrfs_free_reserved_data_space(inode, start, len);
WARN_ON(dio_data->reserve < len);
dio_data->reserve -= len;
+ dio_data->unsubmitted_oe_range_end = start + len;
current->journal_info = dio_data;
}
bio_put(bio);
}
- static void btrfs_endio_direct_write(struct bio *bio)
+ static void btrfs_endio_direct_write_update_ordered(struct inode *inode,
+ const u64 offset,
+ const u64 bytes,
+ const int uptodate)
{
- struct btrfs_dio_private *dip = bio->bi_private;
- struct inode *inode = dip->inode;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_ordered_extent *ordered = NULL;
- u64 ordered_offset = dip->logical_offset;
- u64 ordered_bytes = dip->bytes;
- struct bio *dio_bio;
+ u64 ordered_offset = offset;
+ u64 ordered_bytes = bytes;
int ret;
again:
ret = btrfs_dec_test_first_ordered_pending(inode, &ordered,
&ordered_offset,
ordered_bytes,
- !bio->bi_error);
+ uptodate);
if (!ret)
goto out_test;
* our bio might span multiple ordered extents. If we haven't
* completed the accounting for the whole dio, go back and try again
*/
- if (ordered_offset < dip->logical_offset + dip->bytes) {
- ordered_bytes = dip->logical_offset + dip->bytes -
- ordered_offset;
+ if (ordered_offset < offset + bytes) {
+ ordered_bytes = offset + bytes - ordered_offset;
ordered = NULL;
goto again;
}
- dio_bio = dip->dio_bio;
+ }
+
+ static void btrfs_endio_direct_write(struct bio *bio)
+ {
+ struct btrfs_dio_private *dip = bio->bi_private;
+ struct bio *dio_bio = dip->dio_bio;
+
+ btrfs_endio_direct_write_update_ordered(dip->inode,
+ dip->logical_offset,
+ dip->bytes,
+ !bio->bi_error);
kfree(dip);
dip->subio_endio = btrfs_subio_endio_read;
}
+ /*
+ * Reset the range for unsubmitted ordered extents (to a 0 length range)
+ * even if we fail to submit a bio, because in such case we do the
+ * corresponding error handling below and it must not be done a second
+ * time by btrfs_direct_IO().
+ */
+ if (write) {
+ struct btrfs_dio_data *dio_data = current->journal_info;
+
+ dio_data->unsubmitted_oe_range_end = dip->logical_offset +
+ dip->bytes;
+ dio_data->unsubmitted_oe_range_start =
+ dio_data->unsubmitted_oe_range_end;
+ }
+
ret = btrfs_submit_direct_hook(rw, dip, skip_sum);
if (!ret)
return;
dip = NULL;
io_bio = NULL;
} else {
- if (write) {
- struct btrfs_ordered_extent *ordered;
-
- ordered = btrfs_lookup_ordered_extent(inode,
- file_offset);
- set_bit(BTRFS_ORDERED_IOERR, &ordered->flags);
- /*
- * Decrements our ref on the ordered extent and removes
- * the ordered extent from the inode's ordered tree,
- * doing all the proper resource cleanup such as for the
- * reserved space and waking up any waiters for this
- * ordered extent (through btrfs_remove_ordered_extent).
- */
- btrfs_finish_ordered_io(ordered);
- } else {
+ if (write)
+ btrfs_endio_direct_write_update_ordered(inode,
+ file_offset,
+ dio_bio->bi_iter.bi_size,
+ 0);
+ else
unlock_extent(&BTRFS_I(inode)->io_tree, file_offset,
file_offset + dio_bio->bi_iter.bi_size - 1);
- }
+
dio_bio->bi_error = -EIO;
/*
* Releases and cleans up our dio_bio, no need to bio_put()
* originally calculated. Abuse current->journal_info for this.
*/
dio_data.reserve = round_up(count, root->sectorsize);
+ dio_data.unsubmitted_oe_range_start = (u64)offset;
+ dio_data.unsubmitted_oe_range_end = (u64)offset;
current->journal_info = &dio_data;
} else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
&BTRFS_I(inode)->runtime_flags)) {
if (dio_data.reserve)
btrfs_delalloc_release_space(inode, offset,
dio_data.reserve);
+ /*
+ * On error we might have left some ordered extents
+ * without submitting corresponding bios for them, so
+ * cleanup them up to avoid other tasks getting them
+ * and waiting for them to complete forever.
+ */
+ if (dio_data.unsubmitted_oe_range_start <
+ dio_data.unsubmitted_oe_range_end)
+ btrfs_endio_direct_write_update_ordered(inode,
+ dio_data.unsubmitted_oe_range_start,
+ dio_data.unsubmitted_oe_range_end -
+ dio_data.unsubmitted_oe_range_start,
+ 0);
} else if (ret >= 0 && (size_t)ret < count)
btrfs_delalloc_release_space(inode, offset,
count - (size_t)ret);
static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
{
struct extent_io_tree *tree;
-
+ struct inode *inode = page->mapping->host;
+ int ret;
if (current->flags & PF_MEMALLOC) {
redirty_page_for_writepage(wbc, page);
unlock_page(page);
return 0;
}
+
+ /*
+ * If we are under memory pressure we will call this directly from the
+ * VM, we need to make sure we have the inode referenced for the ordered
+ * extent. If not just return like we didn't do anything.
+ */
+ if (!igrab(inode)) {
+ redirty_page_for_writepage(wbc, page);
+ return AOP_WRITEPAGE_ACTIVATE;
+ }
tree = &BTRFS_I(page->mapping->host)->io_tree;
- return extent_write_full_page(tree, page, btrfs_get_extent, wbc);
+ ret = extent_write_full_page(tree, page, btrfs_get_extent, wbc);
+ btrfs_add_delayed_iput(inode);
+ return ret;
}
static int btrfs_writepages(struct address_space *mapping,
}
if (!inode_evicting)
- lock_extent_bits(tree, page_start, page_end, 0, &cached_state);
+ lock_extent_bits(tree, page_start, page_end, &cached_state);
ordered = btrfs_lookup_ordered_extent(inode, page_start);
if (ordered) {
/*
btrfs_put_ordered_extent(ordered);
if (!inode_evicting) {
cached_state = NULL;
- lock_extent_bits(tree, page_start, page_end, 0,
+ lock_extent_bits(tree, page_start, page_end,
&cached_state);
}
}
}
wait_on_page_writeback(page);
- lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state);
+ lock_extent_bits(io_tree, page_start, page_end, &cached_state);
set_page_extent_mapped(page);
/*
ei->dir_index = 0;
ei->last_unlink_trans = 0;
ei->last_log_commit = 0;
+ ei->delayed_iput_count = 0;
spin_lock_init(&ei->lock);
ei->outstanding_extents = 0;
mutex_init(&ei->delalloc_mutex);
btrfs_ordered_inode_tree_init(&ei->ordered_tree);
INIT_LIST_HEAD(&ei->delalloc_inodes);
+ INIT_LIST_HEAD(&ei->delayed_iput);
RB_CLEAR_NODE(&ei->rb_node);
return inode;
kmem_cache_destroy(btrfs_path_cachep);
if (btrfs_free_space_cachep)
kmem_cache_destroy(btrfs_free_space_cachep);
- if (btrfs_delalloc_work_cachep)
- kmem_cache_destroy(btrfs_delalloc_work_cachep);
}
int btrfs_init_cachep(void)
{
btrfs_inode_cachep = kmem_cache_create("btrfs_inode",
sizeof(struct btrfs_inode), 0,
- SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, init_once);
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT,
+ init_once);
if (!btrfs_inode_cachep)
goto fail;
if (!btrfs_free_space_cachep)
goto fail;
- btrfs_delalloc_work_cachep = kmem_cache_create("btrfs_delalloc_work",
- sizeof(struct btrfs_delalloc_work), 0,
- SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
- NULL);
- if (!btrfs_delalloc_work_cachep)
- goto fail;
-
return 0;
fail:
btrfs_destroy_cachep();
delalloc_work = container_of(work, struct btrfs_delalloc_work,
work);
inode = delalloc_work->inode;
- if (delalloc_work->wait) {
- btrfs_wait_ordered_range(inode, 0, (u64)-1);
- } else {
+ filemap_flush(inode->i_mapping);
+ if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+ &BTRFS_I(inode)->runtime_flags))
filemap_flush(inode->i_mapping);
- if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
- &BTRFS_I(inode)->runtime_flags))
- filemap_flush(inode->i_mapping);
- }
if (delalloc_work->delay_iput)
btrfs_add_delayed_iput(inode);
}
struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
- int wait, int delay_iput)
+ int delay_iput)
{
struct btrfs_delalloc_work *work;
- work = kmem_cache_zalloc(btrfs_delalloc_work_cachep, GFP_NOFS);
+ work = kmalloc(sizeof(*work), GFP_NOFS);
if (!work)
return NULL;
init_completion(&work->completion);
INIT_LIST_HEAD(&work->list);
work->inode = inode;
- work->wait = wait;
work->delay_iput = delay_iput;
WARN_ON_ONCE(!inode);
btrfs_init_work(&work->work, btrfs_flush_delalloc_helper,
void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work)
{
wait_for_completion(&work->completion);
- kmem_cache_free(btrfs_delalloc_work_cachep, work);
+ kfree(work);
}
/*
}
spin_unlock(&root->delalloc_lock);
- work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
+ work = btrfs_alloc_delalloc_work(inode, delay_iput);
if (!work) {
if (delay_iput)
btrfs_add_delayed_iput(inode);
/*
* 2 items for inode item and ref
* 2 items for dir items
+ * 1 item for updating parent inode item
+ * 1 item for the inline extent item
* 1 item for xattr if selinux is on
*/
- trans = btrfs_start_transaction(root, 5);
+ trans = btrfs_start_transaction(root, 7);
if (IS_ERR(trans))
return PTR_ERR(trans);
if (err)
goto out_unlock_inode;
- err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
- if (err)
- goto out_unlock_inode;
-
path = btrfs_alloc_path();
if (!path) {
err = -ENOMEM;
btrfs_free_path(path);
inode->i_op = &btrfs_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_mapping->a_ops = &btrfs_symlink_aops;
inode_set_bytes(inode, name_len);
btrfs_i_size_write(inode, name_len);
err = btrfs_update_inode(trans, root, inode);
+ /*
+ * Last step, add directory indexes for our symlink inode. This is the
+ * last step to avoid extra cleanup of these indexes if an error happens
+ * elsewhere above.
+ */
+ if (!err)
+ err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
if (err) {
drop_inode = 1;
goto out_unlock_inode;
}
}
- cur_bytes = min(num_bytes, 256ULL * 1024 * 1024);
+ cur_bytes = min_t(u64, num_bytes, SZ_256M);
cur_bytes = max(cur_bytes, min_size);
/*
* If we are severely fragmented we could end up with really
.setattr = btrfs_setattr,
.mknod = btrfs_mknod,
.setxattr = btrfs_setxattr,
- .getxattr = btrfs_getxattr,
+ .getxattr = generic_getxattr,
.listxattr = btrfs_listxattr,
.removexattr = btrfs_removexattr,
.permission = btrfs_permission,
.fsync = btrfs_sync_file,
};
- static struct extent_io_ops btrfs_extent_io_ops = {
+ static const struct extent_io_ops btrfs_extent_io_ops = {
.fill_delalloc = run_delalloc_range,
.submit_bio_hook = btrfs_submit_bio_hook,
.merge_bio_hook = btrfs_merge_bio_hook,
.getattr = btrfs_getattr,
.setattr = btrfs_setattr,
.setxattr = btrfs_setxattr,
- .getxattr = btrfs_getxattr,
+ .getxattr = generic_getxattr,
.listxattr = btrfs_listxattr,
.removexattr = btrfs_removexattr,
.permission = btrfs_permission,
.setattr = btrfs_setattr,
.permission = btrfs_permission,
.setxattr = btrfs_setxattr,
- .getxattr = btrfs_getxattr,
+ .getxattr = generic_getxattr,
.listxattr = btrfs_listxattr,
.removexattr = btrfs_removexattr,
.get_acl = btrfs_get_acl,
};
static const struct inode_operations btrfs_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = page_follow_link_light,
- .put_link = page_put_link,
+ .get_link = page_get_link,
.getattr = btrfs_getattr,
.setattr = btrfs_setattr,
.permission = btrfs_permission,
.setxattr = btrfs_setxattr,
- .getxattr = btrfs_getxattr,
+ .getxattr = generic_getxattr,
.listxattr = btrfs_listxattr,
.removexattr = btrfs_removexattr,
.update_time = btrfs_update_time,
if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
return -EINVAL;
+ pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
+ if (!pending_snapshot)
+ return -ENOMEM;
+
+ pending_snapshot->root_item = kzalloc(sizeof(struct btrfs_root_item),
+ GFP_NOFS);
+ pending_snapshot->path = btrfs_alloc_path();
+ if (!pending_snapshot->root_item || !pending_snapshot->path) {
+ ret = -ENOMEM;
+ goto free_pending;
+ }
+
atomic_inc(&root->will_be_snapshoted);
smp_mb__after_atomic();
btrfs_wait_for_no_snapshoting_writes(root);
ret = btrfs_start_delalloc_inodes(root, 0);
if (ret)
- goto out;
+ goto dec_and_free;
btrfs_wait_ordered_extents(root, -1);
- pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
- if (!pending_snapshot) {
- ret = -ENOMEM;
- goto out;
- }
-
btrfs_init_block_rsv(&pending_snapshot->block_rsv,
BTRFS_BLOCK_RSV_TEMP);
/*
&pending_snapshot->qgroup_reserved,
false);
if (ret)
- goto free;
+ goto dec_and_free;
pending_snapshot->dentry = dentry;
pending_snapshot->root = root;
btrfs_subvolume_release_metadata(BTRFS_I(dir)->root,
&pending_snapshot->block_rsv,
pending_snapshot->qgroup_reserved);
- free:
- kfree(pending_snapshot);
- out:
+ dec_and_free:
if (atomic_dec_and_test(&root->will_be_snapshoted))
wake_up_atomic_t(&root->will_be_snapshoted);
+ free_pending:
+ kfree(pending_snapshot->root_item);
+ btrfs_free_path(pending_snapshot->path);
+ kfree(pending_snapshot);
+
return ret;
}
u64 end = start + len - 1;
/* get the big lock and read metadata off disk */
- lock_extent_bits(io_tree, start, end, 0, &cached);
+ lock_extent_bits(io_tree, start, end, &cached);
em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
unlock_extent_cached(io_tree, start, end, &cached, GFP_NOFS);
if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE)
ret = false;
else if ((em->block_start + em->block_len == next->block_start) &&
- (em->block_len > 128 * 1024 && next->block_len > 128 * 1024))
+ (em->block_len > SZ_128K && next->block_len > SZ_128K))
ret = false;
free_extent_map(next);
page_end = page_start + PAGE_CACHE_SIZE - 1;
while (1) {
lock_extent_bits(tree, page_start, page_end,
- 0, &cached_state);
+ &cached_state);
ordered = btrfs_lookup_ordered_extent(inode,
page_start);
unlock_extent_cached(tree, page_start, page_end,
page_end = page_offset(pages[i_done - 1]) + PAGE_CACHE_SIZE;
lock_extent_bits(&BTRFS_I(inode)->io_tree,
- page_start, page_end - 1, 0, &cached_state);
+ page_start, page_end - 1, &cached_state);
clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start,
page_end - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0,
int defrag_count = 0;
int compress_type = BTRFS_COMPRESS_ZLIB;
u32 extent_thresh = range->extent_thresh;
- unsigned long max_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT;
+ unsigned long max_cluster = SZ_256K >> PAGE_CACHE_SHIFT;
unsigned long cluster = max_cluster;
- u64 new_align = ~((u64)128 * 1024 - 1);
+ u64 new_align = ~((u64)SZ_128K - 1);
struct page **pages = NULL;
if (isize == 0)
}
if (extent_thresh == 0)
- extent_thresh = 256 * 1024;
+ extent_thresh = SZ_256K;
/*
* if we were not given a file, allocate a readahead
if (newer_than) {
ret = find_new_extents(root, inode, newer_than,
- &newer_off, 64 * 1024);
+ &newer_off, SZ_64K);
if (!ret) {
range->start = newer_off;
/*
newer_off = max(newer_off + 1,
(u64)i << PAGE_CACHE_SHIFT);
- ret = find_new_extents(root, inode,
- newer_than, &newer_off,
- 64 * 1024);
+ ret = find_new_extents(root, inode, newer_than,
+ &newer_off, SZ_64K);
if (!ret) {
range->start = newer_off;
i = (newer_off & new_align) >> PAGE_CACHE_SHIFT;
new_size = old_size + new_size;
}
- if (new_size < 256 * 1024 * 1024) {
+ if (new_size < SZ_256M) {
ret = -EINVAL;
goto out_free;
}
struct inode *inode;
int ret;
size_t buf_size;
- const size_t buf_limit = 16 * 1024 * 1024;
+ const size_t buf_limit = SZ_16M;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
flush_dcache_page(dst_page);
if (memcmp(addr, dst_addr, cmp_len))
- ret = BTRFS_SAME_DATA_DIFFERS;
+ ret = -EBADE;
kunmap_atomic(addr);
kunmap_atomic(dst_addr);
return ret;
}
- #define BTRFS_MAX_DEDUPE_LEN (16 * 1024 * 1024)
+ #define BTRFS_MAX_DEDUPE_LEN SZ_16M
-static long btrfs_ioctl_file_extent_same(struct file *file,
- struct btrfs_ioctl_same_args __user *argp)
+ssize_t btrfs_dedupe_file_range(struct file *src_file, u64 loff, u64 olen,
+ struct file *dst_file, u64 dst_loff)
{
- struct btrfs_ioctl_same_args *same = NULL;
- struct btrfs_ioctl_same_extent_info *info;
- struct inode *src = file_inode(file);
- u64 off;
- u64 len;
- int i;
- int ret;
- unsigned long size;
+ struct inode *src = file_inode(src_file);
+ struct inode *dst = file_inode(dst_file);
u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize;
- bool is_admin = capable(CAP_SYS_ADMIN);
- u16 count;
-
- if (!(file->f_mode & FMODE_READ))
- return -EINVAL;
-
- ret = mnt_want_write_file(file);
- if (ret)
- return ret;
-
- if (get_user(count, &argp->dest_count)) {
- ret = -EFAULT;
- goto out;
- }
-
- size = offsetof(struct btrfs_ioctl_same_args __user, info[count]);
-
- same = memdup_user(argp, size);
-
- if (IS_ERR(same)) {
- ret = PTR_ERR(same);
- same = NULL;
- goto out;
- }
+ ssize_t res;
- off = same->logical_offset;
- len = same->length;
-
- /*
- * Limit the total length we will dedupe for each operation.
- * This is intended to bound the total time spent in this
- * ioctl to something sane.
- */
- if (len > BTRFS_MAX_DEDUPE_LEN)
- len = BTRFS_MAX_DEDUPE_LEN;
+ if (olen > BTRFS_MAX_DEDUPE_LEN)
+ olen = BTRFS_MAX_DEDUPE_LEN;
if (WARN_ON_ONCE(bs < PAGE_CACHE_SIZE)) {
/*
* result, btrfs_cmp_data() won't correctly handle
* this situation without an update.
*/
- ret = -EINVAL;
- goto out;
- }
-
- ret = -EISDIR;
- if (S_ISDIR(src->i_mode))
- goto out;
-
- ret = -EACCES;
- if (!S_ISREG(src->i_mode))
- goto out;
-
- /* pre-format output fields to sane values */
- for (i = 0; i < count; i++) {
- same->info[i].bytes_deduped = 0ULL;
- same->info[i].status = 0;
- }
-
- for (i = 0, info = same->info; i < count; i++, info++) {
- struct inode *dst;
- struct fd dst_file = fdget(info->fd);
- if (!dst_file.file) {
- info->status = -EBADF;
- continue;
- }
- dst = file_inode(dst_file.file);
-
- if (!(is_admin || (dst_file.file->f_mode & FMODE_WRITE))) {
- info->status = -EINVAL;
- } else if (file->f_path.mnt != dst_file.file->f_path.mnt) {
- info->status = -EXDEV;
- } else if (S_ISDIR(dst->i_mode)) {
- info->status = -EISDIR;
- } else if (!S_ISREG(dst->i_mode)) {
- info->status = -EACCES;
- } else {
- info->status = btrfs_extent_same(src, off, len, dst,
- info->logical_offset);
- if (info->status == 0)
- info->bytes_deduped += len;
- }
- fdput(dst_file);
+ return -EINVAL;
}
- ret = copy_to_user(argp, same, size);
- if (ret)
- ret = -EFAULT;
-
-out:
- mnt_drop_write_file(file);
- kfree(same);
- return ret;
+ res = btrfs_extent_same(src, loff, olen, dst, dst_loff);
+ if (res)
+ return res;
+ return olen;
}
static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
return ret;
}
- path->reada = 2;
+ path->reada = READA_FORWARD;
/* clone data */
key.objectid = btrfs_ino(src);
key.type = BTRFS_EXTENT_DATA_KEY;
return ret;
}
-static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
- u64 off, u64 olen, u64 destoff)
+static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
+ u64 off, u64 olen, u64 destoff)
{
struct inode *inode = file_inode(file);
+ struct inode *src = file_inode(file_src);
struct btrfs_root *root = BTRFS_I(inode)->root;
- struct fd src_file;
- struct inode *src;
int ret;
u64 len = olen;
u64 bs = root->fs_info->sb->s_blocksize;
- int same_inode = 0;
+ int same_inode = src == inode;
/*
* TODO:
* be either compressed or non-compressed.
*/
- /* the destination must be opened for writing */
- if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND))
- return -EINVAL;
-
if (btrfs_root_readonly(root))
return -EROFS;
- ret = mnt_want_write_file(file);
- if (ret)
- return ret;
-
- src_file = fdget(srcfd);
- if (!src_file.file) {
- ret = -EBADF;
- goto out_drop_write;
- }
-
- ret = -EXDEV;
- if (src_file.file->f_path.mnt != file->f_path.mnt)
- goto out_fput;
-
- src = file_inode(src_file.file);
-
- ret = -EINVAL;
- if (src == inode)
- same_inode = 1;
-
- /* the src must be open for reading */
- if (!(src_file.file->f_mode & FMODE_READ))
- goto out_fput;
+ if (file_src->f_path.mnt != file->f_path.mnt ||
+ src->i_sb != inode->i_sb)
+ return -EXDEV;
/* don't make the dst file partly checksummed */
if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) !=
(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM))
- goto out_fput;
+ return -EINVAL;
- ret = -EISDIR;
if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode))
- goto out_fput;
-
- ret = -EXDEV;
- if (src->i_sb != inode->i_sb)
- goto out_fput;
+ return -EISDIR;
if (!same_inode) {
btrfs_double_inode_lock(src, inode);
btrfs_double_inode_unlock(src, inode);
else
mutex_unlock(&src->i_mutex);
-out_fput:
- fdput(src_file);
-out_drop_write:
- mnt_drop_write_file(file);
return ret;
}
-static long btrfs_ioctl_clone_range(struct file *file, void __user *argp)
+ssize_t btrfs_copy_file_range(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ size_t len, unsigned int flags)
{
- struct btrfs_ioctl_clone_range_args args;
+ ssize_t ret;
- if (copy_from_user(&args, argp, sizeof(args)))
- return -EFAULT;
- return btrfs_ioctl_clone(file, args.src_fd, args.src_offset,
- args.src_length, args.dest_offset);
+ ret = btrfs_clone_files(file_out, file_in, pos_in, len, pos_out);
+ if (ret == 0)
+ ret = len;
+ return ret;
+}
+
+int btrfs_clone_file_range(struct file *src_file, loff_t off,
+ struct file *dst_file, loff_t destoff, u64 len)
+{
+ return btrfs_clone_files(dst_file, src_file, off, len, destoff);
}
/*
return -ENOMEM;
space_args.total_spaces = 0;
- dest = kmalloc(alloc_size, GFP_NOFS);
+ dest = kmalloc(alloc_size, GFP_KERNEL);
if (!dest)
return -ENOMEM;
dest_orig = dest;
goto out;
}
- size = min_t(u32, loi->size, 64 * 1024);
+ size = min_t(u32, loi->size, SZ_64K);
inodes = init_data_container(size);
if (IS_ERR(inodes)) {
ret = PTR_ERR(inodes);
goto out_bargs;
}
- bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
+ bctl = kzalloc(sizeof(*bctl), GFP_KERNEL);
if (!bctl) {
ret = -ENOMEM;
goto out_bargs;
goto out;
}
- bargs = kzalloc(sizeof(*bargs), GFP_NOFS);
+ bargs = kzalloc(sizeof(*bargs), GFP_KERNEL);
if (!bargs) {
ret = -ENOMEM;
goto out;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- qsa = kzalloc(sizeof(*qsa), GFP_NOFS);
+ qsa = kzalloc(sizeof(*qsa), GFP_KERNEL);
if (!qsa)
return -ENOMEM;
goto out;
}
- args64 = kmalloc(sizeof(*args64), GFP_NOFS);
+ args64 = kmalloc(sizeof(*args64), GFP_KERNEL);
if (!args64) {
ret = -ENOMEM;
goto out;
static int btrfs_ioctl_get_supported_features(struct file *file,
void __user *arg)
{
- static struct btrfs_ioctl_feature_flags features[3] = {
+ static const struct btrfs_ioctl_feature_flags features[3] = {
INIT_FEATURE_FLAGS(SUPP),
INIT_FEATURE_FLAGS(SAFE_SET),
INIT_FEATURE_FLAGS(SAFE_CLEAR)
return btrfs_ioctl_dev_info(root, argp);
case BTRFS_IOC_BALANCE:
return btrfs_ioctl_balance(file, NULL);
- case BTRFS_IOC_CLONE:
- return btrfs_ioctl_clone(file, arg, 0, 0, 0);
- case BTRFS_IOC_CLONE_RANGE:
- return btrfs_ioctl_clone_range(file, argp);
case BTRFS_IOC_TRANS_START:
return btrfs_ioctl_trans_start(file);
case BTRFS_IOC_TRANS_END:
return btrfs_ioctl_get_fslabel(file, argp);
case BTRFS_IOC_SET_FSLABEL:
return btrfs_ioctl_set_fslabel(file, argp);
- case BTRFS_IOC_FILE_EXTENT_SAME:
- return btrfs_ioctl_file_extent_same(file, argp);
case BTRFS_IOC_GET_SUPPORTED_FEATURES:
return btrfs_ioctl_get_supported_features(file, argp);
case BTRFS_IOC_GET_FEATURES:
Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress,
Opt_compress_type, Opt_compress_force, Opt_compress_force_type,
Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard,
- Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed,
- Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, Opt_inode_cache,
- Opt_no_space_cache, Opt_recovery, Opt_skip_balance,
- Opt_check_integrity, Opt_check_integrity_including_extent_data,
+ Opt_space_cache, Opt_space_cache_version, Opt_clear_cache,
+ Opt_user_subvol_rm_allowed, Opt_enospc_debug, Opt_subvolrootid,
+ Opt_defrag, Opt_inode_cache, Opt_no_space_cache, Opt_recovery,
+ Opt_skip_balance, Opt_check_integrity,
+ Opt_check_integrity_including_extent_data,
Opt_check_integrity_print_mask, Opt_fatal_errors, Opt_rescan_uuid_tree,
Opt_commit_interval, Opt_barrier, Opt_nodefrag, Opt_nodiscard,
Opt_noenospc_debug, Opt_noflushoncommit, Opt_acl, Opt_datacow,
Opt_err,
};
- static match_table_t tokens = {
+ static const match_table_t tokens = {
{Opt_degraded, "degraded"},
{Opt_subvol, "subvol=%s"},
{Opt_subvolid, "subvolid=%s"},
{Opt_discard, "discard"},
{Opt_nodiscard, "nodiscard"},
{Opt_space_cache, "space_cache"},
+ {Opt_space_cache_version, "space_cache=%s"},
{Opt_clear_cache, "clear_cache"},
{Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"},
{Opt_enospc_debug, "enospc_debug"},
bool compress_force = false;
cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
- if (cache_gen)
+ if (btrfs_fs_compat_ro(root->fs_info, FREE_SPACE_TREE))
+ btrfs_set_opt(info->mount_opt, FREE_SPACE_TREE);
+ else if (cache_gen)
btrfs_set_opt(info->mount_opt, SPACE_CACHE);
if (!options)
"turning off discard");
break;
case Opt_space_cache:
- btrfs_set_and_info(root, SPACE_CACHE,
- "enabling disk space caching");
+ case Opt_space_cache_version:
+ if (token == Opt_space_cache ||
+ strcmp(args[0].from, "v1") == 0) {
+ btrfs_clear_opt(root->fs_info->mount_opt,
+ FREE_SPACE_TREE);
+ btrfs_set_and_info(root, SPACE_CACHE,
+ "enabling disk space caching");
+ } else if (strcmp(args[0].from, "v2") == 0) {
+ btrfs_clear_opt(root->fs_info->mount_opt,
+ SPACE_CACHE);
+ btrfs_set_and_info(root, FREE_SPACE_TREE,
+ "enabling free space tree");
+ } else {
+ ret = -EINVAL;
+ goto out;
+ }
break;
case Opt_rescan_uuid_tree:
btrfs_set_opt(info->mount_opt, RESCAN_UUID_TREE);
break;
case Opt_no_space_cache:
- btrfs_clear_and_info(root, SPACE_CACHE,
- "disabling disk space caching");
+ if (btrfs_test_opt(root, SPACE_CACHE)) {
+ btrfs_clear_and_info(root, SPACE_CACHE,
+ "disabling disk space caching");
+ }
+ if (btrfs_test_opt(root, FREE_SPACE_TREE)) {
+ btrfs_clear_and_info(root, FREE_SPACE_TREE,
+ "disabling free space tree");
+ }
break;
case Opt_inode_cache:
btrfs_set_pending_and_info(info, INODE_MAP_CACHE,
}
}
out:
+ if (btrfs_fs_compat_ro(root->fs_info, FREE_SPACE_TREE) &&
+ !btrfs_test_opt(root, FREE_SPACE_TREE) &&
+ !btrfs_test_opt(root, CLEAR_CACHE)) {
+ btrfs_err(root->fs_info, "cannot disable free space tree");
+ ret = -EINVAL;
+
+ }
if (!ret && btrfs_test_opt(root, SPACE_CACHE))
btrfs_info(root->fs_info, "disk space caching is enabled");
+ if (!ret && btrfs_test_opt(root, FREE_SPACE_TREE))
+ btrfs_info(root->fs_info, "using free space tree");
kfree(orig);
return ret;
}
seq_puts(seq, ",noacl");
if (btrfs_test_opt(root, SPACE_CACHE))
seq_puts(seq, ",space_cache");
+ else if (btrfs_test_opt(root, FREE_SPACE_TREE))
+ seq_puts(seq, ",space_cache=v2");
else
seq_puts(seq, ",nospace_cache");
if (btrfs_test_opt(root, RESCAN_UUID_TREE))
if ((flags ^ s->s_flags) & MS_RDONLY)
error = -EBUSY;
} else {
- char b[BDEVNAME_SIZE];
-
- strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
+ snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev);
btrfs_sb(s)->bdev_holder = fs_type;
error = btrfs_fill_super(s, fs_devices, data,
flags & MS_SILENT ? 1 : 0);
* btrfs starts at an offset of at least 1MB when doing chunk
* allocation.
*/
- skip_space = 1024 * 1024;
+ skip_space = SZ_1M;
/* user can set the offset in fs_info->alloc_start. */
if (fs_info->alloc_start &&
* there are other factors that may change the result (like a new metadata
* chunk).
*
+ * If metadata is exhausted, f_bavail will be 0.
+ *
* FIXME: not accurate for mixed block groups, total and free/used are ok,
* available appears slightly larger.
*/
struct btrfs_space_info *found;
u64 total_used = 0;
u64 total_free_data = 0;
+ u64 total_free_meta = 0;
int bits = dentry->d_sb->s_blocksize_bits;
__be32 *fsid = (__be32 *)fs_info->fsid;
unsigned factor = 1;
struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
int ret;
+ u64 thresh = 0;
/*
* holding chunk_muext to avoid allocating new chunks, holding
}
}
}
+ if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
+ total_free_meta += found->disk_total - found->disk_used;
total_used += found->disk_used;
}
buf->f_bavail += div_u64(total_free_data, factor);
buf->f_bavail = buf->f_bavail >> bits;
+ /*
+ * We calculate the remaining metadata space minus global reserve. If
+ * this is (supposedly) smaller than zero, there's no space. But this
+ * does not hold in practice, the exhausted state happens where's still
+ * some positive delta. So we apply some guesswork and compare the
+ * delta to a 4M threshold. (Practically observed delta was ~2M.)
+ *
+ * We probably cannot calculate the exact threshold value because this
+ * depends on the internal reservations requested by various
+ * operations, so some operations that consume a few metadata will
+ * succeed even if the Avail is zero. But this is better than the other
+ * way around.
+ */
+ thresh = 4 * 1024 * 1024;
+
+ if (total_free_meta - thresh < block_rsv->size)
+ buf->f_bavail = 0;
+
buf->f_type = BTRFS_SUPER_MAGIC;
buf->f_bsize = dentry->d_sb->s_blocksize;
buf->f_namelen = BTRFS_NAME_LEN;
if (ret)
goto out;
ret = btrfs_test_qgroups();
+ if (ret)
+ goto out;
+ ret = btrfs_test_free_space_tree();
out:
btrfs_destroy_test_fs();
return ret;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- path->reada = 2;
+ path->reada = READA_FORWARD;
/* search for our xattrs */
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
return ret;
}
-/*
- * List of handlers for synthetic system.* attributes. All real ondisk
- * attributes are handled directly.
- */
-const struct xattr_handler *btrfs_xattr_handlers[] = {
-#ifdef CONFIG_BTRFS_FS_POSIX_ACL
- &posix_acl_access_xattr_handler,
- &posix_acl_default_xattr_handler,
-#endif
- NULL,
-};
-
-/*
- * Check if the attribute is in a supported namespace.
- *
- * This is applied after the check for the synthetic attributes in the system
- * namespace.
- */
-static int btrfs_is_valid_xattr(const char *name)
+static int btrfs_xattr_handler_get(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ void *buffer, size_t size)
{
- int len = strlen(name);
- int prefixlen = 0;
-
- if (!strncmp(name, XATTR_SECURITY_PREFIX,
- XATTR_SECURITY_PREFIX_LEN))
- prefixlen = XATTR_SECURITY_PREFIX_LEN;
- else if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
- prefixlen = XATTR_SYSTEM_PREFIX_LEN;
- else if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN))
- prefixlen = XATTR_TRUSTED_PREFIX_LEN;
- else if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN))
- prefixlen = XATTR_USER_PREFIX_LEN;
- else if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN))
- prefixlen = XATTR_BTRFS_PREFIX_LEN;
- else
- return -EOPNOTSUPP;
-
- /*
- * The name cannot consist of just prefix
- */
- if (len <= prefixlen)
- return -EINVAL;
+ struct inode *inode = d_inode(dentry);
- return 0;
+ name = xattr_full_name(handler, name);
+ return __btrfs_getxattr(inode, name, buffer, size);
}
-ssize_t btrfs_getxattr(struct dentry *dentry, const char *name,
- void *buffer, size_t size)
+static int btrfs_xattr_handler_set(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ const void *buffer, size_t size,
+ int flags)
{
- int ret;
+ struct inode *inode = d_inode(dentry);
- /*
- * If this is a request for a synthetic attribute in the system.*
- * namespace use the generic infrastructure to resolve a handler
- * for it via sb->s_xattr.
- */
- if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
- return generic_getxattr(dentry, name, buffer, size);
+ name = xattr_full_name(handler, name);
+ return __btrfs_setxattr(NULL, inode, name, buffer, size, flags);
+}
- ret = btrfs_is_valid_xattr(name);
- if (ret)
- return ret;
- return __btrfs_getxattr(d_inode(dentry), name, buffer, size);
+static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler,
+ struct dentry *dentry,
+ const char *name, const void *value,
+ size_t size, int flags)
+{
+ name = xattr_full_name(handler, name);
+ return btrfs_set_prop(d_inode(dentry), name, value, size, flags);
}
+static const struct xattr_handler btrfs_security_xattr_handler = {
+ .prefix = XATTR_SECURITY_PREFIX,
+ .get = btrfs_xattr_handler_get,
+ .set = btrfs_xattr_handler_set,
+};
+
+static const struct xattr_handler btrfs_trusted_xattr_handler = {
+ .prefix = XATTR_TRUSTED_PREFIX,
+ .get = btrfs_xattr_handler_get,
+ .set = btrfs_xattr_handler_set,
+};
+
+static const struct xattr_handler btrfs_user_xattr_handler = {
+ .prefix = XATTR_USER_PREFIX,
+ .get = btrfs_xattr_handler_get,
+ .set = btrfs_xattr_handler_set,
+};
+
+static const struct xattr_handler btrfs_btrfs_xattr_handler = {
+ .prefix = XATTR_BTRFS_PREFIX,
+ .get = btrfs_xattr_handler_get,
+ .set = btrfs_xattr_handler_set_prop,
+};
+
+const struct xattr_handler *btrfs_xattr_handlers[] = {
+ &btrfs_security_xattr_handler,
+#ifdef CONFIG_BTRFS_FS_POSIX_ACL
+ &posix_acl_access_xattr_handler,
+ &posix_acl_default_xattr_handler,
+#endif
+ &btrfs_trusted_xattr_handler,
+ &btrfs_user_xattr_handler,
+ &btrfs_btrfs_xattr_handler,
+ NULL,
+};
+
int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value,
size_t size, int flags)
{
struct btrfs_root *root = BTRFS_I(d_inode(dentry))->root;
- int ret;
- /*
- * The permission on security.* and system.* is not checked
- * in permission().
- */
if (btrfs_root_readonly(root))
return -EROFS;
-
- /*
- * If this is a request for a synthetic attribute in the system.*
- * namespace use the generic infrastructure to resolve a handler
- * for it via sb->s_xattr.
- */
- if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
- return generic_setxattr(dentry, name, value, size, flags);
-
- ret = btrfs_is_valid_xattr(name);
- if (ret)
- return ret;
-
- if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN))
- return btrfs_set_prop(d_inode(dentry), name,
- value, size, flags);
-
- if (size == 0)
- value = ""; /* empty EA, do not remove */
-
- return __btrfs_setxattr(NULL, d_inode(dentry), name, value, size,
- flags);
+ return generic_setxattr(dentry, name, value, size, flags);
}
int btrfs_removexattr(struct dentry *dentry, const char *name)
{
struct btrfs_root *root = BTRFS_I(d_inode(dentry))->root;
- int ret;
- /*
- * The permission on security.* and system.* is not checked
- * in permission().
- */
if (btrfs_root_readonly(root))
return -EROFS;
-
- /*
- * If this is a request for a synthetic attribute in the system.*
- * namespace use the generic infrastructure to resolve a handler
- * for it via sb->s_xattr.
- */
- if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
- return generic_removexattr(dentry, name);
-
- ret = btrfs_is_valid_xattr(name);
- if (ret)
- return ret;
-
- if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN))
- return btrfs_set_prop(d_inode(dentry), name,
- NULL, 0, XATTR_REPLACE);
-
- return __btrfs_setxattr(NULL, d_inode(dentry), name, NULL, 0,
- XATTR_REPLACE);
+ return generic_removexattr(dentry, name);
}
static int btrfs_initxattrs(struct inode *inode,
for (xattr = xattr_array; xattr->name != NULL; xattr++) {
name = kmalloc(XATTR_SECURITY_PREFIX_LEN +
- strlen(xattr->name) + 1, GFP_NOFS);
+ strlen(xattr->name) + 1, GFP_KERNEL);
if (!name) {
err = -ENOMEM;
break;