Merge branch 'for-linus' of git://git.kernel.dk/linux-block
authorLinus Torvalds <torvalds@linux-foundation.org>
Fri, 23 Oct 2015 22:20:57 +0000 (07:20 +0900)
committerLinus Torvalds <torvalds@linux-foundation.org>
Fri, 23 Oct 2015 22:20:57 +0000 (07:20 +0900)
Pull block layer fixes from Jens Axboe:
 "A final set of fixes for 4.3.

  It is (again) bigger than I would have liked, but it's all been
  through the testing mill and has been carefully reviewed by multiple
  parties.  Each fix is either a regression fix for this cycle, or is
  marked stable.  You can scold me at KS.  The pull request contains:

   - Three simple fixes for NVMe, fixing regressions since 4.3.  From
     Arnd, Christoph, and Keith.

   - A single xen-blkfront fix from Cathy, fixing a NULL dereference if
     an error is returned through the staste change callback.

   - Fixup for some bad/sloppy code in nbd that got introduced earlier
     in this cycle.  From Markus Pargmann.

   - A blk-mq tagset use-after-free fix from Junichi.

   - A backing device lifetime fix from Tejun, fixing a crash.

   - And finally, a set of regression/stable fixes for cgroup writeback
     from Tejun"

* 'for-linus' of git://git.kernel.dk/linux-block:
  writeback: remove broken rbtree_postorder_for_each_entry_safe() usage in cgwb_bdi_destroy()
  NVMe: Fix memory leak on retried commands
  block: don't release bdi while request_queue has live references
  nvme: use an integer value to Linux errno values
  blk-mq: fix use-after-free in blk_mq_free_tag_set()
  nvme: fix 32-bit build warning
  writeback: fix incorrect calculation of available memory for memcg domains
  writeback: memcg dirty_throttle_control should be initialized with wb->memcg_completions
  writeback: bdi_writeback iteration must not skip dying ones
  writeback: fix bdi_writeback iteration in wakeup_dirtytime_writeback()
  writeback: laptop_mode_timer_fn() needs rcu_read_lock() around bdi_writeback iteration
  nbd: Add locking for tasks
  xen-blkfront: check for null drvdata in blkback_changed (XenbusStateClosing)

14 files changed:
block/blk-core.c
block/blk-mq-tag.c
block/blk-mq.c
block/blk-sysfs.c
drivers/block/nbd.c
drivers/block/nvme-core.c
drivers/block/xen-blkfront.c
fs/fs-writeback.c
include/linux/backing-dev-defs.h
include/linux/backing-dev.h
include/linux/memcontrol.h
mm/backing-dev.c
mm/memcontrol.c
mm/page-writeback.c

index 2eb722d48773cb8a8de49d58b934eed830755da7..18e92a6645e24741b786bc35f14b9d06f1355569 100644 (file)
@@ -576,7 +576,7 @@ void blk_cleanup_queue(struct request_queue *q)
                q->queue_lock = &q->__queue_lock;
        spin_unlock_irq(lock);
 
-       bdi_destroy(&q->backing_dev_info);
+       bdi_unregister(&q->backing_dev_info);
 
        /* @q is and will stay empty, shutdown and put */
        blk_put_queue(q);
index ed96474d75cb62fb261526736727c67ea2238d46..ec2d11915142a8f9b7a49e839e41a2f54a55aa09 100644 (file)
@@ -641,6 +641,7 @@ void blk_mq_free_tags(struct blk_mq_tags *tags)
 {
        bt_free(&tags->bitmap_tags);
        bt_free(&tags->breserved_tags);
+       free_cpumask_var(tags->cpumask);
        kfree(tags);
 }
 
index 7785ae96267a197926c700f74bcd6524892a8c01..85f014327342efc775c31833a52f531b69a66329 100644 (file)
@@ -2296,10 +2296,8 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
        int i;
 
        for (i = 0; i < set->nr_hw_queues; i++) {
-               if (set->tags[i]) {
+               if (set->tags[i])
                        blk_mq_free_rq_map(set, set->tags[i], i);
-                       free_cpumask_var(set->tags[i]->cpumask);
-               }
        }
 
        kfree(set->tags);
index 3e44a9da2a13579cacaee3d5e03bb3868f34d02a..07b42f5ad797b7b5e6367d5c70051d3b34fca55c 100644 (file)
@@ -540,6 +540,7 @@ static void blk_release_queue(struct kobject *kobj)
        struct request_queue *q =
                container_of(kobj, struct request_queue, kobj);
 
+       bdi_exit(&q->backing_dev_info);
        blkcg_exit_queue(q);
 
        if (q->elevator) {
index 293495a75d3d8ce1e0f61e3571ddcc0c1fa14312..1b87623381e2b1183b5c9d57c870b7c10924f65e 100644 (file)
@@ -60,6 +60,7 @@ struct nbd_device {
        bool disconnect; /* a disconnect has been requested by user */
 
        struct timer_list timeout_timer;
+       spinlock_t tasks_lock;
        struct task_struct *task_recv;
        struct task_struct *task_send;
 
@@ -140,21 +141,23 @@ static void sock_shutdown(struct nbd_device *nbd)
 static void nbd_xmit_timeout(unsigned long arg)
 {
        struct nbd_device *nbd = (struct nbd_device *)arg;
-       struct task_struct *task;
+       unsigned long flags;
 
        if (list_empty(&nbd->queue_head))
                return;
 
        nbd->disconnect = true;
 
-       task = READ_ONCE(nbd->task_recv);
-       if (task)
-               force_sig(SIGKILL, task);
+       spin_lock_irqsave(&nbd->tasks_lock, flags);
+
+       if (nbd->task_recv)
+               force_sig(SIGKILL, nbd->task_recv);
 
-       task = READ_ONCE(nbd->task_send);
-       if (task)
+       if (nbd->task_send)
                force_sig(SIGKILL, nbd->task_send);
 
+       spin_unlock_irqrestore(&nbd->tasks_lock, flags);
+
        dev_err(nbd_to_dev(nbd), "Connection timed out, killed receiver and sender, shutting down connection\n");
 }
 
@@ -403,17 +406,24 @@ static int nbd_thread_recv(struct nbd_device *nbd)
 {
        struct request *req;
        int ret;
+       unsigned long flags;
 
        BUG_ON(nbd->magic != NBD_MAGIC);
 
        sk_set_memalloc(nbd->sock->sk);
 
+       spin_lock_irqsave(&nbd->tasks_lock, flags);
        nbd->task_recv = current;
+       spin_unlock_irqrestore(&nbd->tasks_lock, flags);
 
        ret = device_create_file(disk_to_dev(nbd->disk), &pid_attr);
        if (ret) {
                dev_err(disk_to_dev(nbd->disk), "device_create_file failed!\n");
+
+               spin_lock_irqsave(&nbd->tasks_lock, flags);
                nbd->task_recv = NULL;
+               spin_unlock_irqrestore(&nbd->tasks_lock, flags);
+
                return ret;
        }
 
@@ -429,7 +439,9 @@ static int nbd_thread_recv(struct nbd_device *nbd)
 
        device_remove_file(disk_to_dev(nbd->disk), &pid_attr);
 
+       spin_lock_irqsave(&nbd->tasks_lock, flags);
        nbd->task_recv = NULL;
+       spin_unlock_irqrestore(&nbd->tasks_lock, flags);
 
        if (signal_pending(current)) {
                siginfo_t info;
@@ -534,8 +546,11 @@ static int nbd_thread_send(void *data)
 {
        struct nbd_device *nbd = data;
        struct request *req;
+       unsigned long flags;
 
+       spin_lock_irqsave(&nbd->tasks_lock, flags);
        nbd->task_send = current;
+       spin_unlock_irqrestore(&nbd->tasks_lock, flags);
 
        set_user_nice(current, MIN_NICE);
        while (!kthread_should_stop() || !list_empty(&nbd->waiting_queue)) {
@@ -572,7 +587,15 @@ static int nbd_thread_send(void *data)
                nbd_handle_req(nbd, req);
        }
 
+       spin_lock_irqsave(&nbd->tasks_lock, flags);
        nbd->task_send = NULL;
+       spin_unlock_irqrestore(&nbd->tasks_lock, flags);
+
+       /* Clear maybe pending signals */
+       if (signal_pending(current)) {
+               siginfo_t info;
+               dequeue_signal_lock(current, &current->blocked, &info);
+       }
 
        return 0;
 }
@@ -1052,6 +1075,7 @@ static int __init nbd_init(void)
                nbd_dev[i].magic = NBD_MAGIC;
                INIT_LIST_HEAD(&nbd_dev[i].waiting_queue);
                spin_lock_init(&nbd_dev[i].queue_lock);
+               spin_lock_init(&nbd_dev[i].tasks_lock);
                INIT_LIST_HEAD(&nbd_dev[i].queue_head);
                mutex_init(&nbd_dev[i].tx_lock);
                init_timer(&nbd_dev[i].timeout_timer);
index 6f04771f1019798cc2feabf73eff2ddbadc84b81..ccc0c1f93daa45fae2603dd8e36bd2dcc38a8217 100644 (file)
@@ -603,27 +603,31 @@ static void req_completion(struct nvme_queue *nvmeq, void *ctx,
        struct nvme_iod *iod = ctx;
        struct request *req = iod_get_private(iod);
        struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req);
-
        u16 status = le16_to_cpup(&cqe->status) >> 1;
+       bool requeue = false;
+       int error = 0;
 
        if (unlikely(status)) {
                if (!(status & NVME_SC_DNR || blk_noretry_request(req))
                    && (jiffies - req->start_time) < req->timeout) {
                        unsigned long flags;
 
+                       requeue = true;
                        blk_mq_requeue_request(req);
                        spin_lock_irqsave(req->q->queue_lock, flags);
                        if (!blk_queue_stopped(req->q))
                                blk_mq_kick_requeue_list(req->q);
                        spin_unlock_irqrestore(req->q->queue_lock, flags);
-                       return;
+                       goto release_iod;
                }
 
                if (req->cmd_type == REQ_TYPE_DRV_PRIV) {
                        if (cmd_rq->ctx == CMD_CTX_CANCELLED)
-                               status = -EINTR;
+                               error = -EINTR;
+                       else
+                               error = status;
                } else {
-                       status = nvme_error_status(status);
+                       error = nvme_error_status(status);
                }
        }
 
@@ -635,8 +639,9 @@ static void req_completion(struct nvme_queue *nvmeq, void *ctx,
        if (cmd_rq->aborted)
                dev_warn(nvmeq->dev->dev,
                        "completing aborted command with status:%04x\n",
-                       status);
+                       error);
 
+release_iod:
        if (iod->nents) {
                dma_unmap_sg(nvmeq->dev->dev, iod->sg, iod->nents,
                        rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
@@ -649,7 +654,8 @@ static void req_completion(struct nvme_queue *nvmeq, void *ctx,
        }
        nvme_free_iod(nvmeq->dev, iod);
 
-       blk_mq_complete_request(req, status);
+       if (likely(!requeue))
+               blk_mq_complete_request(req, error);
 }
 
 /* length is in bytes.  gfp flags indicates whether we may sleep. */
@@ -1804,7 +1810,7 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
 
        length = (io.nblocks + 1) << ns->lba_shift;
        meta_len = (io.nblocks + 1) * ns->ms;
-       metadata = (void __user *)(unsigned long)io.metadata;
+       metadata = (void __user *)(uintptr_t)io.metadata;
        write = io.opcode & 1;
 
        if (ns->ext) {
@@ -1844,7 +1850,7 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
        c.rw.metadata = cpu_to_le64(meta_dma);
 
        status = __nvme_submit_sync_cmd(ns->queue, &c, NULL,
-                       (void __user *)io.addr, length, NULL, 0);
+                       (void __user *)(uintptr_t)io.addr, length, NULL, 0);
  unmap:
        if (meta) {
                if (status == NVME_SC_SUCCESS && !write) {
@@ -1886,7 +1892,7 @@ static int nvme_user_cmd(struct nvme_dev *dev, struct nvme_ns *ns,
                timeout = msecs_to_jiffies(cmd.timeout_ms);
 
        status = __nvme_submit_sync_cmd(ns ? ns->queue : dev->admin_q, &c,
-                       NULL, (void __user *)cmd.addr, cmd.data_len,
+                       NULL, (void __user *)(uintptr_t)cmd.addr, cmd.data_len,
                        &cmd.result, timeout);
        if (status >= 0) {
                if (put_user(cmd.result, &ucmd->result))
index 611170896b8c94ce1d7494d62116ba1fde574fce..a69c02dadec05684f2a49eefe98033e8ebbc0c3e 100644 (file)
@@ -1956,7 +1956,8 @@ static void blkback_changed(struct xenbus_device *dev,
                        break;
                /* Missed the backend's Closing state -- fallthrough */
        case XenbusStateClosing:
-               blkfront_closing(info);
+               if (info)
+                       blkfront_closing(info);
                break;
        }
 }
index 091a36444972fa0b746118b0cc58ca2a3dcaa97f..29e4599f6fc1c20b73acb2c580ce8383e169c5f2 100644 (file)
@@ -778,19 +778,24 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
                                  struct wb_writeback_work *base_work,
                                  bool skip_if_busy)
 {
-       int next_memcg_id = 0;
-       struct bdi_writeback *wb;
-       struct wb_iter iter;
+       struct bdi_writeback *last_wb = NULL;
+       struct bdi_writeback *wb = list_entry_rcu(&bdi->wb_list,
+                                               struct bdi_writeback, bdi_node);
 
        might_sleep();
 restart:
        rcu_read_lock();
-       bdi_for_each_wb(wb, bdi, &iter, next_memcg_id) {
+       list_for_each_entry_continue_rcu(wb, &bdi->wb_list, bdi_node) {
                DEFINE_WB_COMPLETION_ONSTACK(fallback_work_done);
                struct wb_writeback_work fallback_work;
                struct wb_writeback_work *work;
                long nr_pages;
 
+               if (last_wb) {
+                       wb_put(last_wb);
+                       last_wb = NULL;
+               }
+
                /* SYNC_ALL writes out I_DIRTY_TIME too */
                if (!wb_has_dirty_io(wb) &&
                    (base_work->sync_mode == WB_SYNC_NONE ||
@@ -819,12 +824,22 @@ restart:
 
                wb_queue_work(wb, work);
 
-               next_memcg_id = wb->memcg_css->id + 1;
+               /*
+                * Pin @wb so that it stays on @bdi->wb_list.  This allows
+                * continuing iteration from @wb after dropping and
+                * regrabbing rcu read lock.
+                */
+               wb_get(wb);
+               last_wb = wb;
+
                rcu_read_unlock();
                wb_wait_for_completion(bdi, &fallback_work_done);
                goto restart;
        }
        rcu_read_unlock();
+
+       if (last_wb)
+               wb_put(last_wb);
 }
 
 #else  /* CONFIG_CGROUP_WRITEBACK */
@@ -1857,12 +1872,11 @@ void wakeup_flusher_threads(long nr_pages, enum wb_reason reason)
        rcu_read_lock();
        list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
                struct bdi_writeback *wb;
-               struct wb_iter iter;
 
                if (!bdi_has_dirty_io(bdi))
                        continue;
 
-               bdi_for_each_wb(wb, bdi, &iter, 0)
+               list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node)
                        wb_start_writeback(wb, wb_split_bdi_pages(wb, nr_pages),
                                           false, reason);
        }
@@ -1894,11 +1908,10 @@ static void wakeup_dirtytime_writeback(struct work_struct *w)
        rcu_read_lock();
        list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
                struct bdi_writeback *wb;
-               struct wb_iter iter;
 
-               bdi_for_each_wb(wb, bdi, &iter, 0)
-                       if (!list_empty(&bdi->wb.b_dirty_time))
-                               wb_wakeup(&bdi->wb);
+               list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node)
+                       if (!list_empty(&wb->b_dirty_time))
+                               wb_wakeup(wb);
        }
        rcu_read_unlock();
        schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ);
index a23209b43842106c6a74de8a51d6b4b752613157..1b4d69f68c33cc73ad99a1136b2408c71e763fb8 100644 (file)
@@ -116,6 +116,8 @@ struct bdi_writeback {
        struct list_head work_list;
        struct delayed_work dwork;      /* work item used for writeback */
 
+       struct list_head bdi_node;      /* anchored at bdi->wb_list */
+
 #ifdef CONFIG_CGROUP_WRITEBACK
        struct percpu_ref refcnt;       /* used only for !root wb's */
        struct fprop_local_percpu memcg_completions;
@@ -150,6 +152,7 @@ struct backing_dev_info {
        atomic_long_t tot_write_bandwidth;
 
        struct bdi_writeback wb;  /* the root writeback info for this bdi */
+       struct list_head wb_list; /* list of all wbs */
 #ifdef CONFIG_CGROUP_WRITEBACK
        struct radix_tree_root cgwb_tree; /* radix tree of active cgroup wbs */
        struct rb_root cgwb_congested_tree; /* their congested states */
index d5eb4ad1c534a8074b64ee75d4b597563e36e89a..c85f74946a8bab65ff3f16cddea6a4446b0a4799 100644 (file)
 #include <linux/slab.h>
 
 int __must_check bdi_init(struct backing_dev_info *bdi);
-void bdi_destroy(struct backing_dev_info *bdi);
+void bdi_exit(struct backing_dev_info *bdi);
 
 __printf(3, 4)
 int bdi_register(struct backing_dev_info *bdi, struct device *parent,
                const char *fmt, ...);
 int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev);
+void bdi_unregister(struct backing_dev_info *bdi);
+
 int __must_check bdi_setup_and_register(struct backing_dev_info *, char *);
+void bdi_destroy(struct backing_dev_info *bdi);
+
 void wb_start_writeback(struct bdi_writeback *wb, long nr_pages,
                        bool range_cyclic, enum wb_reason reason);
 void wb_start_background_writeback(struct bdi_writeback *wb);
@@ -408,61 +412,6 @@ static inline void unlocked_inode_to_wb_end(struct inode *inode, bool locked)
        rcu_read_unlock();
 }
 
-struct wb_iter {
-       int                     start_memcg_id;
-       struct radix_tree_iter  tree_iter;
-       void                    **slot;
-};
-
-static inline struct bdi_writeback *__wb_iter_next(struct wb_iter *iter,
-                                                  struct backing_dev_info *bdi)
-{
-       struct radix_tree_iter *titer = &iter->tree_iter;
-
-       WARN_ON_ONCE(!rcu_read_lock_held());
-
-       if (iter->start_memcg_id >= 0) {
-               iter->slot = radix_tree_iter_init(titer, iter->start_memcg_id);
-               iter->start_memcg_id = -1;
-       } else {
-               iter->slot = radix_tree_next_slot(iter->slot, titer, 0);
-       }
-
-       if (!iter->slot)
-               iter->slot = radix_tree_next_chunk(&bdi->cgwb_tree, titer, 0);
-       if (iter->slot)
-               return *iter->slot;
-       return NULL;
-}
-
-static inline struct bdi_writeback *__wb_iter_init(struct wb_iter *iter,
-                                                  struct backing_dev_info *bdi,
-                                                  int start_memcg_id)
-{
-       iter->start_memcg_id = start_memcg_id;
-
-       if (start_memcg_id)
-               return __wb_iter_next(iter, bdi);
-       else
-               return &bdi->wb;
-}
-
-/**
- * bdi_for_each_wb - walk all wb's of a bdi in ascending memcg ID order
- * @wb_cur: cursor struct bdi_writeback pointer
- * @bdi: bdi to walk wb's of
- * @iter: pointer to struct wb_iter to be used as iteration buffer
- * @start_memcg_id: memcg ID to start iteration from
- *
- * Iterate @wb_cur through the wb's (bdi_writeback's) of @bdi in ascending
- * memcg ID order starting from @start_memcg_id.  @iter is struct wb_iter
- * to be used as temp storage during iteration.  rcu_read_lock() must be
- * held throughout iteration.
- */
-#define bdi_for_each_wb(wb_cur, bdi, iter, start_memcg_id)             \
-       for ((wb_cur) = __wb_iter_init(iter, bdi, start_memcg_id);      \
-            (wb_cur); (wb_cur) = __wb_iter_next(iter, bdi))
-
 #else  /* CONFIG_CGROUP_WRITEBACK */
 
 static inline bool inode_cgwb_enabled(struct inode *inode)
@@ -522,14 +471,6 @@ static inline void wb_blkcg_offline(struct blkcg *blkcg)
 {
 }
 
-struct wb_iter {
-       int             next_id;
-};
-
-#define bdi_for_each_wb(wb_cur, bdi, iter, start_blkcg_id)             \
-       for ((iter)->next_id = (start_blkcg_id);                        \
-            ({ (wb_cur) = !(iter)->next_id++ ? &(bdi)->wb : NULL; }); )
-
 static inline int inode_congested(struct inode *inode, int cong_bits)
 {
        return wb_congested(&inode_to_bdi(inode)->wb, cong_bits);
index 6452ff4c463fd8715edc9a5043bc812521a18d6a..3e3318ddfc0e3e09a0e15825f78eb6052d628d78 100644 (file)
@@ -676,8 +676,9 @@ enum {
 
 struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg);
 struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb);
-void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pavail,
-                        unsigned long *pdirty, unsigned long *pwriteback);
+void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
+                        unsigned long *pheadroom, unsigned long *pdirty,
+                        unsigned long *pwriteback);
 
 #else  /* CONFIG_CGROUP_WRITEBACK */
 
@@ -687,7 +688,8 @@ static inline struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
 }
 
 static inline void mem_cgroup_wb_stats(struct bdi_writeback *wb,
-                                      unsigned long *pavail,
+                                      unsigned long *pfilepages,
+                                      unsigned long *pheadroom,
                                       unsigned long *pdirty,
                                       unsigned long *pwriteback)
 {
index 2df8ddcb0ca0a7f7a055456de4b46a8c55bbfdf1..619984fc07ec32792349c7fe8aec7c8b56e3d2a3 100644 (file)
@@ -480,6 +480,10 @@ static void cgwb_release_workfn(struct work_struct *work)
                                                release_work);
        struct backing_dev_info *bdi = wb->bdi;
 
+       spin_lock_irq(&cgwb_lock);
+       list_del_rcu(&wb->bdi_node);
+       spin_unlock_irq(&cgwb_lock);
+
        wb_shutdown(wb);
 
        css_put(wb->memcg_css);
@@ -575,6 +579,7 @@ static int cgwb_create(struct backing_dev_info *bdi,
                ret = radix_tree_insert(&bdi->cgwb_tree, memcg_css->id, wb);
                if (!ret) {
                        atomic_inc(&bdi->usage_cnt);
+                       list_add_tail_rcu(&wb->bdi_node, &bdi->wb_list);
                        list_add(&wb->memcg_node, memcg_cgwb_list);
                        list_add(&wb->blkcg_node, blkcg_cgwb_list);
                        css_get(memcg_css);
@@ -676,7 +681,7 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi)
 static void cgwb_bdi_destroy(struct backing_dev_info *bdi)
 {
        struct radix_tree_iter iter;
-       struct bdi_writeback_congested *congested, *congested_n;
+       struct rb_node *rbn;
        void **slot;
 
        WARN_ON(test_bit(WB_registered, &bdi->wb.state));
@@ -686,9 +691,11 @@ static void cgwb_bdi_destroy(struct backing_dev_info *bdi)
        radix_tree_for_each_slot(slot, &bdi->cgwb_tree, &iter, 0)
                cgwb_kill(*slot);
 
-       rbtree_postorder_for_each_entry_safe(congested, congested_n,
-                                       &bdi->cgwb_congested_tree, rb_node) {
-               rb_erase(&congested->rb_node, &bdi->cgwb_congested_tree);
+       while ((rbn = rb_first(&bdi->cgwb_congested_tree))) {
+               struct bdi_writeback_congested *congested =
+                       rb_entry(rbn, struct bdi_writeback_congested, rb_node);
+
+               rb_erase(rbn, &bdi->cgwb_congested_tree);
                congested->bdi = NULL;  /* mark @congested unlinked */
        }
 
@@ -764,15 +771,22 @@ static void cgwb_bdi_destroy(struct backing_dev_info *bdi) { }
 
 int bdi_init(struct backing_dev_info *bdi)
 {
+       int ret;
+
        bdi->dev = NULL;
 
        bdi->min_ratio = 0;
        bdi->max_ratio = 100;
        bdi->max_prop_frac = FPROP_FRAC_BASE;
        INIT_LIST_HEAD(&bdi->bdi_list);
+       INIT_LIST_HEAD(&bdi->wb_list);
        init_waitqueue_head(&bdi->wb_waitq);
 
-       return cgwb_bdi_init(bdi);
+       ret = cgwb_bdi_init(bdi);
+
+       list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list);
+
+       return ret;
 }
 EXPORT_SYMBOL(bdi_init);
 
@@ -823,7 +837,7 @@ static void bdi_remove_from_list(struct backing_dev_info *bdi)
        synchronize_rcu_expedited();
 }
 
-void bdi_destroy(struct backing_dev_info *bdi)
+void bdi_unregister(struct backing_dev_info *bdi)
 {
        /* make sure nobody finds us on the bdi_list anymore */
        bdi_remove_from_list(bdi);
@@ -835,9 +849,19 @@ void bdi_destroy(struct backing_dev_info *bdi)
                device_unregister(bdi->dev);
                bdi->dev = NULL;
        }
+}
 
+void bdi_exit(struct backing_dev_info *bdi)
+{
+       WARN_ON_ONCE(bdi->dev);
        wb_exit(&bdi->wb);
 }
+
+void bdi_destroy(struct backing_dev_info *bdi)
+{
+       bdi_unregister(bdi);
+       bdi_exit(bdi);
+}
 EXPORT_SYMBOL(bdi_destroy);
 
 /*
index d9b5c817dce8e0a99aab1853eafc572e5b70015f..c57c4423c68837d14816c5ff230435e1567e7c20 100644 (file)
@@ -3741,44 +3741,43 @@ struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
 /**
  * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg
  * @wb: bdi_writeback in question
- * @pavail: out parameter for number of available pages
+ * @pfilepages: out parameter for number of file pages
+ * @pheadroom: out parameter for number of allocatable pages according to memcg
  * @pdirty: out parameter for number of dirty pages
  * @pwriteback: out parameter for number of pages under writeback
  *
- * Determine the numbers of available, dirty, and writeback pages in @wb's
- * memcg.  Dirty and writeback are self-explanatory.  Available is a bit
- * more involved.
+ * Determine the numbers of file, headroom, dirty, and writeback pages in
+ * @wb's memcg.  File, dirty and writeback are self-explanatory.  Headroom
+ * is a bit more involved.
  *
- * A memcg's headroom is "min(max, high) - used".  The available memory is
- * calculated as the lowest headroom of itself and the ancestors plus the
- * number of pages already being used for file pages.  Note that this
- * doesn't consider the actual amount of available memory in the system.
- * The caller should further cap *@pavail accordingly.
+ * A memcg's headroom is "min(max, high) - used".  In the hierarchy, the
+ * headroom is calculated as the lowest headroom of itself and the
+ * ancestors.  Note that this doesn't consider the actual amount of
+ * available memory in the system.  The caller should further cap
+ * *@pheadroom accordingly.
  */
-void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pavail,
-                        unsigned long *pdirty, unsigned long *pwriteback)
+void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
+                        unsigned long *pheadroom, unsigned long *pdirty,
+                        unsigned long *pwriteback)
 {
        struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
        struct mem_cgroup *parent;
-       unsigned long head_room = PAGE_COUNTER_MAX;
-       unsigned long file_pages;
 
        *pdirty = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_DIRTY);
 
        /* this should eventually include NR_UNSTABLE_NFS */
        *pwriteback = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_WRITEBACK);
+       *pfilepages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) |
+                                                    (1 << LRU_ACTIVE_FILE));
+       *pheadroom = PAGE_COUNTER_MAX;
 
-       file_pages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) |
-                                                   (1 << LRU_ACTIVE_FILE));
        while ((parent = parent_mem_cgroup(memcg))) {
                unsigned long ceiling = min(memcg->memory.limit, memcg->high);
                unsigned long used = page_counter_read(&memcg->memory);
 
-               head_room = min(head_room, ceiling - min(ceiling, used));
+               *pheadroom = min(*pheadroom, ceiling - min(ceiling, used));
                memcg = parent;
        }
-
-       *pavail = file_pages + head_room;
 }
 
 #else  /* CONFIG_CGROUP_WRITEBACK */
index 0a931cdd4f6baaa96cdfab2e0dd668c0abef8809..2c90357c34ea4c4d81521b7bf14d669d6565cc07 100644 (file)
@@ -145,9 +145,6 @@ struct dirty_throttle_control {
        unsigned long           pos_ratio;
 };
 
-#define DTC_INIT_COMMON(__wb)  .wb = (__wb),                           \
-                               .wb_completions = &(__wb)->completions
-
 /*
  * Length of period for aging writeout fractions of bdis. This is an
  * arbitrarily chosen number. The longer the period, the slower fractions will
@@ -157,12 +154,16 @@ struct dirty_throttle_control {
 
 #ifdef CONFIG_CGROUP_WRITEBACK
 
-#define GDTC_INIT(__wb)                .dom = &global_wb_domain,               \
-                               DTC_INIT_COMMON(__wb)
+#define GDTC_INIT(__wb)                .wb = (__wb),                           \
+                               .dom = &global_wb_domain,               \
+                               .wb_completions = &(__wb)->completions
+
 #define GDTC_INIT_NO_WB                .dom = &global_wb_domain
-#define MDTC_INIT(__wb, __gdtc)        .dom = mem_cgroup_wb_domain(__wb),      \
-                               .gdtc = __gdtc,                         \
-                               DTC_INIT_COMMON(__wb)
+
+#define MDTC_INIT(__wb, __gdtc)        .wb = (__wb),                           \
+                               .dom = mem_cgroup_wb_domain(__wb),      \
+                               .wb_completions = &(__wb)->memcg_completions, \
+                               .gdtc = __gdtc
 
 static bool mdtc_valid(struct dirty_throttle_control *dtc)
 {
@@ -213,7 +214,8 @@ static void wb_min_max_ratio(struct bdi_writeback *wb,
 
 #else  /* CONFIG_CGROUP_WRITEBACK */
 
-#define GDTC_INIT(__wb)                DTC_INIT_COMMON(__wb)
+#define GDTC_INIT(__wb)                .wb = (__wb),                           \
+                               .wb_completions = &(__wb)->completions
 #define GDTC_INIT_NO_WB
 #define MDTC_INIT(__wb, __gdtc)
 
@@ -682,13 +684,19 @@ static unsigned long hard_dirty_limit(struct wb_domain *dom,
        return max(thresh, dom->dirty_limit);
 }
 
-/* memory available to a memcg domain is capped by system-wide clean memory */
-static void mdtc_cap_avail(struct dirty_throttle_control *mdtc)
+/*
+ * Memory which can be further allocated to a memcg domain is capped by
+ * system-wide clean memory excluding the amount being used in the domain.
+ */
+static void mdtc_calc_avail(struct dirty_throttle_control *mdtc,
+                           unsigned long filepages, unsigned long headroom)
 {
        struct dirty_throttle_control *gdtc = mdtc_gdtc(mdtc);
-       unsigned long clean = gdtc->avail - min(gdtc->avail, gdtc->dirty);
+       unsigned long clean = filepages - min(filepages, mdtc->dirty);
+       unsigned long global_clean = gdtc->avail - min(gdtc->avail, gdtc->dirty);
+       unsigned long other_clean = global_clean - min(global_clean, clean);
 
-       mdtc->avail = min(mdtc->avail, clean);
+       mdtc->avail = filepages + min(headroom, other_clean);
 }
 
 /**
@@ -1562,16 +1570,16 @@ static void balance_dirty_pages(struct address_space *mapping,
                }
 
                if (mdtc) {
-                       unsigned long writeback;
+                       unsigned long filepages, headroom, writeback;
 
                        /*
                         * If @wb belongs to !root memcg, repeat the same
                         * basic calculations for the memcg domain.
                         */
-                       mem_cgroup_wb_stats(wb, &mdtc->avail, &mdtc->dirty,
-                                           &writeback);
-                       mdtc_cap_avail(mdtc);
+                       mem_cgroup_wb_stats(wb, &filepages, &headroom,
+                                           &mdtc->dirty, &writeback);
                        mdtc->dirty += writeback;
+                       mdtc_calc_avail(mdtc, filepages, headroom);
 
                        domain_dirty_limits(mdtc);
 
@@ -1893,10 +1901,11 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb)
                return true;
 
        if (mdtc) {
-               unsigned long writeback;
+               unsigned long filepages, headroom, writeback;
 
-               mem_cgroup_wb_stats(wb, &mdtc->avail, &mdtc->dirty, &writeback);
-               mdtc_cap_avail(mdtc);
+               mem_cgroup_wb_stats(wb, &filepages, &headroom, &mdtc->dirty,
+                                   &writeback);
+               mdtc_calc_avail(mdtc, filepages, headroom);
                domain_dirty_limits(mdtc);      /* ditto, ignore writeback */
 
                if (mdtc->dirty > mdtc->bg_thresh)
@@ -1956,7 +1965,6 @@ void laptop_mode_timer_fn(unsigned long data)
        int nr_pages = global_page_state(NR_FILE_DIRTY) +
                global_page_state(NR_UNSTABLE_NFS);
        struct bdi_writeback *wb;
-       struct wb_iter iter;
 
        /*
         * We want to write everything out, not just down to the dirty
@@ -1965,10 +1973,12 @@ void laptop_mode_timer_fn(unsigned long data)
        if (!bdi_has_dirty_io(&q->backing_dev_info))
                return;
 
-       bdi_for_each_wb(wb, &q->backing_dev_info, &iter, 0)
+       rcu_read_lock();
+       list_for_each_entry_rcu(wb, &q->backing_dev_info.wb_list, bdi_node)
                if (wb_has_dirty_io(wb))
                        wb_start_writeback(wb, nr_pages, true,
                                           WB_REASON_LAPTOP_TIMER);
+       rcu_read_unlock();
 }
 
 /*