mm: memcontrol: fix possible memcg leak due to interrupted reclaim
authorVladimir Davydov <vdavydov@virtuozzo.com>
Tue, 29 Dec 2015 22:54:10 +0000 (14:54 -0800)
committerLinus Torvalds <torvalds@linux-foundation.org>
Wed, 30 Dec 2015 01:45:49 +0000 (17:45 -0800)
Memory cgroup reclaim can be interrupted with mem_cgroup_iter_break()
once enough pages have been reclaimed, in which case, in contrast to a
full round-trip over a cgroup sub-tree, the current position stored in
mem_cgroup_reclaim_iter of the target cgroup does not get invalidated
and so is left holding the reference to the last scanned cgroup.  If the
target cgroup does not get scanned again (we might have just reclaimed
the last page or all processes might exit and free their memory
voluntary), we will leak it, because there is nobody to put the
reference held by the iterator.

The problem is easy to reproduce by running the following command
sequence in a loop:

    mkdir /sys/fs/cgroup/memory/test
    echo 100M > /sys/fs/cgroup/memory/test/memory.limit_in_bytes
    echo $$ > /sys/fs/cgroup/memory/test/cgroup.procs
    memhog 150M
    echo $$ > /sys/fs/cgroup/memory/cgroup.procs
    rmdir test

The cgroups generated by it will never get freed.

This patch fixes this issue by making mem_cgroup_iter avoid taking
reference to the current position.  In order not to hit use-after-free
bug while running reclaim in parallel with cgroup deletion, we make use
of ->css_released cgroup callback to clear references to the dying
cgroup in all reclaim iterators that might refer to it.  This callback
is called right before scheduling rcu work which will free css, so if we
access iter->position from rcu read section, we might be sure it won't
go away under us.

[hannes@cmpxchg.org: clean up css ref handling]
Fixes: 5ac8fb31ad2e ("mm: memcontrol: convert reclaim iterator to simple css refcounting")
Signed-off-by: Vladimir Davydov <vdavydov@virtuozzo.com>
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@kernel.org>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: <stable@vger.kernel.org> [3.19+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
mm/memcontrol.c

index e234c21a5e6cb3d02104da2422548692fda1e881..fc10620967c79d7b8fbbcc0f82fd9804b1562f13 100644 (file)
@@ -903,14 +903,20 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
                if (prev && reclaim->generation != iter->generation)
                        goto out_unlock;
 
-               do {
+               while (1) {
                        pos = READ_ONCE(iter->position);
+                       if (!pos || css_tryget(&pos->css))
+                               break;
                        /*
-                        * A racing update may change the position and
-                        * put the last reference, hence css_tryget(),
-                        * or retry to see the updated position.
+                        * css reference reached zero, so iter->position will
+                        * be cleared by ->css_released. However, we should not
+                        * rely on this happening soon, because ->css_released
+                        * is called from a work queue, and by busy-waiting we
+                        * might block it. So we clear iter->position right
+                        * away.
                         */
-               } while (pos && !css_tryget(&pos->css));
+                       (void)cmpxchg(&iter->position, pos, NULL);
+               }
        }
 
        if (pos)
@@ -956,17 +962,13 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
        }
 
        if (reclaim) {
-               if (cmpxchg(&iter->position, pos, memcg) == pos) {
-                       if (memcg)
-                               css_get(&memcg->css);
-                       if (pos)
-                               css_put(&pos->css);
-               }
-
                /*
-                * pairs with css_tryget when dereferencing iter->position
-                * above.
+                * The position could have already been updated by a competing
+                * thread, so check that the value hasn't changed since we read
+                * it to avoid reclaiming from the same cgroup twice.
                 */
+               (void)cmpxchg(&iter->position, pos, memcg);
+
                if (pos)
                        css_put(&pos->css);
 
@@ -999,6 +1001,28 @@ void mem_cgroup_iter_break(struct mem_cgroup *root,
                css_put(&prev->css);
 }
 
+static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
+{
+       struct mem_cgroup *memcg = dead_memcg;
+       struct mem_cgroup_reclaim_iter *iter;
+       struct mem_cgroup_per_zone *mz;
+       int nid, zid;
+       int i;
+
+       while ((memcg = parent_mem_cgroup(memcg))) {
+               for_each_node(nid) {
+                       for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+                               mz = &memcg->nodeinfo[nid]->zoneinfo[zid];
+                               for (i = 0; i <= DEF_PRIORITY; i++) {
+                                       iter = &mz->iter[i];
+                                       cmpxchg(&iter->position,
+                                               dead_memcg, NULL);
+                               }
+                       }
+               }
+       }
+}
+
 /*
  * Iteration constructs for visiting all cgroups (under a tree).  If
  * loops are exited prematurely (break), mem_cgroup_iter_break() must
@@ -4324,6 +4348,13 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
        wb_memcg_offline(memcg);
 }
 
+static void mem_cgroup_css_released(struct cgroup_subsys_state *css)
+{
+       struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+
+       invalidate_reclaim_iterators(memcg);
+}
+
 static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
 {
        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
@@ -5185,6 +5216,7 @@ struct cgroup_subsys memory_cgrp_subsys = {
        .css_alloc = mem_cgroup_css_alloc,
        .css_online = mem_cgroup_css_online,
        .css_offline = mem_cgroup_css_offline,
+       .css_released = mem_cgroup_css_released,
        .css_free = mem_cgroup_css_free,
        .css_reset = mem_cgroup_css_reset,
        .can_attach = mem_cgroup_can_attach,