Merge branch 'slab/next' of git://git.kernel.org/pub/scm/linux/kernel/git/penberg...

author Linus Torvalds <torvalds@linux-foundation.org>

Sun, 13 Apr 2014 20:28:13 +0000 (13:28 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Sun, 13 Apr 2014 20:28:13 +0000 (13:28 -0700)
author Linus Torvalds <torvalds@linux-foundation.org>
Sun, 13 Apr 2014 20:28:13 +0000 (13:28 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Sun, 13 Apr 2014 20:28:13 +0000 (13:28 -0700)
diff --combined include/linux/mm_types.h

index 2b58d192ea2401071c6a44d84286ba956acc939d,84b74080beb79418c7bb59e87106b050cefd2df2..8967e20cbe57aceb991acac4218eb5d638e867f5
--- 1/include/linux/mm_types.h
--- 2/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@@ -124,6 -124,8 +124,8 @@@ struct page 
         union {
                 struct list_head lru;   /* Pageout list, eg. active_list
                                          * protected by zone->lru_lock !
+                                        * Can be used as a generic list
+                                        * by the page owner.
                                          */
                 struct {                /* slub per cpu partial pages */
                         struct page *next;      /* Next partial slab */
@@@ -136,7 -138,6 +138,6 @@@
   #endif
                 };
   
-               struct list_head list;  /* slobs list of pages */
                 struct slab *slab_page; /* slab fields */
                 struct rcu_head rcu_head;       /* Used by SLAB
                                                  * when destroying via RCU
@@@ -342,9 -343,9 +343,9 @@@ struct mm_rss_stat 
   
   struct kioctx_table;
   struct mm_struct {
- -      struct vm_area_struct * mmap;           /* list of VMAs */
+ +      struct vm_area_struct *mmap;            /* list of VMAs */
         struct rb_root mm_rb;
- -      struct vm_area_struct * mmap_cache;     /* last find_vma result */
+ +      u32 vmacache_seqnum;                   /* per-thread vmacache */
   #ifdef CONFIG_MMU
         unsigned long (*get_unmapped_area) (struct file *filp,
                                 unsigned long addr, unsigned long len,
diff --combined include/linux/slab.h

index 3dd389aa91c7cc702c383c33efb20c3a79efc27a,5df89f777a549633c8fb1a33ca9ec8263fbb0ced..307bfbe62387ad5872c16bba320263fe83f1322d
--- 1/include/linux/slab.h
--- 2/include/linux/slab.h
+++ b/include/linux/slab.h
@@@ -115,9 -115,9 +115,9 @@@ int slab_is_available(void)
   struct kmem_cache *kmem_cache_create(const char *, size_t, size_t,
                         unsigned long,
                         void (*)(void *));
- -struct kmem_cache *
- -kmem_cache_create_memcg(struct mem_cgroup *, const char *, size_t, size_t,
- -                      unsigned long, void (*)(void *), struct kmem_cache *);
+ +#ifdef CONFIG_MEMCG_KMEM
+ +void kmem_cache_create_memcg(struct mem_cgroup *, struct kmem_cache *);
+ +#endif
   void kmem_cache_destroy(struct kmem_cache *);
   int kmem_cache_shrink(struct kmem_cache *);
   void kmem_cache_free(struct kmem_cache *, void *);
@@@ -242,6 -242,17 +242,17 @@@ struct kmem_cache 
   #define KMALLOC_MIN_SIZE (1 << KMALLOC_SHIFT_LOW)
   #endif
   
+ /*
+  * This restriction comes from the byte-sized index implementation.
+  * Page size is normally 2^12 bytes and, in this case, if we want to use a
+  * byte-sized index, which can represent 2^8 entries, the size of the object
+  * must be greater than or equal to 2^12 / 2^8 = 2^4 = 16.
+  * If the minimum size of kmalloc is less than 16, we use it as the minimum
+  * object size and give up on using a byte-sized index.
+  */
+ #define SLAB_OBJ_MIN_SIZE      (KMALLOC_MIN_SIZE < 16 ? \
+                                (KMALLOC_MIN_SIZE) : 16)
+ 
   #ifndef CONFIG_SLOB
   extern struct kmem_cache *kmalloc_caches[KMALLOC_SHIFT_HIGH + 1];
   #ifdef CONFIG_ZONE_DMA
@@@ -410,7 -421,7 +421,7 @@@ static __always_inline void *kmalloc_la
    *
    * %GFP_NOWAIT - Allocation will not sleep.
    *
- - * %GFP_THISNODE - Allocate node-local memory only.
+ + * %__GFP_THISNODE - Allocate node-local memory only.
    *
    * %GFP_DMA - Allocation suitable for DMA.
    *   Should only be used for kmalloc() caches. Otherwise, use a
diff --combined mm/slab.c

index 3db4cb06e32eac698fcdb3d531d8c4f910824b8d,f6718197cdd0177c766481b33e0aac0844a7a52f..388cb1ae6fbc4907e6f0c6776b652adb5d055fe7
--- 1/mm/slab.c
--- 2/mm/slab.c
+++ b/mm/slab.c
@@@ -157,6 -157,17 +157,17 @@@
   #define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN
   #endif
   
+ #define FREELIST_BYTE_INDEX (((PAGE_SIZE >> BITS_PER_BYTE) \
+                               <= SLAB_OBJ_MIN_SIZE) ? 1 : 0)
+ 
+ #if FREELIST_BYTE_INDEX
+ typedef unsigned char freelist_idx_t;
+ #else
+ typedef unsigned short freelist_idx_t;
+ #endif
+ 
+ #define SLAB_OBJ_MAX_NUM (1 << sizeof(freelist_idx_t) * BITS_PER_BYTE)
+ 
   /*
    * true if a page was allocated from pfmemalloc reserves for network-based
    * swap
@@@ -277,8 -288,8 +288,8 @@@ static void kmem_cache_node_init(struc
    * OTOH the cpuarrays can contain lots of objects,
    * which could lock up otherwise freeable slabs.
    */
- #define REAPTIMEOUT_CPUC      (2*HZ)
- #define REAPTIMEOUT_LIST3     (4*HZ)
+ #define REAPTIMEOUT_AC                (2*HZ)
+ #define REAPTIMEOUT_NODE      (4*HZ)
   
   #if STATS
   #define       STATS_INC_ACTIVE(x)     ((x)->num_active++)
@@@ -565,9 -576,31 +576,31 @@@ static inline struct array_cache *cpu_c
         return cachep->array[smp_processor_id()];
   }
   
- static size_t slab_mgmt_size(size_t nr_objs, size_t align)
+ static int calculate_nr_objs(size_t slab_size, size_t buffer_size,
+                               size_t idx_size, size_t align)
   {
-       return ALIGN(nr_objs * sizeof(unsigned int), align);
+       int nr_objs;
+       size_t freelist_size;
+ 
+       /*
+        * Ignore padding for the initial guess. The padding
+        * is at most @align-1 bytes, and @buffer_size is at
+        * least @align. In the worst case, this result will
+        * be one greater than the number of objects that fit
+        * into the memory allocation when taking the padding
+        * into account.
+        */
+       nr_objs = slab_size / (buffer_size + idx_size);
+ 
+       /*
+        * This calculated number will be either the right
+        * amount, or one greater than what we want.
+        */
+       freelist_size = slab_size - nr_objs * buffer_size;
+       if (freelist_size < ALIGN(nr_objs * idx_size, align))
+               nr_objs--;
+ 
+       return nr_objs;
   }
   
   /*
@@@ -600,25 -633,9 +633,9 @@@ static void cache_estimate(unsigned lon
                 nr_objs = slab_size / buffer_size;
   
         } else {
-               /*
-                * Ignore padding for the initial guess. The padding
-                * is at most @align-1 bytes, and @buffer_size is at
-                * least @align. In the worst case, this result will
-                * be one greater than the number of objects that fit
-                * into the memory allocation when taking the padding
-                * into account.
-                */
-               nr_objs = (slab_size) / (buffer_size + sizeof(unsigned int));
- 
-               /*
-                * This calculated number will be either the right
-                * amount, or one greater than what we want.
-                */
-               if (slab_mgmt_size(nr_objs, align) + nr_objs*buffer_size
-                      > slab_size)
-                       nr_objs--;
- 
-               mgmt_size = slab_mgmt_size(nr_objs, align);
+               nr_objs = calculate_nr_objs(slab_size, buffer_size,
+                                       sizeof(freelist_idx_t), align);
+               mgmt_size = ALIGN(nr_objs * sizeof(freelist_idx_t), align);
         }
         *num = nr_objs;
         *left_over = slab_size - nr_objs*buffer_size - mgmt_size;
@@@ -1067,7 -1084,7 +1084,7 @@@ static int init_cache_node_node(int nod
   
         list_for_each_entry(cachep, &slab_caches, list) {
                 /*
-                * Set up the size64 kmemlist for cpu before we can
+                * Set up the kmem_cache_node for cpu before we can
                  * begin anything. Make sure some other cpu on this
                  * node has not already allocated this
                  */
@@@ -1076,12 -1093,12 +1093,12 @@@
                         if (!n)
                                 return -ENOMEM;
                         kmem_cache_node_init(n);
-                       n->next_reap = jiffies + REAPTIMEOUT_LIST3 +
-                           ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
+                       n->next_reap = jiffies + REAPTIMEOUT_NODE +
+                           ((unsigned long)cachep) % REAPTIMEOUT_NODE;
   
                         /*
-                        * The l3s don't come and go as CPUs come and
-                        * go.  slab_mutex is sufficient
+                        * The kmem_cache_nodes don't come and go as CPUs
+                        * come and go.  slab_mutex is sufficient
                          * protection here.
                          */
                         cachep->node[node] = n;
@@@ -1406,8 -1423,8 +1423,8 @@@ static void __init set_up_node(struct k
         for_each_online_node(node) {
                 cachep->node[node] = &init_kmem_cache_node[index + node];
                 cachep->node[node]->next_reap = jiffies +
-                   REAPTIMEOUT_LIST3 +
-                   ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
+                   REAPTIMEOUT_NODE +
+                   ((unsigned long)cachep) % REAPTIMEOUT_NODE;
         }
   }
   
@@@ -2010,6 -2027,10 +2027,10 @@@ static size_t calculate_slab_order(stru
                 if (!num)
                         continue;
   
+               /* Can't handle number of objects more than SLAB_OBJ_MAX_NUM */
+               if (num > SLAB_OBJ_MAX_NUM)
+                       break;
+ 
                 if (flags & CFLGS_OFF_SLAB) {
                         /*
                          * Max number of objs-per-slab for caches which
@@@ -2017,7 -2038,7 +2038,7 @@@
                          * looping condition in cache_grow().
                          */
                         offslab_limit = size;
-                       offslab_limit /= sizeof(unsigned int);
+                       offslab_limit /= sizeof(freelist_idx_t);
   
                         if (num > offslab_limit)
                                 break;
@@@ -2103,8 -2124,8 +2124,8 @@@ static int __init_refok setup_cpu_cache
                 }
         }
         cachep->node[numa_mem_id()]->next_reap =
-                       jiffies + REAPTIMEOUT_LIST3 +
-                       ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
+                       jiffies + REAPTIMEOUT_NODE +
+                       ((unsigned long)cachep) % REAPTIMEOUT_NODE;
   
         cpu_cache_get(cachep)->avail = 0;
         cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
@@@ -2243,7 -2264,7 +2264,7 @@@ __kmem_cache_create (struct kmem_cache 
          * it too early on. Always use on-slab management when
          * SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak)
          */
-       if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init &&
+       if ((size >= (PAGE_SIZE >> 5)) && !slab_early_init &&
             !(flags & SLAB_NOLEAKTRACE))
                 /*
                  * Size is large, assume best to place the slab management obj
@@@ -2252,6 -2273,12 +2273,12 @@@
                 flags |= CFLGS_OFF_SLAB;
   
         size = ALIGN(size, cachep->align);
+       /*
+        * We should restrict the number of objects in a slab to implement a
+        * byte-sized index. See the comment at the SLAB_OBJ_MIN_SIZE definition.
+        */
+       if (FREELIST_BYTE_INDEX && size < SLAB_OBJ_MIN_SIZE)
+               size = ALIGN(SLAB_OBJ_MIN_SIZE, cachep->align);
   
         left_over = calculate_slab_order(cachep, size, cachep->align, flags);
   
@@@ -2259,7 -2286,7 +2286,7 @@@
                 return -E2BIG;
   
         freelist_size =
-               ALIGN(cachep->num * sizeof(unsigned int), cachep->align);
+               ALIGN(cachep->num * sizeof(freelist_idx_t), cachep->align);
   
         /*
          * If the slab has been placed off-slab, and we have enough space then
@@@ -2272,7 -2299,7 +2299,7 @@@
   
         if (flags & CFLGS_OFF_SLAB) {
                 /* really off slab. No need for manual alignment */
-               freelist_size = cachep->num * sizeof(unsigned int);
+               freelist_size = cachep->num * sizeof(freelist_idx_t);
   
   #ifdef CONFIG_PAGE_POISONING
                 /* If we're going to use the generic kernel_map_pages()
@@@ -2300,10 -2327,10 +2327,10 @@@
         if (flags & CFLGS_OFF_SLAB) {
                 cachep->freelist_cache = kmalloc_slab(freelist_size, 0u);
                 /*
-                * This is a possibility for one of the malloc_sizes caches.
+                * This is a possibility for one of the kmalloc_{dma,}_caches.
                  * But since we go off slab only for object size greater than
-                * PAGE_SIZE/8, and malloc_sizes gets created in ascending order,
-                * this should not happen at all.
+                * PAGE_SIZE/32, and kmalloc_{dma,}_caches get created
+                * in ascending order, this should not happen at all.
                  * But leave a BUG_ON for some lucky dude.
                  */
                 BUG_ON(ZERO_OR_NULL_PTR(cachep->freelist_cache));
@@@ -2511,14 -2538,17 +2538,17 @@@ int __kmem_cache_shutdown(struct kmem_c
   
   /*
    * Get the memory for a slab management obj.
-  * For a slab cache when the slab descriptor is off-slab, slab descriptors
-  * always come from malloc_sizes caches.  The slab descriptor cannot
-  * come from the same cache which is getting created because,
-  * when we are searching for an appropriate cache for these
-  * descriptors in kmem_cache_create, we search through the malloc_sizes array.
-  * If we are creating a malloc_sizes cache here it would not be visible to
-  * kmem_find_general_cachep till the initialization is complete.
-  * Hence we cannot have freelist_cache same as the original cache.
+  *
+  * For a slab cache when the slab descriptor is off-slab, the
+  * slab descriptor can't come from the same cache which is being created,
+  * because if that were the case, we would be deferring the creation of
+  * the kmalloc_{dma,}_cache of size sizeof(slab descriptor) to this point.
+  * And we eventually call down to __kmem_cache_create(), which
+  * in turn looks up in the kmalloc_{dma,}_caches for the desired-size one.
+  * This is a "chicken-and-egg" problem.
+  *
+  * So the off-slab slab descriptor shall come from the kmalloc_{dma,}_caches,
+  * which are all initialized during kmem_cache_init().
    */
   static void *alloc_slabmgmt(struct kmem_cache *cachep,
                                    struct page *page, int colour_off,
@@@ -2542,9 -2572,15 +2572,15 @@@
         return freelist;
   }
   
- static inline unsigned int *slab_freelist(struct page *page)
+ static inline freelist_idx_t get_free_obj(struct page *page, unsigned char idx)
   {
-       return (unsigned int *)(page->freelist);
+       return ((freelist_idx_t *)page->freelist)[idx];
+ }
+ 
+ static inline void set_free_obj(struct page *page,
+                                       unsigned char idx, freelist_idx_t val)
+ {
+       ((freelist_idx_t *)(page->freelist))[idx] = val;
   }
   
   static void cache_init_objs(struct kmem_cache *cachep,
@@@ -2589,7 -2625,7 +2625,7 @@@
                 if (cachep->ctor)
                         cachep->ctor(objp);
   #endif
-               slab_freelist(page)[i] = i;
+               set_free_obj(page, i, i);
         }
   }
   
@@@ -2608,7 -2644,7 +2644,7 @@@ static void *slab_get_obj(struct kmem_c
   {
         void *objp;
   
-       objp = index_to_obj(cachep, page, slab_freelist(page)[page->active]);
+       objp = index_to_obj(cachep, page, get_free_obj(page, page->active));
         page->active++;
   #if DEBUG
         WARN_ON(page_to_nid(virt_to_page(objp)) != nodeid);
@@@ -2629,7 -2665,7 +2665,7 @@@ static void slab_put_obj(struct kmem_ca
   
         /* Verify double free bug */
         for (i = page->active; i < cachep->num; i++) {
-               if (slab_freelist(page)[i] == objnr) {
+               if (get_free_obj(page, i) == objnr) {
                         printk(KERN_ERR "slab: double free detected in cache "
                                         "'%s', objp %p\n", cachep->name, objp);
                         BUG();
@@@ -2637,7 -2673,7 +2673,7 @@@
         }
   #endif
         page->active--;
-       slab_freelist(page)[page->active] = objnr;
+       set_free_obj(page, page->active, objnr);
   }
   
   /*
@@@ -2886,9 -2922,9 +2922,9 @@@ retry
                 /* move slabp to correct slabp list: */
                 list_del(&page->lru);
                 if (page->active == cachep->num)
-                       list_add(&page->list, &n->slabs_full);
+                       list_add(&page->lru, &n->slabs_full);
                 else
-                       list_add(&page->list, &n->slabs_partial);
+                       list_add(&page->lru, &n->slabs_partial);
         }
   
   must_grow:
@@@ -3027,7 -3063,7 +3063,7 @@@ out
   
   #ifdef CONFIG_NUMA
   /*
- - * Try allocating on another node if PF_SPREAD_SLAB|PF_MEMPOLICY.
+ + * Try allocating on another node if PF_SPREAD_SLAB or a mempolicy is set.
    *
    * If we are in_interrupt, then process context, including cpusets and
    * mempolicy, may not apply and should not be used for allocation policy.
@@@ -3042,7 -3078,7 +3078,7 @@@ static void *alternate_node_alloc(struc
         if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
                 nid_alloc = cpuset_slab_spread_node();
         else if (current->mempolicy)
- -              nid_alloc = slab_node();
+ +              nid_alloc = mempolicy_slab_node();
         if (nid_alloc != nid_here)
                 return ____cache_alloc_node(cachep, flags, nid_alloc);
         return NULL;
@@@ -3073,8 -3109,8 +3109,8 @@@ static void *fallback_alloc(struct kmem
         local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
   
   retry_cpuset:
- -      cpuset_mems_cookie = get_mems_allowed();
- -      zonelist = node_zonelist(slab_node(), flags);
+ +      cpuset_mems_cookie = read_mems_allowed_begin();
+ +      zonelist = node_zonelist(mempolicy_slab_node(), flags);
   
   retry:
         /*
@@@ -3131,7 -3167,7 +3167,7 @@@
                 }
         }
   
- -      if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !obj))
+ +      if (unlikely(!obj && read_mems_allowed_retry(cpuset_mems_cookie)))
                 goto retry_cpuset;
         return obj;
   }
@@@ -3245,11 -3281,11 +3281,11 @@@ slab_alloc_node(struct kmem_cache *cach
         kmemleak_alloc_recursive(ptr, cachep->object_size, 1, cachep->flags,
                                  flags);
   
-       if (likely(ptr))
+       if (likely(ptr)) {
                 kmemcheck_slab_alloc(cachep, flags, ptr, cachep->object_size);
- 
-       if (unlikely((flags & __GFP_ZERO) && ptr))
-               memset(ptr, 0, cachep->object_size);
+               if (unlikely(flags & __GFP_ZERO))
+                       memset(ptr, 0, cachep->object_size);
+       }
   
         return ptr;
   }
@@@ -3259,7 -3295,7 +3295,7 @@@ __do_cache_alloc(struct kmem_cache *cac
   {
         void *objp;
   
- -      if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))) {
+ +      if (current->mempolicy || unlikely(current->flags & PF_SPREAD_SLAB)) {
                 objp = alternate_node_alloc(cache, flags);
                 if (objp)
                         goto out;
@@@ -3310,17 -3346,17 +3346,17 @@@ slab_alloc(struct kmem_cache *cachep, g
                                  flags);
         prefetchw(objp);
   
-       if (likely(objp))
+       if (likely(objp)) {
                 kmemcheck_slab_alloc(cachep, flags, objp, cachep->object_size);
- 
-       if (unlikely((flags & __GFP_ZERO) && objp))
-               memset(objp, 0, cachep->object_size);
+               if (unlikely(flags & __GFP_ZERO))
+                       memset(objp, 0, cachep->object_size);
+       }
   
         return objp;
   }
   
   /*
-  * Caller needs to acquire correct kmem_list's list_lock
+  * Caller needs to acquire correct kmem_cache_node's list_lock
    */
   static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects,
                        int node)
@@@ -3574,11 -3610,6 +3610,6 @@@ static __always_inline void *__do_kmall
         struct kmem_cache *cachep;
         void *ret;
   
-       /* If you want to save a few bytes .text space: replace
-        * __ with kmem_.
-        * Then kmalloc uses the uninlined functions instead of the inline
-        * functions.
-        */
         cachep = kmalloc_slab(size, flags);
         if (unlikely(ZERO_OR_NULL_PTR(cachep)))
                 return cachep;
@@@ -3670,7 -3701,7 +3701,7 @@@ EXPORT_SYMBOL(kfree)
   /*
    * This initializes kmem_cache_node or resizes various caches for all nodes.
    */
- static int alloc_kmemlist(struct kmem_cache *cachep, gfp_t gfp)
+ static int alloc_kmem_cache_node(struct kmem_cache *cachep, gfp_t gfp)
   {
         int node;
         struct kmem_cache_node *n;
@@@ -3726,8 -3757,8 +3757,8 @@@
                 }
   
                 kmem_cache_node_init(n);
-               n->next_reap = jiffies + REAPTIMEOUT_LIST3 +
-                               ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
+               n->next_reap = jiffies + REAPTIMEOUT_NODE +
+                               ((unsigned long)cachep) % REAPTIMEOUT_NODE;
                 n->shared = new_shared;
                 n->alien = new_alien;
                 n->free_limit = (1 + nr_cpus_node(node)) *
@@@ -3813,7 -3844,7 +3844,7 @@@ static int __do_tune_cpucache(struct km
                 kfree(ccold);
         }
         kfree(new);
-       return alloc_kmemlist(cachep, gfp);
+       return alloc_kmem_cache_node(cachep, gfp);
   }
   
   static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
@@@ -3982,7 -4013,7 +4013,7 @@@ static void cache_reap(struct work_stru
                 if (time_after(n->next_reap, jiffies))
                         goto next;
   
-               n->next_reap = jiffies + REAPTIMEOUT_LIST3;
+               n->next_reap = jiffies + REAPTIMEOUT_NODE;
   
                 drain_array(searchp, n, n->shared, 0, node);
   
@@@ -4003,7 -4034,7 +4034,7 @@@ next
         next_reap_node();
   out:
         /* Set up the next iteration */
-       schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_CPUC));
+       schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_AC));
   }
   
   #ifdef CONFIG_SLABINFO
@@@ -4210,7 -4241,7 +4241,7 @@@ static void handle_slab(unsigned long *
   
                 for (j = page->active; j < c->num; j++) {
                         /* Skip freed item */
-                       if (slab_freelist(page)[j] == i) {
+                       if (get_free_obj(page, j) == i) {
                                 active = false;
                                 break;
                         }
diff --combined mm/slub.c

index f620bbf4054aa0c1cc771873a8fe3463856f2fcc,591bf985aed0a502fc494039aae4c698c212d708..5e234f1f8853e952dceefe8c6b92201fcc3853d7
--- 1/mm/slub.c
--- 2/mm/slub.c
+++ b/mm/slub.c
@@@ -224,11 -224,7 +224,11 @@@ static inline void memcg_propagate_slab
   static inline void stat(const struct kmem_cache *s, enum stat_item si)
   {
   #ifdef CONFIG_SLUB_STATS
- -      __this_cpu_inc(s->cpu_slab->stat[si]);
+ +      /*
+ +       * The rmw is racy on a preemptible kernel but this is acceptable, so
+ +       * avoid this_cpu_add()'s irq-disable overhead.
+ +       */
+ +      raw_cpu_inc(s->cpu_slab->stat[si]);
   #endif
   }
   
@@@ -1008,19 -1004,21 +1008,19 @@@ static inline void slab_free_hook(struc
   static void add_full(struct kmem_cache *s,
         struct kmem_cache_node *n, struct page *page)
   {
- -      lockdep_assert_held(&n->list_lock);
- -
         if (!(s->flags & SLAB_STORE_USER))
                 return;
   
+ +      lockdep_assert_held(&n->list_lock);
         list_add(&page->lru, &n->full);
   }
   
   static void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, struct page *page)
   {
- -      lockdep_assert_held(&n->list_lock);
- -
         if (!(s->flags & SLAB_STORE_USER))
                 return;
   
+ +      lockdep_assert_held(&n->list_lock);
         list_del(&page->lru);
   }
   
@@@ -1352,11 -1350,12 +1352,12 @@@ static struct page *allocate_slab(struc
         page = alloc_slab_page(alloc_gfp, node, oo);
         if (unlikely(!page)) {
                 oo = s->min;
+               alloc_gfp = flags;
                 /*
                  * Allocation may have failed due to fragmentation.
                  * Try a lower order alloc if possible
                  */
-               page = alloc_slab_page(flags, node, oo);
+               page = alloc_slab_page(alloc_gfp, node, oo);
   
                 if (page)
                         stat(s, ORDER_FALLBACK);
@@@ -1366,7 -1365,7 +1367,7 @@@
                 && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) {
                 int pages = 1 << oo_order(oo);
   
-               kmemcheck_alloc_shadow(page, oo_order(oo), flags, node);
+               kmemcheck_alloc_shadow(page, oo_order(oo), alloc_gfp, node);
   
                 /*
                  * Objects from caches that have a constructor don't get
@@@ -1522,9 -1521,11 +1523,9 @@@ static void discard_slab(struct kmem_ca
   /*
    * Management of partially allocated slabs.
    */
- -static inline void add_partial(struct kmem_cache_node *n,
- -                              struct page *page, int tail)
+ +static inline void
+ +__add_partial(struct kmem_cache_node *n, struct page *page, int tail)
   {
- -      lockdep_assert_held(&n->list_lock);
- -
         n->nr_partial++;
         if (tail == DEACTIVATE_TO_TAIL)
                 list_add_tail(&page->lru, &n->partial);
@@@ -1532,27 -1533,15 +1533,27 @@@
                 list_add(&page->lru, &n->partial);
   }
   
- -static inline void remove_partial(struct kmem_cache_node *n,
- -                                      struct page *page)
+ +static inline void add_partial(struct kmem_cache_node *n,
+ +                              struct page *page, int tail)
   {
         lockdep_assert_held(&n->list_lock);
+ +      __add_partial(n, page, tail);
+ +}
   
+ +static inline void
+ +__remove_partial(struct kmem_cache_node *n, struct page *page)
+ +{
         list_del(&page->lru);
         n->nr_partial--;
   }
   
+ +static inline void remove_partial(struct kmem_cache_node *n,
+ +                                      struct page *page)
+ +{
+ +      lockdep_assert_held(&n->list_lock);
+ +      __remove_partial(n, page);
+ +}
+ +
   /*
    * Remove slab from the partial list, freeze it and
    * return the pointer to the freelist.
@@@ -1688,8 -1677,8 +1689,8 @@@ static void *get_any_partial(struct kme
                 return NULL;
   
         do {
- -              cpuset_mems_cookie = get_mems_allowed();
- -              zonelist = node_zonelist(slab_node(), flags);
+ +              cpuset_mems_cookie = read_mems_allowed_begin();
+ +              zonelist = node_zonelist(mempolicy_slab_node(), flags);
                 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
                         struct kmem_cache_node *n;
   
@@@ -1700,17 -1689,19 +1701,17 @@@
                                 object = get_partial_node(s, n, c, flags);
                                 if (object) {
                                         /*
- -                                       * Return the object even if
- -                                       * put_mems_allowed indicated that
- -                                       * the cpuset mems_allowed was
- -                                       * updated in parallel. It's a
- -                                       * harmless race between the alloc
- -                                       * and the cpuset update.
+ +                                       * Don't check read_mems_allowed_retry()
+ +                                       * here - if mems_allowed was updated in
+ +                                       * parallel, that was a harmless race
+ +                                       * between allocation and the cpuset
+ +                                       * update
                                          */
- -                                      put_mems_allowed(cpuset_mems_cookie);
                                         return object;
                                 }
                         }
                 }
- -      } while (!put_mems_allowed(cpuset_mems_cookie));
+ +      } while (read_mems_allowed_retry(cpuset_mems_cookie));
   #endif
         return NULL;
   }
@@@ -2916,10 -2907,12 +2917,10 @@@ static void early_kmem_cache_node_alloc
         inc_slabs_node(kmem_cache_node, node, page->objects);
   
         /*
- -       * the lock is for lockdep's sake, not for any actual
- -       * race protection
+ +       * No locks need to be taken here as it has just been
+ +       * initialized and there is no concurrent access.
          */
- -      spin_lock(&n->list_lock);
- -      add_partial(n, page, DEACTIVATE_TO_HEAD);
- -      spin_unlock(&n->list_lock);
+ +      __add_partial(n, page, DEACTIVATE_TO_HEAD);
   }
   
   static void free_kmem_cache_nodes(struct kmem_cache *s)
@@@ -3205,7 -3198,7 +3206,7 @@@ static void free_partial(struct kmem_ca
   
         list_for_each_entry_safe(page, h, &n->partial, lru) {
                 if (!page->inuse) {
- -                      remove_partial(n, page);
+ +                      __remove_partial(n, page);
                         discard_slab(s, page);
                 } else {
                         list_slab_objects(s, page,
@@@ -3241,9 -3234,8 +3242,9 @@@ int __kmem_cache_shutdown(struct kmem_c
   
         if (!rc) {
                 /*
- -               * We do the same lock strategy around sysfs_slab_add, see
- -               * __kmem_cache_create. Because this is pretty much the last
+ +               * Since slab_attr_store may take the slab_mutex, we should
+ +               * release the lock while removing the sysfs entry in order to
+ +               * avoid a deadlock. Because this is pretty much the last
                  * operation we do and the lock will be released shortly after
                  * that in slab_common.c, we could just move sysfs_slab_remove
                  * to a later point in common code. We should do that when we
@@@ -3689,9 -3681,6 +3690,9 @@@ static int slab_unmergeable(struct kmem
         if (slub_nomerge || (s->flags & SLUB_NEVER_MERGE))
                 return 1;
   
+ +      if (!is_root_cache(s))
+ +              return 1;
+ +
         if (s->ctor)
                 return 1;
   
@@@ -3704,8 -3693,9 +3705,8 @@@
         return 0;
   }
   
- -static struct kmem_cache *find_mergeable(struct mem_cgroup *memcg, size_t size,
- -              size_t align, unsigned long flags, const char *name,
- -              void (*ctor)(void *))
+ +static struct kmem_cache *find_mergeable(size_t size, size_t align,
+ +              unsigned long flags, const char *name, void (*ctor)(void *))
   {
         struct kmem_cache *s;
   
@@@ -3728,7 -3718,7 +3729,7 @@@
                         continue;
   
                 if ((flags & SLUB_MERGE_SAME) != (s->flags & SLUB_MERGE_SAME))
- -                              continue;
+ +                      continue;
                 /*
                  * Check if alignment is compatible.
                  * Courtesy of Adrian Drzewiecki
@@@ -3739,24 -3729,23 +3740,24 @@@
                 if (s->size - size >= sizeof(void *))
                         continue;
   
- -              if (!cache_match_memcg(s, memcg))
- -                      continue;
- -
                 return s;
         }
         return NULL;
   }
   
   struct kmem_cache *
- -__kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size,
- -                 size_t align, unsigned long flags, void (*ctor)(void *))
+ +__kmem_cache_alias(const char *name, size_t size, size_t align,
+ +                 unsigned long flags, void (*ctor)(void *))
   {
         struct kmem_cache *s;
   
- -      s = find_mergeable(memcg, size, align, flags, name, ctor);
+ +      s = find_mergeable(size, align, flags, name, ctor);
         if (s) {
+ +              int i;
+ +              struct kmem_cache *c;
+ +
                 s->refcount++;
+ +
                 /*
                  * Adjust the object sizes so that we clear
                  * the complete object on kzalloc.
@@@ -3764,15 -3753,6 +3765,15 @@@
                 s->object_size = max(s->object_size, (int)size);
                 s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *)));
   
+ +              for_each_memcg_cache_index(i) {
+ +                      c = cache_from_memcg_idx(s, i);
+ +                      if (!c)
+ +                              continue;
+ +                      c->object_size = s->object_size;
+ +                      c->inuse = max_t(int, c->inuse,
+ +                                       ALIGN(size, sizeof(void *)));
+ +              }
+ +
                 if (sysfs_slab_alias(s, name)) {
                         s->refcount--;
                         s = NULL;
@@@ -3795,7 -3775,10 +3796,7 @@@ int __kmem_cache_create(struct kmem_cac
                 return 0;
   
         memcg_propagate_slab_attrs(s);
- -      mutex_unlock(&slab_mutex);
         err = sysfs_slab_add(s);
- -      mutex_lock(&slab_mutex);
- -
         if (err)
                 kmem_cache_close(s);
   
@@@ -5142,15 -5125,6 +5143,15 @@@ static const struct kset_uevent_ops sla
   
   static struct kset *slab_kset;
   
+ +static inline struct kset *cache_kset(struct kmem_cache *s)
+ +{
+ +#ifdef CONFIG_MEMCG_KMEM
+ +      if (!is_root_cache(s))
+ +              return s->memcg_params->root_cache->memcg_kset;
+ +#endif
+ +      return slab_kset;
+ +}
+ +
   #define ID_STR_LENGTH 64
   
   /* Create a unique string id for a slab cache:
@@@ -5216,39 -5190,26 +5217,39 @@@ static int sysfs_slab_add(struct kmem_c
                 name = create_unique_id(s);
         }
   
- -      s->kobj.kset = slab_kset;
+ +      s->kobj.kset = cache_kset(s);
         err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, "%s", name);
- -      if (err) {
- -              kobject_put(&s->kobj);
- -              return err;
- -      }
+ +      if (err)
+ +              goto out_put_kobj;
   
         err = sysfs_create_group(&s->kobj, &slab_attr_group);
- -      if (err) {
- -              kobject_del(&s->kobj);
- -              kobject_put(&s->kobj);
- -              return err;
+ +      if (err)
+ +              goto out_del_kobj;
+ +
+ +#ifdef CONFIG_MEMCG_KMEM
+ +      if (is_root_cache(s)) {
+ +              s->memcg_kset = kset_create_and_add("cgroup", NULL, &s->kobj);
+ +              if (!s->memcg_kset) {
+ +                      err = -ENOMEM;
+ +                      goto out_del_kobj;
+ +              }
         }
+ +#endif
+ +
         kobject_uevent(&s->kobj, KOBJ_ADD);
         if (!unmergeable) {
                 /* Setup first alias */
                 sysfs_slab_alias(s, s->name);
- -              kfree(name);
         }
- -      return 0;
+ +out:
+ +      if (!unmergeable)
+ +              kfree(name);
+ +      return err;
+ +out_del_kobj:
+ +      kobject_del(&s->kobj);
+ +out_put_kobj:
+ +      kobject_put(&s->kobj);
+ +      goto out;
   }
   
   static void sysfs_slab_remove(struct kmem_cache *s)
@@@ -5260,9 -5221,6 +5261,9 @@@
                  */
                 return;
   
+ +#ifdef CONFIG_MEMCG_KMEM
+ +      kset_unregister(s->memcg_kset);
+ +#endif
         kobject_uevent(&s->kobj, KOBJ_REMOVE);
         kobject_del(&s->kobj);
         kobject_put(&s->kobj);
author	Linus Torvalds <torvalds@linux-foundation.org>
	Sun, 13 Apr 2014 20:28:13 +0000 (13:28 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Sun, 13 Apr 2014 20:28:13 +0000 (13:28 -0700)
		1	2
include/linux/mm_types.h	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/slab.h	patch \|	diff1 \|	diff2 \|	blob \| history
mm/slab.c	patch \|	diff1 \|	diff2 \|	blob \| history
mm/slub.c	patch \|	diff1 \|	diff2 \|	blob \| history