Move remote node draining out of slab allocators

author Christoph Lameter <clameter@sgi.com>

Wed, 9 May 2007 09:35:14 +0000 (02:35 -0700)

committer Linus Torvalds <torvalds@woody.linux-foundation.org>

Wed, 9 May 2007 19:30:56 +0000 (12:30 -0700)
author Christoph Lameter <clameter@sgi.com>
Wed, 9 May 2007 09:35:14 +0000 (02:35 -0700)
committer Linus Torvalds <torvalds@woody.linux-foundation.org>
Wed, 9 May 2007 19:30:56 +0000 (12:30 -0700)
diff --git a/include/linux/gfp.h b/include/linux/gfp.h

index 97a36c3d96e2cb1d9981f1aa4a9b350138d268a0..0d2ef0b082a626853b7651914094cabf88672434 100644 (file)
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -176,10 +176,6 @@ extern void FASTCALL(free_cold_page(struct page *page));
  #define free_page(addr) free_pages((addr),0)
  
  void page_alloc_init(void);
-#ifdef CONFIG_NUMA
-void drain_node_pages(int node);
-#else
-static inline void drain_node_pages(int node) { };
-#endif
+void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp);
  
  #endif /* __LINUX_GFP_H */
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h

index 2f1544e83042af69d45862f216781de0fe22d9fe..d09b1345a3a14fcf7f5115a2d6efc092e25bd8ed 100644 (file)
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -83,6 +83,9 @@ struct per_cpu_pages {
  
  struct per_cpu_pageset {
         struct per_cpu_pages pcp[2];    /* 0: hot.  1: cold */
+#ifdef CONFIG_NUMA
+       s8 expire;
+#endif
  #ifdef CONFIG_SMP
         s8 stat_threshold;
         s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS];
diff --git a/mm/page_alloc.c b/mm/page_alloc.c

index d53cbf8acb8e1018e36424cdefbeb92dcd548f15..f9b5d6d5f4d6cf7643d1c6f39b1d112e8bd18c33 100644 (file)
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -691,43 +691,26 @@ static void __init setup_nr_node_ids(void) {}
  
  #ifdef CONFIG_NUMA
  /*
- * Called from the slab reaper to drain pagesets on a particular node that
- * belongs to the currently executing processor.
+ * Called from the vmstat counter updater to drain pagesets of this
+ * currently executing processor on remote nodes after they have
+ * expired.
+ *
   * Note that this function must be called with the thread pinned to
   * a single processor.
   */
-void drain_node_pages(int nodeid)
+void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
  {
-       int i;
-       enum zone_type z;
         unsigned long flags;
+       int to_drain;
  
-       for (z = 0; z < MAX_NR_ZONES; z++) {
-               struct zone *zone = NODE_DATA(nodeid)->node_zones + z;
-               struct per_cpu_pageset *pset;
-
-               if (!populated_zone(zone))
-                       continue;
-
-               pset = zone_pcp(zone, smp_processor_id());
-               for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
-                       struct per_cpu_pages *pcp;
-
-                       pcp = &pset->pcp[i];
-                       if (pcp->count) {
-                               int to_drain;
-
-                               local_irq_save(flags);
-                               if (pcp->count >= pcp->batch)
-                                       to_drain = pcp->batch;
-                               else
-                                       to_drain = pcp->count;
-                               free_pages_bulk(zone, to_drain, &pcp->list, 0);
-                               pcp->count -= to_drain;
-                               local_irq_restore(flags);
-                       }
-               }
-       }
+       local_irq_save(flags);
+       if (pcp->count >= pcp->batch)
+               to_drain = pcp->batch;
+       else
+               to_drain = pcp->count;
+       free_pages_bulk(zone, to_drain, &pcp->list, 0);
+       pcp->count -= to_drain;
+       local_irq_restore(flags);
  }
  #endif
  
diff --git a/mm/slab.c b/mm/slab.c

index e50908b2bfac8d33948fdb70422f98030e74f5d9..944b20581f8c421369828fb2090381ad53aa3969 100644 (file)
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -928,12 +928,6 @@ static void next_reap_node(void)
  {
         int node = __get_cpu_var(reap_node);
  
-       /*
-        * Also drain per cpu pages on remote zones
-        */
-       if (node != numa_node_id())
-               drain_node_pages(node);
-
         node = next_node(node, node_online_map);
         if (unlikely(node >= MAX_NUMNODES))
                 node = first_node(node_online_map);
diff --git a/mm/slub.c b/mm/slub.c

index dbb206503a8d37ae75ed67d340de5224cf067479..bd2efae02bcd9cedf262b8dd6e2c79f3fd4b1bec 100644 (file)
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2530,90 +2530,6 @@ static struct notifier_block __cpuinitdata slab_notifier =
  
  #endif
  
-#ifdef CONFIG_NUMA
-
-/*****************************************************************
- * Generic reaper used to support the page allocator
- * (the cpu slabs are reaped by a per slab workqueue).
- *
- * Maybe move this to the page allocator?
- ****************************************************************/
-
-static DEFINE_PER_CPU(unsigned long, reap_node);
-
-static void init_reap_node(int cpu)
-{
-       int node;
-
-       node = next_node(cpu_to_node(cpu), node_online_map);
-       if (node == MAX_NUMNODES)
-               node = first_node(node_online_map);
-
-       __get_cpu_var(reap_node) = node;
-}
-
-static void next_reap_node(void)
-{
-       int node = __get_cpu_var(reap_node);
-
-       /*
-        * Also drain per cpu pages on remote zones
-        */
-       if (node != numa_node_id())
-               drain_node_pages(node);
-
-       node = next_node(node, node_online_map);
-       if (unlikely(node >= MAX_NUMNODES))
-               node = first_node(node_online_map);
-       __get_cpu_var(reap_node) = node;
-}
-#else
-#define init_reap_node(cpu) do { } while (0)
-#define next_reap_node(void) do { } while (0)
-#endif
-
-#define REAPTIMEOUT_CPUC       (2*HZ)
-
-#ifdef CONFIG_SMP
-static DEFINE_PER_CPU(struct delayed_work, reap_work);
-
-static void cache_reap(struct work_struct *unused)
-{
-       next_reap_node();
-       schedule_delayed_work(&__get_cpu_var(reap_work),
-                                     REAPTIMEOUT_CPUC);
-}
-
-static void __devinit start_cpu_timer(int cpu)
-{
-       struct delayed_work *reap_work = &per_cpu(reap_work, cpu);
-
-       /*
-        * When this gets called from do_initcalls via cpucache_init(),
-        * init_workqueues() has already run, so keventd will be setup
-        * at that time.
-        */
-       if (keventd_up() && reap_work->work.func == NULL) {
-               init_reap_node(cpu);
-               INIT_DELAYED_WORK(reap_work, cache_reap);
-               schedule_delayed_work_on(cpu, reap_work, HZ + 3 * cpu);
-       }
-}
-
-static int __init cpucache_init(void)
-{
-       int cpu;
-
-       /*
-        * Register the timers that drain pcp pages and update vm statistics
-        */
-       for_each_online_cpu(cpu)
-               start_cpu_timer(cpu);
-       return 0;
-}
-__initcall(cpucache_init);
-#endif
-
  void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller)
  {
         struct kmem_cache *s = get_slab(size, gfpflags);
diff --git a/mm/vmstat.c b/mm/vmstat.c

index 006eb7621869946141d7e159372d9661939f07a7..9832d9a41d8c721129a29f6b235b248b22de05dd 100644 (file)
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -281,6 +281,17 @@ EXPORT_SYMBOL(dec_zone_page_state);
  
  /*
   * Update the zone counters for one cpu.
+ *
+ * Note that refresh_cpu_vm_stats strives to only access
+ * node local memory. The per cpu pagesets on remote zones are placed
+ * in the memory local to the processor using that pageset. So the
+ * loop over all zones will access a series of cachelines local to
+ * the processor.
+ *
+ * The call to zone_page_state_add updates the cachelines with the
+ * statistics in the remote zone struct as well as the global cachelines
+ * with the global counters. These could cause remote node cache line
+ * bouncing and will have to be only done when necessary.
   */
  void refresh_cpu_vm_stats(int cpu)
  {
@@ -289,21 +300,54 @@ void refresh_cpu_vm_stats(int cpu)
         unsigned long flags;
  
         for_each_zone(zone) {
-               struct per_cpu_pageset *pcp;
+               struct per_cpu_pageset *p;
  
                 if (!populated_zone(zone))
                         continue;
  
-               pcp = zone_pcp(zone, cpu);
+               p = zone_pcp(zone, cpu);
  
                 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
-                       if (pcp->vm_stat_diff[i]) {
+                       if (p->vm_stat_diff[i]) {
                                 local_irq_save(flags);
-                               zone_page_state_add(pcp->vm_stat_diff[i],
+                               zone_page_state_add(p->vm_stat_diff[i],
                                         zone, i);
-                               pcp->vm_stat_diff[i] = 0;
+                               p->vm_stat_diff[i] = 0;
+#ifdef CONFIG_NUMA
+                               /* 3 seconds idle till flush */
+                               p->expire = 3;
+#endif
                                 local_irq_restore(flags);
                         }
+#ifdef CONFIG_NUMA
+               /*
+                * Deal with draining the remote pageset of this
+                * processor
+                *
+                * Check if there are pages remaining in this pageset
+                * if not then there is nothing to expire.
+                */
+               if (!p->expire || (!p->pcp[0].count && !p->pcp[1].count))
+                       continue;
+
+               /*
+                * We never drain zones local to this processor.
+                */
+               if (zone_to_nid(zone) == numa_node_id()) {
+                       p->expire = 0;
+                       continue;
+               }
+
+               p->expire--;
+               if (p->expire)
+                       continue;
+
+               if (p->pcp[0].count)
+                       drain_zone_pages(zone, p->pcp + 0);
+
+               if (p->pcp[1].count)
+                       drain_zone_pages(zone, p->pcp + 1);
+#endif
         }
  }
author	Christoph Lameter <clameter@sgi.com>
	Wed, 9 May 2007 09:35:14 +0000 (02:35 -0700)
committer	Linus Torvalds <torvalds@woody.linux-foundation.org>
	Wed, 9 May 2007 19:30:56 +0000 (12:30 -0700)
include/linux/gfp.h		patch \| blob \| history
include/linux/mmzone.h		patch \| blob \| history
mm/page_alloc.c		patch \| blob \| history
mm/slab.c		patch \| blob \| history
mm/slub.c		patch \| blob \| history
mm/vmstat.c		patch \| blob \| history