Merge branch 'acpi-ec'
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 7680fc2750361a91fc8f749a006555899aa92677..48d640ca1a05b8c0f83fe2b217b925a6dec69fa4 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -156,6 +156,10 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
 static void invoke_rcu_core(void);
 static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
 
+/* rcuc/rcub kthread realtime priority */
+static int kthread_prio = CONFIG_RCU_KTHREAD_PRIO;
+module_param(kthread_prio, int, 0644);
+
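Usage sketch (illustrative, not part of this patch): because this file is
built into the kernel, the parameter appears under the "rcutree." prefix,
so it can be set on the boot command line, e.g. "rcutree.kthread_prio=50",
and its current value can be read back from
/sys/module/rcutree/parameters/kthread_prio.
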
 /*
  * Track the rcutorture test sequence number and the update version
  * number within a given test.  The rcutorture_testseq is incremented
@@ -215,6 +219,9 @@ static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
 #endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
 };
 
+DEFINE_PER_CPU_SHARED_ALIGNED(unsigned long, rcu_qs_ctr);
+EXPORT_PER_CPU_SYMBOL_GPL(rcu_qs_ctr);
+
 /*
  * Let the RCU core know that this CPU has gone through the scheduler,
  * which is a quiescent state.  This is called when the need for a
@@ -284,6 +291,22 @@ void rcu_note_context_switch(void)
 }
 EXPORT_SYMBOL_GPL(rcu_note_context_switch);
 
+/*
+ * Register a quiescent state for all RCU flavors.  If there is an
+ * emergency, invoke rcu_momentary_dyntick_idle() to do a heavy-weight
+ * dyntick-idle quiescent state visible to other CPUs (but only for those
+ * RCU flavors in desperate need of a quiescent state, which will normally
+ * be none of them).  Either way, do a lightweight quiescent state for
+ * all RCU flavors.
+ */
+void rcu_all_qs(void)
+{
+       if (unlikely(raw_cpu_read(rcu_sched_qs_mask)))
+               rcu_momentary_dyntick_idle();
+       this_cpu_inc(rcu_qs_ctr);
+}
+EXPORT_SYMBOL_GPL(rcu_all_qs);
+
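An illustrative caller (hypothetical, in the spirit of
cond_resched_rcu_qs()): a long-running, CPU-bound kernel loop can use
rcu_all_qs() to report quiescent states without blocking; struct item
and process_item() are made up for the sketch.

	static void long_running_scan(struct item *items, int n)
	{
		int i;

		for (i = 0; i < n; i++) {
			process_item(&items[i]);  /* hypothetical unit of work */
			if (!(i & 1023))
				rcu_all_qs();  /* lightweight quiescent state */
		}
	}
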
 static long blimit = 10;       /* Maximum callbacks per rcu_do_batch. */
 static long qhimark = 10000;   /* If this many pending, ignore blimit. */
 static long qlowmark = 100;    /* Once only this many pending, use blimit. */
@@ -315,18 +338,54 @@ static void force_quiescent_state(struct rcu_state *rsp);
 static int rcu_pending(void);
 
 /*
- * Return the number of RCU-sched batches processed thus far for debug & stats.
+ * Return the number of RCU batches started thus far for debug & stats.
+ */
+unsigned long rcu_batches_started(void)
+{
+       return rcu_state_p->gpnum;
+}
+EXPORT_SYMBOL_GPL(rcu_batches_started);
+
+/*
+ * Return the number of RCU-sched batches started thus far for debug & stats.
+ */
+unsigned long rcu_batches_started_sched(void)
+{
+       return rcu_sched_state.gpnum;
+}
+EXPORT_SYMBOL_GPL(rcu_batches_started_sched);
+
+/*
+ * Return the number of RCU BH batches started thus far for debug & stats.
  */
-long rcu_batches_completed_sched(void)
+unsigned long rcu_batches_started_bh(void)
+{
+       return rcu_bh_state.gpnum;
+}
+EXPORT_SYMBOL_GPL(rcu_batches_started_bh);
+
+/*
+ * Return the number of RCU batches completed thus far for debug & stats.
+ */
+unsigned long rcu_batches_completed(void)
+{
+       return rcu_state_p->completed;
+}
+EXPORT_SYMBOL_GPL(rcu_batches_completed);
+
+/*
+ * Return the number of RCU-sched batches completed thus far for debug & stats.
+ */
+unsigned long rcu_batches_completed_sched(void)
 {
        return rcu_sched_state.completed;
 }
 EXPORT_SYMBOL_GPL(rcu_batches_completed_sched);
 
 /*
- * Return the number of RCU BH batches processed thus far for debug & stats.
+ * Return the number of RCU BH batches completed thus far for debug & stats.
  */
-long rcu_batches_completed_bh(void)
+unsigned long rcu_batches_completed_bh(void)
 {
        return rcu_bh_state.completed;
 }
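
Illustrative use of these accessors (hypothetical helpers): snapshot the
started-batches counter, then later test whether every grace period that
had begun by snapshot time has since completed.  ULONG_CMP_GE() is the
wrap-safe unsigned comparison from include/linux/rcupdate.h.

	static unsigned long gp_snap;  /* hypothetical snapshot variable */

	static void take_gp_snapshot(void)
	{
		gp_snap = rcu_batches_started();
	}

	static bool snapshotted_gps_done(void)
	{
		/* ->completed has caught up with the snapshotted ->gpnum. */
		return ULONG_CMP_GE(rcu_batches_completed(), gp_snap);
	}
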
@@ -759,39 +818,71 @@ void rcu_irq_enter(void)
 /**
  * rcu_nmi_enter - inform RCU of entry to NMI context
  *
- * If the CPU was idle with dynamic ticks active, and there is no
- * irq handler running, this updates rdtp->dynticks_nmi to let the
- * RCU grace-period handling know that the CPU is active.
+ * If the CPU was idle from RCU's viewpoint, update rdtp->dynticks and
+ * rdtp->dynticks_nmi_nesting to let the RCU grace-period handling know
+ * that the CPU is active.  This implementation permits nested NMIs, as
+ * long as the nesting level does not overflow an int.  (You will probably
+ * run out of stack space first.)
  */
 void rcu_nmi_enter(void)
 {
        struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
+       int incby = 2;
 
-       if (rdtp->dynticks_nmi_nesting == 0 &&
-           (atomic_read(&rdtp->dynticks) & 0x1))
-               return;
-       rdtp->dynticks_nmi_nesting++;
-       smp_mb__before_atomic();  /* Force delay from prior write. */
-       atomic_inc(&rdtp->dynticks);
-       /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
-       smp_mb__after_atomic();  /* See above. */
-       WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
+       /* Complain about underflow. */
+       WARN_ON_ONCE(rdtp->dynticks_nmi_nesting < 0);
+
+       /*
+        * If idle from RCU viewpoint, atomically increment ->dynticks
+        * to mark non-idle and increment ->dynticks_nmi_nesting by one.
+        * Otherwise, increment ->dynticks_nmi_nesting by two.  This means
+        * if ->dynticks_nmi_nesting is equal to one, we are guaranteed
+        * to be in the outermost NMI handler that interrupted an RCU-idle
+        * period (observation due to Andy Lutomirski).
+        */
+       if (!(atomic_read(&rdtp->dynticks) & 0x1)) {
+               smp_mb__before_atomic();  /* Force delay from prior write. */
+               atomic_inc(&rdtp->dynticks);
+               /* atomic_inc() before later RCU read-side crit sects */
+               smp_mb__after_atomic();  /* See above. */
+               WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
+               incby = 1;
+       }
+       rdtp->dynticks_nmi_nesting += incby;
+       barrier();
 }
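
A worked trace of the increment-by-two trick above (illustrative):

	/*
	 * CPU is RCU-idle:  ->dynticks even, ->dynticks_nmi_nesting == 0.
	 * NMI #1 arrives:   ->dynticks atomically incremented (now odd, so
	 *                   the CPU is marked non-idle), nesting += 1 -> 1.
	 * Nested NMI:       CPU already non-idle, nesting += 2 -> 3.
	 * Nested NMI exits: nesting -= 2 -> 1.
	 * NMI #1 exits:     nesting == 1 identifies the outermost handler
	 *                   that interrupted an RCU-idle period, so
	 *                   rcu_nmi_exit() zeroes the nesting count and
	 *                   increments ->dynticks back to an even value.
	 */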
 
 /**
  * rcu_nmi_exit - inform RCU of exit from NMI context
  *
- * If the CPU was idle with dynamic ticks active, and there is no
- * irq handler running, this updates rdtp->dynticks_nmi to let the
- * RCU grace-period handling know that the CPU is no longer active.
+ * If we are returning from the outermost NMI handler that interrupted an
+ * RCU-idle period, update rdtp->dynticks and rdtp->dynticks_nmi_nesting
+ * to let the RCU grace-period handling know that the CPU is back to
+ * being RCU-idle.
  */
 void rcu_nmi_exit(void)
 {
        struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
 
-       if (rdtp->dynticks_nmi_nesting == 0 ||
-           --rdtp->dynticks_nmi_nesting != 0)
+       /*
+        * Check for ->dynticks_nmi_nesting underflow and bad ->dynticks.
+        * (We are exiting an NMI handler, so RCU better be paying attention
+        * to us!)
+        */
+       WARN_ON_ONCE(rdtp->dynticks_nmi_nesting <= 0);
+       WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
+
+       /*
+        * If the nesting level is not 1, the CPU wasn't RCU-idle, so
+        * leave it in non-RCU-idle state.
+        */
+       if (rdtp->dynticks_nmi_nesting != 1) {
+               rdtp->dynticks_nmi_nesting -= 2;
                return;
+       }
+
+       /* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */
+       rdtp->dynticks_nmi_nesting = 0;
        /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
        smp_mb__before_atomic();  /* See above. */
        atomic_inc(&rdtp->dynticks);
@@ -898,16 +989,13 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp,
                trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti"));
                return 1;
        } else {
+               if (ULONG_CMP_LT(ACCESS_ONCE(rdp->gpnum) + ULONG_MAX / 4,
+                                rdp->mynode->gpnum))
+                       ACCESS_ONCE(rdp->gpwrap) = true;
                return 0;
        }
 }
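
A worked example of the wrap check above (illustrative, 64-bit):
ULONG_MAX / 4 is 2^62, and ULONG_CMP_LT(a, b) expands to
ULONG_MAX / 2 < (a) - (b), a wrap-safe modular comparison from
include/linux/rcupdate.h.  The test therefore fires when this CPU's
rdp->gpnum has fallen more than 2^62 grace periods behind its rcu_node's
->gpnum, at which point ordinary counter comparisons can no longer be
trusted, and ->gpwrap forces __note_gp_changes() to resynchronize this
CPU's view of the counters.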
 
-/*
- * This function really isn't for public consumption, but RCU is special in
- * that context switches can allow the state machine to make progress.
- */
-extern void resched_cpu(int cpu);
-
 /*
  * Return true if the specified CPU has passed through a quiescent
 * state by virtue of being in or having passed through a dynticks
@@ -1011,6 +1099,22 @@ static void record_gp_stall_check_time(struct rcu_state *rsp)
        j1 = rcu_jiffies_till_stall_check();
        ACCESS_ONCE(rsp->jiffies_stall) = j + j1;
        rsp->jiffies_resched = j + j1 / 2;
+       rsp->n_force_qs_gpstart = ACCESS_ONCE(rsp->n_force_qs);
+}
+
+/*
+ * Complain about starvation of grace-period kthread.
+ */
+static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp)
+{
+       unsigned long gpa;
+       unsigned long j;
+
+       j = jiffies;
+       gpa = ACCESS_ONCE(rsp->gp_activity);
+       if (j - gpa > 2 * HZ)
+               pr_err("%s kthread starved for %ld jiffies!\n",
+                      rsp->name, j - gpa);
 }
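
Illustrative output (hypothetical numbers): with HZ=1000 and an rcu_sched
grace-period kthread that last ran 21 seconds ago, the check above would
print:

	rcu_sched kthread starved for 21000 jiffies!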
 
 /*
@@ -1033,11 +1137,13 @@ static void rcu_dump_cpu_stacks(struct rcu_state *rsp)
        }
 }
 
-static void print_other_cpu_stall(struct rcu_state *rsp)
+static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)
 {
        int cpu;
        long delta;
        unsigned long flags;
+       unsigned long gpa;
+       unsigned long j;
        int ndetected = 0;
        struct rcu_node *rnp = rcu_get_root(rsp);
        long totqlen = 0;
@@ -1075,30 +1181,34 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
                raw_spin_unlock_irqrestore(&rnp->lock, flags);
        }
 
-       /*
-        * Now rat on any tasks that got kicked up to the root rcu_node
-        * due to CPU offlining.
-        */
-       rnp = rcu_get_root(rsp);
-       raw_spin_lock_irqsave(&rnp->lock, flags);
-       ndetected += rcu_print_task_stall(rnp);
-       raw_spin_unlock_irqrestore(&rnp->lock, flags);
-
        print_cpu_stall_info_end();
        for_each_possible_cpu(cpu)
                totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen;
        pr_cont("(detected by %d, t=%ld jiffies, g=%ld, c=%ld, q=%lu)\n",
               smp_processor_id(), (long)(jiffies - rsp->gp_start),
               (long)rsp->gpnum, (long)rsp->completed, totqlen);
-       if (ndetected == 0)
-               pr_err("INFO: Stall ended before state dump start\n");
-       else
+       if (ndetected) {
                rcu_dump_cpu_stacks(rsp);
+       } else {
+               if (ACCESS_ONCE(rsp->gpnum) != gpnum ||
+                   ACCESS_ONCE(rsp->completed) == gpnum) {
+                       pr_err("INFO: Stall ended before state dump start\n");
+               } else {
+                       j = jiffies;
+                       gpa = ACCESS_ONCE(rsp->gp_activity);
+                       pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld\n",
+                              rsp->name, j - gpa, j, gpa,
+                              jiffies_till_next_fqs);
+                       /* In this case, the current CPU might be at fault. */
+                       sched_show_task(current);
+               }
+       }
 
        /* Complain about tasks blocking the grace period. */
-
        rcu_print_detail_task_stall(rsp);
 
+       rcu_check_gp_kthread_starvation(rsp);
+
        force_quiescent_state(rsp);  /* Kick them all. */
 }
 
@@ -1123,6 +1233,9 @@ static void print_cpu_stall(struct rcu_state *rsp)
        pr_cont(" (t=%lu jiffies g=%ld c=%ld q=%lu)\n",
                jiffies - rsp->gp_start,
                (long)rsp->gpnum, (long)rsp->completed, totqlen);
+
+       rcu_check_gp_kthread_starvation(rsp);
+
        rcu_dump_cpu_stacks(rsp);
 
        raw_spin_lock_irqsave(&rnp->lock, flags);
@@ -1193,7 +1306,7 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
                   ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY)) {
 
                /* They had a few time units to dump stack, so complain. */
-               print_other_cpu_stall(rsp);
+               print_other_cpu_stall(rsp, gpnum);
        }
 }
 
@@ -1530,7 +1643,8 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
        bool ret;
 
        /* Handle the ends of any preceding grace periods first. */
-       if (rdp->completed == rnp->completed) {
+       if (rdp->completed == rnp->completed &&
+           !unlikely(ACCESS_ONCE(rdp->gpwrap))) {
 
                /* No grace period end, so just accelerate recent callbacks. */
                ret = rcu_accelerate_cbs(rsp, rnp, rdp);
@@ -1545,7 +1659,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
                trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuend"));
        }
 
-       if (rdp->gpnum != rnp->gpnum) {
+       if (rdp->gpnum != rnp->gpnum || unlikely(ACCESS_ONCE(rdp->gpwrap))) {
                /*
                 * If the current grace period is waiting for this CPU,
                 * set up to detect a quiescent state, otherwise don't
@@ -1554,8 +1668,10 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
                rdp->gpnum = rnp->gpnum;
                trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart"));
                rdp->passed_quiesce = 0;
+               rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr);
                rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask);
                zero_cpu_stall_ticks(rdp);
+               ACCESS_ONCE(rdp->gpwrap) = false;
        }
        return ret;
 }
@@ -1569,7 +1685,8 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp)
        local_irq_save(flags);
        rnp = rdp->mynode;
        if ((rdp->gpnum == ACCESS_ONCE(rnp->gpnum) &&
-            rdp->completed == ACCESS_ONCE(rnp->completed)) || /* w/out lock. */
+            rdp->completed == ACCESS_ONCE(rnp->completed) &&
+            !unlikely(ACCESS_ONCE(rdp->gpwrap))) || /* w/out lock. */
            !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */
                local_irq_restore(flags);
                return;
@@ -1589,6 +1706,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
        struct rcu_data *rdp;
        struct rcu_node *rnp = rcu_get_root(rsp);
 
+       ACCESS_ONCE(rsp->gp_activity) = jiffies;
        rcu_bind_gp_kthread();
        raw_spin_lock_irq(&rnp->lock);
        smp_mb__after_unlock_lock();
@@ -1649,6 +1767,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
                                            rnp->grphi, rnp->qsmask);
                raw_spin_unlock_irq(&rnp->lock);
                cond_resched_rcu_qs();
+               ACCESS_ONCE(rsp->gp_activity) = jiffies;
        }
 
        mutex_unlock(&rsp->onoff_mutex);
@@ -1665,6 +1784,7 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
        unsigned long maxj;
        struct rcu_node *rnp = rcu_get_root(rsp);
 
+       ACCESS_ONCE(rsp->gp_activity) = jiffies;
        rsp->n_force_qs++;
        if (fqs_state == RCU_SAVE_DYNTICK) {
                /* Collect dyntick-idle snapshots. */
@@ -1703,6 +1823,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
        struct rcu_data *rdp;
        struct rcu_node *rnp = rcu_get_root(rsp);
 
+       ACCESS_ONCE(rsp->gp_activity) = jiffies;
        raw_spin_lock_irq(&rnp->lock);
        smp_mb__after_unlock_lock();
        gp_duration = jiffies - rsp->gp_start;
@@ -1739,6 +1860,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
                nocb += rcu_future_gp_cleanup(rsp, rnp);
                raw_spin_unlock_irq(&rnp->lock);
                cond_resched_rcu_qs();
+               ACCESS_ONCE(rsp->gp_activity) = jiffies;
        }
        rnp = rcu_get_root(rsp);
        raw_spin_lock_irq(&rnp->lock);
@@ -1788,6 +1910,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
                        if (rcu_gp_init(rsp))
                                break;
                        cond_resched_rcu_qs();
+                       ACCESS_ONCE(rsp->gp_activity) = jiffies;
                        WARN_ON(signal_pending(current));
                        trace_rcu_grace_period(rsp->name,
                                               ACCESS_ONCE(rsp->gpnum),
@@ -1831,9 +1954,11 @@ static int __noreturn rcu_gp_kthread(void *arg)
                                                       ACCESS_ONCE(rsp->gpnum),
                                                       TPS("fqsend"));
                                cond_resched_rcu_qs();
+                               ACCESS_ONCE(rsp->gp_activity) = jiffies;
                        } else {
                                /* Deal with stray signal. */
                                cond_resched_rcu_qs();
+                               ACCESS_ONCE(rsp->gp_activity) = jiffies;
                                WARN_ON(signal_pending(current));
                                trace_rcu_grace_period(rsp->name,
                                                       ACCESS_ONCE(rsp->gpnum),
@@ -2010,8 +2135,10 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
        rnp = rdp->mynode;
        raw_spin_lock_irqsave(&rnp->lock, flags);
        smp_mb__after_unlock_lock();
-       if (rdp->passed_quiesce == 0 || rdp->gpnum != rnp->gpnum ||
-           rnp->completed == rnp->gpnum) {
+       if ((rdp->passed_quiesce == 0 &&
+            rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) ||
+           rdp->gpnum != rnp->gpnum || rnp->completed == rnp->gpnum ||
+           rdp->gpwrap) {
 
                /*
                 * The grace period in which this quiescent state was
@@ -2020,6 +2147,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
                 * within the current grace period.
                 */
                rdp->passed_quiesce = 0;        /* need qs for new gp. */
+               rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr);
                raw_spin_unlock_irqrestore(&rnp->lock, flags);
                return;
        }
@@ -2064,7 +2192,8 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
         * Was there a quiescent state since the beginning of the grace
         * period? If no, then exit and wait for the next call.
         */
-       if (!rdp->passed_quiesce)
+       if (!rdp->passed_quiesce &&
+           rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr))
                return;
 
        /*
@@ -2194,6 +2323,46 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
                               TPS("cpuofl"));
 }
 
+/*
+ * All CPUs for the specified rcu_node structure have gone offline,
+ * and all tasks that were preempted within an RCU read-side critical
+ * section while running on one of those CPUs have since exited their RCU
+ * read-side critical section.  Some other CPU is reporting this fact with
+ * the specified rcu_node structure's ->lock held and interrupts disabled.
+ * This function therefore goes up the tree of rcu_node structures,
+ * clearing the corresponding bits in the ->qsmaskinit fields.  Note that
+ * the leaf rcu_node structure's ->qsmaskinit field has already been
+ * updated.
+ *
+ * This function does check that the specified rcu_node structure has
+ * all CPUs offline and no blocked tasks, so it is OK to invoke it
+ * prematurely.  That said, invoking it after the fact will cost you
+ * a needless lock acquisition.  So once it has done its work, don't
+ * invoke it again.
+ */
+static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
+{
+       unsigned long mask;
+       struct rcu_node *rnp = rnp_leaf;
+
+       if (rnp->qsmaskinit || rcu_preempt_has_tasks(rnp))
+               return;
+       for (;;) {
+               mask = rnp->grpmask;
+               rnp = rnp->parent;
+               if (!rnp)
+                       break;
+               raw_spin_lock(&rnp->lock); /* irqs already disabled. */
+               smp_mb__after_unlock_lock(); /* GP memory ordering. */
+               rnp->qsmaskinit &= ~mask;
+               if (rnp->qsmaskinit) {
+                       raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
+                       return;
+               }
+               raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
+       }
+}
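
A worked example (illustrative): consider a two-level tree in which leaf
rnp L covers CPUs 0-15 and occupies bit 0 (L's ->grpmask) of the root's
->qsmaskinit.  When the last CPU covered by L goes offline and no tasks
are blocked on L, the caller clears L's last ->qsmaskinit bit and invokes
this function, which clears bit 0 in the root's ->qsmaskinit.  If other
leaves still have online CPUs, the root's mask remains nonzero and the
loop stops there; otherwise it continues toward the root's (nonexistent)
parent and breaks out at !rnp.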
+
 /*
  * The CPU has been completely removed, and some other CPU is reporting
  * this fact from process context.  Do the remainder of the cleanup,
@@ -2204,8 +2373,6 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
 static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
 {
        unsigned long flags;
-       unsigned long mask;
-       int need_report = 0;
        struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
        struct rcu_node *rnp = rdp->mynode;  /* Outgoing CPU's rdp & rnp. */
 
@@ -2219,40 +2386,15 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
        /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */
        rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp);
        rcu_adopt_orphan_cbs(rsp, flags);
+       raw_spin_unlock_irqrestore(&rsp->orphan_lock, flags);
 
-       /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */
-       mask = rdp->grpmask;    /* rnp->grplo is constant. */
-       do {
-               raw_spin_lock(&rnp->lock);      /* irqs already disabled. */
-               smp_mb__after_unlock_lock();
-               rnp->qsmaskinit &= ~mask;
-               if (rnp->qsmaskinit != 0) {
-                       if (rnp != rdp->mynode)
-                               raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
-                       break;
-               }
-               if (rnp == rdp->mynode)
-                       need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp);
-               else
-                       raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
-               mask = rnp->grpmask;
-               rnp = rnp->parent;
-       } while (rnp != NULL);
-
-       /*
-        * We still hold the leaf rcu_node structure lock here, and
-        * irqs are still disabled.  The reason for this subterfuge is
-        * because invoking rcu_report_unblock_qs_rnp() with ->orphan_lock
-        * held leads to deadlock.
-        */
-       raw_spin_unlock(&rsp->orphan_lock); /* irqs remain disabled. */
-       rnp = rdp->mynode;
-       if (need_report & RCU_OFL_TASKS_NORM_GP)
-               rcu_report_unblock_qs_rnp(rnp, flags);
-       else
-               raw_spin_unlock_irqrestore(&rnp->lock, flags);
-       if (need_report & RCU_OFL_TASKS_EXP_GP)
-               rcu_report_exp_rnp(rsp, rnp, true);
+       /* Remove outgoing CPU from mask in the leaf rcu_node structure. */
+       raw_spin_lock_irqsave(&rnp->lock, flags);
+       smp_mb__after_unlock_lock();    /* Enforce GP memory-order guarantee. */
+       rnp->qsmaskinit &= ~rdp->grpmask;
+       if (rnp->qsmaskinit == 0 && !rcu_preempt_has_tasks(rnp))
+               rcu_cleanup_dead_rnp(rnp);
+       rcu_report_qs_rnp(rdp->grpmask, rsp, rnp, flags); /* Rlses rnp->lock. */
        WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL,
                  "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n",
                  cpu, rdp->qlen, rdp->nxtlist);
@@ -2268,6 +2410,10 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
 {
 }
 
+static void __maybe_unused rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
+{
+}
+
 static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
 {
 }
@@ -2464,12 +2610,6 @@ static void force_qs_rnp(struct rcu_state *rsp,
                }
                raw_spin_unlock_irqrestore(&rnp->lock, flags);
        }
-       rnp = rcu_get_root(rsp);
-       if (rnp->qsmask == 0) {
-               raw_spin_lock_irqsave(&rnp->lock, flags);
-               smp_mb__after_unlock_lock();
-               rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */
-       }
 }
 
 /*
@@ -2569,7 +2709,7 @@ static void rcu_process_callbacks(struct softirq_action *unused)
  * Schedule RCU callback invocation.  If the specified type of RCU
  * does not support RCU priority boosting, just do a direct call,
  * otherwise wake up the per-CPU kernel kthread.  Note that because we
- * are running on the current CPU with interrupts disabled, the
+ * are running on the current CPU with softirqs disabled, the
  * rcu_cpu_kthread_task cannot disappear out from under us.
  */
 static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
@@ -3109,9 +3249,12 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
 
        /* Is the RCU core waiting for a quiescent state from this CPU? */
        if (rcu_scheduler_fully_active &&
-           rdp->qs_pending && !rdp->passed_quiesce) {
+           rdp->qs_pending && !rdp->passed_quiesce &&
+           rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) {
                rdp->n_rp_qs_pending++;
-       } else if (rdp->qs_pending && rdp->passed_quiesce) {
+       } else if (rdp->qs_pending &&
+                  (rdp->passed_quiesce ||
+                   rdp->rcu_qs_ctr_snap != __this_cpu_read(rcu_qs_ctr))) {
                rdp->n_rp_report_qs++;
                return 1;
        }
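
A note on the test above (illustrative): rcu_qs_ctr is incremented by
rcu_all_qs(), so a counter that has moved past the grace-period-start
snapshot (->rcu_qs_ctr_snap) counts as a quiescent state even when
->passed_quiesce was never set along the scheduler path.
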
@@ -3135,7 +3278,8 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
        }
 
        /* Has a new RCU grace period started? */
-       if (ACCESS_ONCE(rnp->gpnum) != rdp->gpnum) { /* outside lock */
+       if (ACCESS_ONCE(rnp->gpnum) != rdp->gpnum ||
+           unlikely(ACCESS_ONCE(rdp->gpwrap))) { /* outside lock */
                rdp->n_rp_gp_started++;
                return 1;
        }
@@ -3318,6 +3462,7 @@ static void _rcu_barrier(struct rcu_state *rsp)
                        } else {
                                _rcu_barrier_trace(rsp, "OnlineNoCB", cpu,
                                                   rsp->n_barrier_done);
+                               smp_mb__before_atomic();
                                atomic_inc(&rsp->barrier_cpu_count);
                                __call_rcu(&rdp->barrier_head,
                                           rcu_barrier_callback, rsp, cpu, 0);
@@ -3385,9 +3530,6 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
        /* Set up local state, ensuring consistent view of global state. */
        raw_spin_lock_irqsave(&rnp->lock, flags);
        rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo);
-       init_callback_list(rdp);
-       rdp->qlen_lazy = 0;
-       ACCESS_ONCE(rdp->qlen) = 0;
        rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
        WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE);
        WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1);
@@ -3444,6 +3586,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
                        rdp->gpnum = rnp->completed;
                        rdp->completed = rnp->completed;
                        rdp->passed_quiesce = 0;
+                       rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr);
                        rdp->qs_pending = 0;
                        trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl"));
                }
@@ -3535,17 +3678,35 @@ static int rcu_pm_notify(struct notifier_block *self,
 static int __init rcu_spawn_gp_kthread(void)
 {
        unsigned long flags;
+       int kthread_prio_in = kthread_prio;
        struct rcu_node *rnp;
        struct rcu_state *rsp;
+       struct sched_param sp;
        struct task_struct *t;
 
+       /* Force priority into range. */
+       if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < 1)
+               kthread_prio = 1;
+       else if (kthread_prio < 0)
+               kthread_prio = 0;
+       else if (kthread_prio > 99)
+               kthread_prio = 99;
+       if (kthread_prio != kthread_prio_in)
+               pr_alert("rcu_spawn_gp_kthread(): Limited prio to %d from %d\n",
+                        kthread_prio, kthread_prio_in);
+
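A worked example of the clamping above (illustrative): booting a
CONFIG_RCU_BOOST=y kernel with rcutree.kthread_prio=150 reaches this
point with kthread_prio clamped to 99, after logging:

	rcu_spawn_gp_kthread(): Limited prio to 99 from 150
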
        rcu_scheduler_fully_active = 1;
        for_each_rcu_flavor(rsp) {
-               t = kthread_run(rcu_gp_kthread, rsp, "%s", rsp->name);
+               t = kthread_create(rcu_gp_kthread, rsp, "%s", rsp->name);
                BUG_ON(IS_ERR(t));
                rnp = rcu_get_root(rsp);
                raw_spin_lock_irqsave(&rnp->lock, flags);
                rsp->gp_kthread = t;
+               if (kthread_prio) {
+                       sp.sched_priority = kthread_prio;
+                       sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
+               }
+               wake_up_process(t);
                raw_spin_unlock_irqrestore(&rnp->lock, flags);
        }
        rcu_spawn_nocb_kthreads();