Merge tag 'powerpc-3.20-1' of git://git.kernel.org/pub/scm/linux/kernel/git/mpe/linux
[linux-drm-fsl-dcu.git] / kernel / events / core.c
index 934687f8d51b53fa19c6222398ecce63f9202534..8812d8e35f5b03b13e148ff67ae33453cad306c1 100644 (file)
@@ -872,22 +872,32 @@ void perf_pmu_enable(struct pmu *pmu)
                pmu->pmu_enable(pmu);
 }
 
-static DEFINE_PER_CPU(struct list_head, rotation_list);
+static DEFINE_PER_CPU(struct list_head, active_ctx_list);
 
 /*
- * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized
- * because they're strictly cpu affine and rotate_start is called with IRQs
- * disabled, while rotate_context is called from IRQ context.
+ * perf_event_ctx_activate(), perf_event_ctx_deactivate(), and
+ * perf_event_task_tick() are fully serialized because they're strictly cpu
+ * affine and perf_event_ctx_{activate,deactivate}() are called with IRQs
+ * disabled, while perf_event_task_tick() is called from IRQ context.
  */
-static void perf_pmu_rotate_start(struct pmu *pmu)
+static void perf_event_ctx_activate(struct perf_event_context *ctx)
 {
-       struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
-       struct list_head *head = this_cpu_ptr(&rotation_list);
+       struct list_head *head = this_cpu_ptr(&active_ctx_list);
 
        WARN_ON(!irqs_disabled());
 
-       if (list_empty(&cpuctx->rotation_list))
-               list_add(&cpuctx->rotation_list, head);
+       WARN_ON(!list_empty(&ctx->active_ctx_list));
+
+       list_add(&ctx->active_ctx_list, head);
+}
+
+static void perf_event_ctx_deactivate(struct perf_event_context *ctx)
+{
+       WARN_ON(!irqs_disabled());
+
+       WARN_ON(list_empty(&ctx->active_ctx_list));
+
+       list_del_init(&ctx->active_ctx_list);
 }
 
 static void get_ctx(struct perf_event_context *ctx)
@@ -906,6 +916,84 @@ static void put_ctx(struct perf_event_context *ctx)
        }
 }
 
+/*
+ * Because of perf_event::ctx migration in sys_perf_event_open::move_group and
+ * perf_pmu_migrate_context(), we need some magic.
+ *
+ * Those places that change perf_event::ctx will hold the
+ * perf_event_context::mutex of both the 'old' and the 'new' ctx.
+ *
+ * Lock ordering is by mutex address. There is one other site where
+ * perf_event_context::mutex nests, and that is put_event(). But remember
+ * that that is a parent<->child context relation, and migration does not
+ * affect children, so these two orderings should not interact.
+ *
+ * The change in perf_event::ctx does not affect children (as claimed above)
+ * because the sys_perf_event_open() case will install a new event and break
+ * the ctx parent<->child relation, and perf_pmu_migrate_context() is only
+ * concerned with cpuctx and that doesn't have children.
+ *
+ * The places that change perf_event::ctx will issue:
+ *
+ *   perf_remove_from_context();
+ *   synchronize_rcu();
+ *   perf_install_in_context();
+ *
+ * to effect the change. The remove_from_context() + synchronize_rcu() should
+ * quiesce the event, after which we can install it in the new location. This
+ * means that only external vectors (perf_fops, prctl) can perturb the event
+ * while in transit. Therefore all such accessors should also acquire
+ * perf_event_context::mutex to serialize against this.
+ *
+ * However, because event->ctx can change while we're waiting to acquire
+ * ctx->mutex, we must be careful and use the below perf_event_ctx_lock()
+ * function.
+ *
+ * Lock order:
+ *     task_struct::perf_event_mutex
+ *       perf_event_context::mutex
+ *         perf_event_context::lock
+ *         perf_event::child_mutex
+ *         perf_event::mmap_mutex
+ *         mmap_sem
+ */
+static struct perf_event_context *
+perf_event_ctx_lock_nested(struct perf_event *event, int nesting)
+{
+       struct perf_event_context *ctx;
+
+again:
+       rcu_read_lock();
+       ctx = ACCESS_ONCE(event->ctx);
+       if (!atomic_inc_not_zero(&ctx->refcount)) {
+               rcu_read_unlock();
+               goto again;
+       }
+       rcu_read_unlock();
+
+       mutex_lock_nested(&ctx->mutex, nesting);
+       if (event->ctx != ctx) {
+               mutex_unlock(&ctx->mutex);
+               put_ctx(ctx);
+               goto again;
+       }
+
+       return ctx;
+}
+
+static inline struct perf_event_context *
+perf_event_ctx_lock(struct perf_event *event)
+{
+       return perf_event_ctx_lock_nested(event, 0);
+}
+
+static void perf_event_ctx_unlock(struct perf_event *event,
+                                 struct perf_event_context *ctx)
+{
+       mutex_unlock(&ctx->mutex);
+       put_ctx(ctx);
+}
+
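
For illustration only (not part of this patch), a minimal sketch of how an external accessor is expected to use these helpers; perf_do_something() is a hypothetical stand-in for any per-event operation:

	struct perf_event_context *ctx;

	ctx = perf_event_ctx_lock(event);	/* pin event->ctx and take ctx->mutex */
	perf_do_something(event);		/* event->ctx cannot be swizzled under us */
	perf_event_ctx_unlock(event, ctx);	/* drop ctx->mutex and the ctx reference */

This is exactly the pattern the patch applies to perf_event_disable(), perf_event_enable(), perf_read() and perf_ioctl() below.
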
 /*
  * This must be done under the ctx->lock, such as to serialize against
  * context_equiv(), therefore we cannot call put_ctx() since that might end up
@@ -1155,8 +1243,6 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
                ctx->nr_branch_stack++;
 
        list_add_rcu(&event->event_entry, &ctx->event_list);
-       if (!ctx->nr_events)
-               perf_pmu_rotate_start(ctx->pmu);
        ctx->nr_events++;
        if (event->attr.inherit_stat)
                ctx->nr_stat++;
@@ -1275,6 +1361,8 @@ static void perf_group_attach(struct perf_event *event)
        if (group_leader == event)
                return;
 
+       WARN_ON_ONCE(group_leader->ctx != event->ctx);
+
        if (group_leader->group_flags & PERF_GROUP_SOFTWARE &&
                        !is_software_event(event))
                group_leader->group_flags &= ~PERF_GROUP_SOFTWARE;
@@ -1296,6 +1384,10 @@ static void
 list_del_event(struct perf_event *event, struct perf_event_context *ctx)
 {
        struct perf_cpu_context *cpuctx;
+
+       WARN_ON_ONCE(event->ctx != ctx);
+       lockdep_assert_held(&ctx->lock);
+
        /*
         * We can have double detach due to exit/hot-unplug + close.
         */
@@ -1380,6 +1472,8 @@ static void perf_group_detach(struct perf_event *event)
 
                /* Inherit group flags from the previous leader */
                sibling->group_flags = event->group_flags;
+
+               WARN_ON_ONCE(sibling->ctx != event->ctx);
        }
 
 out:
@@ -1442,6 +1536,10 @@ event_sched_out(struct perf_event *event,
 {
        u64 tstamp = perf_event_time(event);
        u64 delta;
+
+       WARN_ON_ONCE(event->ctx != ctx);
+       lockdep_assert_held(&ctx->lock);
+
        /*
         * An event which could not be activated because of
         * filter mismatch still needs to have its timings
@@ -1471,7 +1569,8 @@ event_sched_out(struct perf_event *event,
 
        if (!is_software_event(event))
                cpuctx->active_oncpu--;
-       ctx->nr_active--;
+       if (!--ctx->nr_active)
+               perf_event_ctx_deactivate(ctx);
        if (event->attr.freq && event->attr.sample_freq)
                ctx->nr_freq--;
        if (event->attr.exclusive || !cpuctx->active_oncpu)
@@ -1654,7 +1753,7 @@ int __perf_event_disable(void *info)
  * is the current context on this CPU and preemption is disabled,
  * hence we can't get into perf_event_task_sched_out for this context.
  */
-void perf_event_disable(struct perf_event *event)
+static void _perf_event_disable(struct perf_event *event)
 {
        struct perf_event_context *ctx = event->ctx;
        struct task_struct *task = ctx->task;
@@ -1695,6 +1794,19 @@ retry:
        }
        raw_spin_unlock_irq(&ctx->lock);
 }
+
+/*
+ * Strictly speaking, kernel users cannot create groups, and therefore this
+ * interface does not need the perf_event_ctx_lock() magic.
+ */
+void perf_event_disable(struct perf_event *event)
+{
+       struct perf_event_context *ctx;
+
+       ctx = perf_event_ctx_lock(event);
+       _perf_event_disable(event);
+       perf_event_ctx_unlock(event, ctx);
+}
 EXPORT_SYMBOL_GPL(perf_event_disable);
 
 static void perf_set_shadow_time(struct perf_event *event,
@@ -1782,7 +1894,8 @@ event_sched_in(struct perf_event *event,
 
        if (!is_software_event(event))
                cpuctx->active_oncpu++;
-       ctx->nr_active++;
+       if (!ctx->nr_active++)
+               perf_event_ctx_activate(ctx);
        if (event->attr.freq && event->attr.sample_freq)
                ctx->nr_freq++;
 
@@ -2158,7 +2271,7 @@ unlock:
  * perf_event_for_each_child or perf_event_for_each as described
  * for perf_event_disable.
  */
-void perf_event_enable(struct perf_event *event)
+static void _perf_event_enable(struct perf_event *event)
 {
        struct perf_event_context *ctx = event->ctx;
        struct task_struct *task = ctx->task;
@@ -2214,9 +2327,21 @@ retry:
 out:
        raw_spin_unlock_irq(&ctx->lock);
 }
+
+/*
+ * See perf_event_disable().
+ */
+void perf_event_enable(struct perf_event *event)
+{
+       struct perf_event_context *ctx;
+
+       ctx = perf_event_ctx_lock(event);
+       _perf_event_enable(event);
+       perf_event_ctx_unlock(event, ctx);
+}
 EXPORT_SYMBOL_GPL(perf_event_enable);
 
-int perf_event_refresh(struct perf_event *event, int refresh)
+static int _perf_event_refresh(struct perf_event *event, int refresh)
 {
        /*
         * not supported on inherited events
@@ -2225,10 +2350,25 @@ int perf_event_refresh(struct perf_event *event, int refresh)
                return -EINVAL;
 
        atomic_add(refresh, &event->event_limit);
-       perf_event_enable(event);
+       _perf_event_enable(event);
 
        return 0;
 }
+
+/*
+ * See perf_event_disable()
+ */
+int perf_event_refresh(struct perf_event *event, int refresh)
+{
+       struct perf_event_context *ctx;
+       int ret;
+
+       ctx = perf_event_ctx_lock(event);
+       ret = _perf_event_refresh(event, refresh);
+       perf_event_ctx_unlock(event, ctx);
+
+       return ret;
+}
 EXPORT_SYMBOL_GPL(perf_event_refresh);
 
 static void ctx_sched_out(struct perf_event_context *ctx,
@@ -2612,12 +2752,6 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
 
        perf_pmu_enable(ctx->pmu);
        perf_ctx_unlock(cpuctx, ctx);
-
-       /*
-        * Since these rotations are per-cpu, we need to ensure the
-        * cpu-context we got scheduled on is actually rotating.
-        */
-       perf_pmu_rotate_start(ctx->pmu);
 }
 
 /*
@@ -2905,25 +3039,18 @@ static void rotate_ctx(struct perf_event_context *ctx)
                list_rotate_left(&ctx->flexible_groups);
 }
 
-/*
- * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized
- * because they're strictly cpu affine and rotate_start is called with IRQs
- * disabled, while rotate_context is called from IRQ context.
- */
 static int perf_rotate_context(struct perf_cpu_context *cpuctx)
 {
        struct perf_event_context *ctx = NULL;
-       int rotate = 0, remove = 1;
+       int rotate = 0;
 
        if (cpuctx->ctx.nr_events) {
-               remove = 0;
                if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
                        rotate = 1;
        }
 
        ctx = cpuctx->task_ctx;
        if (ctx && ctx->nr_events) {
-               remove = 0;
                if (ctx->nr_events != ctx->nr_active)
                        rotate = 1;
        }
@@ -2947,8 +3074,6 @@ static int perf_rotate_context(struct perf_cpu_context *cpuctx)
        perf_pmu_enable(cpuctx->ctx.pmu);
        perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
 done:
-       if (remove)
-               list_del_init(&cpuctx->rotation_list);
 
        return rotate;
 }
@@ -2966,9 +3091,8 @@ bool perf_event_can_stop_tick(void)
 
 void perf_event_task_tick(void)
 {
-       struct list_head *head = this_cpu_ptr(&rotation_list);
-       struct perf_cpu_context *cpuctx, *tmp;
-       struct perf_event_context *ctx;
+       struct list_head *head = this_cpu_ptr(&active_ctx_list);
+       struct perf_event_context *ctx, *tmp;
        int throttled;
 
        WARN_ON(!irqs_disabled());
@@ -2976,14 +3100,8 @@ void perf_event_task_tick(void)
        __this_cpu_inc(perf_throttled_seq);
        throttled = __this_cpu_xchg(perf_throttled_count, 0);
 
-       list_for_each_entry_safe(cpuctx, tmp, head, rotation_list) {
-               ctx = &cpuctx->ctx;
+       list_for_each_entry_safe(ctx, tmp, head, active_ctx_list)
                perf_adjust_freq_unthr_context(ctx, throttled);
-
-               ctx = cpuctx->task_ctx;
-               if (ctx)
-                       perf_adjust_freq_unthr_context(ctx, throttled);
-       }
 }
 
 static int event_enable_on_exec(struct perf_event *event,
@@ -3142,6 +3260,7 @@ static void __perf_event_init_context(struct perf_event_context *ctx)
 {
        raw_spin_lock_init(&ctx->lock);
        mutex_init(&ctx->mutex);
+       INIT_LIST_HEAD(&ctx->active_ctx_list);
        INIT_LIST_HEAD(&ctx->pinned_groups);
        INIT_LIST_HEAD(&ctx->flexible_groups);
        INIT_LIST_HEAD(&ctx->event_list);
@@ -3421,7 +3540,16 @@ static void perf_remove_from_owner(struct perf_event *event)
        rcu_read_unlock();
 
        if (owner) {
-               mutex_lock(&owner->perf_event_mutex);
+               /*
+                * If we're here through perf_event_exit_task() we're already
+                * holding ctx->mutex, which would be an inversion wrt. the
+                * normal lock order.
+                *
+                * However, we can safely take this lock because it's the
+                * child ctx->mutex.
+                */
+               mutex_lock_nested(&owner->perf_event_mutex, SINGLE_DEPTH_NESTING);
+
                /*
                 * We have to re-check the event->owner field, if it is cleared
                 * we raced with perf_event_exit_task(), acquiring the mutex
@@ -3440,7 +3568,7 @@ static void perf_remove_from_owner(struct perf_event *event)
  */
 static void put_event(struct perf_event *event)
 {
-       struct perf_event_context *ctx = event->ctx;
+       struct perf_event_context *ctx;
 
        if (!atomic_long_dec_and_test(&event->refcount))
                return;
@@ -3448,7 +3576,6 @@ static void put_event(struct perf_event *event)
        if (!is_kernel_event(event))
                perf_remove_from_owner(event);
 
-       WARN_ON_ONCE(ctx->parent_ctx);
        /*
         * There are two ways this annotation is useful:
         *
@@ -3461,7 +3588,8 @@ static void put_event(struct perf_event *event)
         *     the last filedesc died, so there is no possibility
         *     to trigger the AB-BA case.
         */
-       mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING);
+       ctx = perf_event_ctx_lock_nested(event, SINGLE_DEPTH_NESTING);
+       WARN_ON_ONCE(ctx->parent_ctx);
        perf_remove_from_context(event, true);
        mutex_unlock(&ctx->mutex);
 
@@ -3547,12 +3675,13 @@ static int perf_event_read_group(struct perf_event *event,
                                   u64 read_format, char __user *buf)
 {
        struct perf_event *leader = event->group_leader, *sub;
-       int n = 0, size = 0, ret = -EFAULT;
        struct perf_event_context *ctx = leader->ctx;
-       u64 values[5];
+       int n = 0, size = 0, ret;
        u64 count, enabled, running;
+       u64 values[5];
+
+       lockdep_assert_held(&ctx->mutex);
 
-       mutex_lock(&ctx->mutex);
        count = perf_event_read_value(leader, &enabled, &running);
 
        values[n++] = 1 + leader->nr_siblings;
@@ -3567,7 +3696,7 @@ static int perf_event_read_group(struct perf_event *event,
        size = n * sizeof(u64);
 
        if (copy_to_user(buf, values, size))
-               goto unlock;
+               return -EFAULT;
 
        ret = size;
 
@@ -3581,14 +3710,11 @@ static int perf_event_read_group(struct perf_event *event,
                size = n * sizeof(u64);
 
                if (copy_to_user(buf + ret, values, size)) {
-                       ret = -EFAULT;
-                       goto unlock;
+                       return -EFAULT;
                }
 
                ret += size;
        }
-unlock:
-       mutex_unlock(&ctx->mutex);
 
        return ret;
 }
@@ -3660,8 +3786,14 @@ static ssize_t
 perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
 {
        struct perf_event *event = file->private_data;
+       struct perf_event_context *ctx;
+       int ret;
 
-       return perf_read_hw(event, buf, count);
+       ctx = perf_event_ctx_lock(event);
+       ret = perf_read_hw(event, buf, count);
+       perf_event_ctx_unlock(event, ctx);
+
+       return ret;
 }
 
 static unsigned int perf_poll(struct file *file, poll_table *wait)
@@ -3687,7 +3819,7 @@ static unsigned int perf_poll(struct file *file, poll_table *wait)
        return events;
 }
 
-static void perf_event_reset(struct perf_event *event)
+static void _perf_event_reset(struct perf_event *event)
 {
        (void)perf_event_read(event);
        local64_set(&event->count, 0);
@@ -3706,6 +3838,7 @@ static void perf_event_for_each_child(struct perf_event *event,
        struct perf_event *child;
 
        WARN_ON_ONCE(event->ctx->parent_ctx);
+
        mutex_lock(&event->child_mutex);
        func(event);
        list_for_each_entry(child, &event->child_list, child_list)
@@ -3719,14 +3852,13 @@ static void perf_event_for_each(struct perf_event *event,
        struct perf_event_context *ctx = event->ctx;
        struct perf_event *sibling;
 
-       WARN_ON_ONCE(ctx->parent_ctx);
-       mutex_lock(&ctx->mutex);
+       lockdep_assert_held(&ctx->mutex);
+
        event = event->group_leader;
 
        perf_event_for_each_child(event, func);
        list_for_each_entry(sibling, &event->sibling_list, group_entry)
                perf_event_for_each_child(sibling, func);
-       mutex_unlock(&ctx->mutex);
 }
 
 static int perf_event_period(struct perf_event *event, u64 __user *arg)
@@ -3796,25 +3928,24 @@ static int perf_event_set_output(struct perf_event *event,
                                 struct perf_event *output_event);
 static int perf_event_set_filter(struct perf_event *event, void __user *arg);
 
-static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
 {
-       struct perf_event *event = file->private_data;
        void (*func)(struct perf_event *);
        u32 flags = arg;
 
        switch (cmd) {
        case PERF_EVENT_IOC_ENABLE:
-               func = perf_event_enable;
+               func = _perf_event_enable;
                break;
        case PERF_EVENT_IOC_DISABLE:
-               func = perf_event_disable;
+               func = _perf_event_disable;
                break;
        case PERF_EVENT_IOC_RESET:
-               func = perf_event_reset;
+               func = _perf_event_reset;
                break;
 
        case PERF_EVENT_IOC_REFRESH:
-               return perf_event_refresh(event, arg);
+               return _perf_event_refresh(event, arg);
 
        case PERF_EVENT_IOC_PERIOD:
                return perf_event_period(event, (u64 __user *)arg);
@@ -3861,6 +3992,19 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
        return 0;
 }
 
+static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+       struct perf_event *event = file->private_data;
+       struct perf_event_context *ctx;
+       long ret;
+
+       ctx = perf_event_ctx_lock(event);
+       ret = _perf_ioctl(event, cmd, arg);
+       perf_event_ctx_unlock(event, ctx);
+
+       return ret;
+}
+
 #ifdef CONFIG_COMPAT
 static long perf_compat_ioctl(struct file *file, unsigned int cmd,
                                unsigned long arg)
@@ -3883,11 +4027,15 @@ static long perf_compat_ioctl(struct file *file, unsigned int cmd,
 
 int perf_event_task_enable(void)
 {
+       struct perf_event_context *ctx;
        struct perf_event *event;
 
        mutex_lock(&current->perf_event_mutex);
-       list_for_each_entry(event, &current->perf_event_list, owner_entry)
-               perf_event_for_each_child(event, perf_event_enable);
+       list_for_each_entry(event, &current->perf_event_list, owner_entry) {
+               ctx = perf_event_ctx_lock(event);
+               perf_event_for_each_child(event, _perf_event_enable);
+               perf_event_ctx_unlock(event, ctx);
+       }
        mutex_unlock(&current->perf_event_mutex);
 
        return 0;
@@ -3895,11 +4043,15 @@ int perf_event_task_enable(void)
 
 int perf_event_task_disable(void)
 {
+       struct perf_event_context *ctx;
        struct perf_event *event;
 
        mutex_lock(&current->perf_event_mutex);
-       list_for_each_entry(event, &current->perf_event_list, owner_entry)
-               perf_event_for_each_child(event, perf_event_disable);
+       list_for_each_entry(event, &current->perf_event_list, owner_entry) {
+               ctx = perf_event_ctx_lock(event);
+               perf_event_for_each_child(event, _perf_event_disable);
+               perf_event_ctx_unlock(event, ctx);
+       }
        mutex_unlock(&current->perf_event_mutex);
 
        return 0;
@@ -4461,18 +4613,14 @@ perf_output_sample_regs(struct perf_output_handle *handle,
 }
 
 static void perf_sample_regs_user(struct perf_regs *regs_user,
-                                 struct pt_regs *regs)
+                                 struct pt_regs *regs,
+                                 struct pt_regs *regs_user_copy)
 {
-       if (!user_mode(regs)) {
-               if (current->mm)
-                       regs = task_pt_regs(current);
-               else
-                       regs = NULL;
-       }
-
-       if (regs) {
-               regs_user->abi  = perf_reg_abi(current);
+       if (user_mode(regs)) {
+               regs_user->abi = perf_reg_abi(current);
                regs_user->regs = regs;
+       } else if (current->mm) {
+               perf_get_regs_user(regs_user, regs, regs_user_copy);
        } else {
                regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
                regs_user->regs = NULL;
@@ -4951,7 +5099,8 @@ void perf_prepare_sample(struct perf_event_header *header,
        }
 
        if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER))
-               perf_sample_regs_user(&data->regs_user, regs);
+               perf_sample_regs_user(&data->regs_user, regs,
+                                     &data->regs_user_copy);
 
        if (sample_type & PERF_SAMPLE_REGS_USER) {
                /* regs dump ABI info */
@@ -5892,6 +6041,8 @@ end:
        rcu_read_unlock();
 }
 
+DEFINE_PER_CPU(struct pt_regs, __perf_regs[4]);
+
 int perf_swevent_get_recursion_context(void)
 {
        struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
@@ -5907,21 +6058,30 @@ inline void perf_swevent_put_recursion_context(int rctx)
        put_recursion_context(swhash->recursion, rctx);
 }
 
-void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
+void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
 {
        struct perf_sample_data data;
-       int rctx;
 
-       preempt_disable_notrace();
-       rctx = perf_swevent_get_recursion_context();
-       if (rctx < 0)
+       if (WARN_ON_ONCE(!regs))
                return;
 
        perf_sample_data_init(&data, addr, 0);
-
        do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs);
+}
+
+void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
+{
+       int rctx;
+
+       preempt_disable_notrace();
+       rctx = perf_swevent_get_recursion_context();
+       if (unlikely(rctx < 0))
+               goto fail;
+
+       ___perf_sw_event(event_id, nr, regs, addr);
 
        perf_swevent_put_recursion_context(rctx);
+fail:
        preempt_enable_notrace();
 }
 
@@ -6779,12 +6939,10 @@ skip_type:
                __perf_event_init_context(&cpuctx->ctx);
                lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
                lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
-               cpuctx->ctx.type = cpu_context;
                cpuctx->ctx.pmu = pmu;
 
                __perf_cpu_hrtimer_init(cpuctx, cpu);
 
-               INIT_LIST_HEAD(&cpuctx->rotation_list);
                cpuctx->unique_pmu = pmu;
        }
 
@@ -6857,6 +7015,20 @@ void perf_pmu_unregister(struct pmu *pmu)
 }
 EXPORT_SYMBOL_GPL(perf_pmu_unregister);
 
+static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
+{
+       int ret;
+
+       if (!try_module_get(pmu->module))
+               return -ENODEV;
+       event->pmu = pmu;
+       ret = pmu->event_init(event);
+       if (ret)
+               module_put(pmu->module);
+
+       return ret;
+}
+
 struct pmu *perf_init_event(struct perf_event *event)
 {
        struct pmu *pmu = NULL;
@@ -6869,24 +7041,14 @@ struct pmu *perf_init_event(struct perf_event *event)
        pmu = idr_find(&pmu_idr, event->attr.type);
        rcu_read_unlock();
        if (pmu) {
-               if (!try_module_get(pmu->module)) {
-                       pmu = ERR_PTR(-ENODEV);
-                       goto unlock;
-               }
-               event->pmu = pmu;
-               ret = pmu->event_init(event);
+               ret = perf_try_init_event(pmu, event);
                if (ret)
                        pmu = ERR_PTR(ret);
                goto unlock;
        }
 
        list_for_each_entry_rcu(pmu, &pmus, entry) {
-               if (!try_module_get(pmu->module)) {
-                       pmu = ERR_PTR(-ENODEV);
-                       goto unlock;
-               }
-               event->pmu = pmu;
-               ret = pmu->event_init(event);
+               ret = perf_try_init_event(pmu, event);
                if (!ret)
                        goto unlock;
 
@@ -7250,6 +7412,15 @@ out:
        return ret;
 }
 
+static void mutex_lock_double(struct mutex *a, struct mutex *b)
+{
+       if (b < a)
+               swap(a, b);
+
+       mutex_lock(a);
+       mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
+}
+
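
Ordering the two mutexes by address gives every double-lock site one global acquisition order, so the two ctx-swizzling paths (move_group in sys_perf_event_open() and perf_pmu_migrate_context()) cannot deadlock against each other. A minimal sketch of the resulting pattern, with ctx1 and ctx2 standing for any two contexts (the unlock order does not matter):

	mutex_lock_double(&ctx1->mutex, &ctx2->mutex);	/* lower-addressed mutex is taken first */
	/* ... swizzle events between ctx1 and ctx2 ... */
	mutex_unlock(&ctx2->mutex);
	mutex_unlock(&ctx1->mutex);
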
 /**
  * sys_perf_event_open - open a performance event, associate it to a task/cpu
  *
@@ -7265,7 +7436,7 @@ SYSCALL_DEFINE5(perf_event_open,
        struct perf_event *group_leader = NULL, *output_event = NULL;
        struct perf_event *event, *sibling;
        struct perf_event_attr attr;
-       struct perf_event_context *ctx;
+       struct perf_event_context *ctx, *uninitialized_var(gctx);
        struct file *event_file = NULL;
        struct fd group = {NULL, 0};
        struct task_struct *task = NULL;
@@ -7423,7 +7594,19 @@ SYSCALL_DEFINE5(perf_event_open,
                 * task or CPU context:
                 */
                if (move_group) {
-                       if (group_leader->ctx->type != ctx->type)
+                       /*
+                        * Make sure we're both on the same task, or both
+                        * per-cpu events.
+                        */
+                       if (group_leader->ctx->task != ctx->task)
+                               goto err_context;
+
+                       /*
+                        * Make sure we're both events for the same CPU;
+                        * grouping events for different CPUs is broken, since
+                        * you can never concurrently schedule them anyhow.
+                        */
+                       if (group_leader->cpu != event->cpu)
                                goto err_context;
                } else {
                        if (group_leader->ctx != ctx)
@@ -7451,43 +7634,68 @@ SYSCALL_DEFINE5(perf_event_open,
        }
 
        if (move_group) {
-               struct perf_event_context *gctx = group_leader->ctx;
-
-               mutex_lock(&gctx->mutex);
-               perf_remove_from_context(group_leader, false);
+               gctx = group_leader->ctx;
 
                /*
-                * Removing from the context ends up with disabled
-                * event. What we want here is event in the initial
-                * startup state, ready to be add into new context.
+                * See perf_event_ctx_lock() for comments on the details
+                * of swizzling perf_event::ctx.
                 */
-               perf_event__state_init(group_leader);
+               mutex_lock_double(&gctx->mutex, &ctx->mutex);
+
+               perf_remove_from_context(group_leader, false);
+
                list_for_each_entry(sibling, &group_leader->sibling_list,
                                    group_entry) {
                        perf_remove_from_context(sibling, false);
-                       perf_event__state_init(sibling);
                        put_ctx(gctx);
                }
-               mutex_unlock(&gctx->mutex);
-               put_ctx(gctx);
+       } else {
+               mutex_lock(&ctx->mutex);
        }
 
        WARN_ON_ONCE(ctx->parent_ctx);
-       mutex_lock(&ctx->mutex);
 
        if (move_group) {
+               /*
+                * Wait for everybody to stop referencing the events through
+                * the old lists, before installing them on the new lists.
+                */
                synchronize_rcu();
-               perf_install_in_context(ctx, group_leader, group_leader->cpu);
-               get_ctx(ctx);
+
+               /*
+                * Install the group siblings before the group leader.
+                *
+                * Because a group leader will try to install the entire group
+                * (through the sibling list, which is still intact), we can
+                * end up with siblings installed in the wrong context.
+                *
+                * By installing siblings first we NO-OP because they're not
+                * reachable through the group lists.
+                */
                list_for_each_entry(sibling, &group_leader->sibling_list,
                                    group_entry) {
+                       perf_event__state_init(sibling);
                        perf_install_in_context(ctx, sibling, sibling->cpu);
                        get_ctx(ctx);
                }
+
+               /*
+                * Removing from the context ends up with a disabled
+                * event. What we want here is the event in its initial
+                * startup state, ready to be added into the new context.
+                */
+               perf_event__state_init(group_leader);
+               perf_install_in_context(ctx, group_leader, group_leader->cpu);
+               get_ctx(ctx);
        }
 
        perf_install_in_context(ctx, event, event->cpu);
        perf_unpin_context(ctx);
+
+       if (move_group) {
+               mutex_unlock(&gctx->mutex);
+               put_ctx(gctx);
+       }
        mutex_unlock(&ctx->mutex);
 
        put_online_cpus();
@@ -7595,7 +7803,11 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
        src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx;
        dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx;
 
-       mutex_lock(&src_ctx->mutex);
+       /*
+        * See perf_event_ctx_lock() for comments on the details
+        * of swizzling perf_event::ctx.
+        */
+       mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex);
        list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
                                 event_entry) {
                perf_remove_from_context(event, false);
@@ -7603,11 +7815,36 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
                put_ctx(src_ctx);
                list_add(&event->migrate_entry, &events);
        }
-       mutex_unlock(&src_ctx->mutex);
 
+       /*
+        * Wait for the events to quiesce before reinstating them.
+        */
        synchronize_rcu();
 
-       mutex_lock(&dst_ctx->mutex);
+       /*
+        * Reinstate the events in two passes.
+        *
+        * Skip over group leaders and only install siblings on this first
+        * pass; siblings will not get enabled without a leader, but a leader
+        * will enable its siblings, even if those are still on the old
+        * context.
+        */
+       list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
+               if (event->group_leader == event)
+                       continue;
+
+               list_del(&event->migrate_entry);
+               if (event->state >= PERF_EVENT_STATE_OFF)
+                       event->state = PERF_EVENT_STATE_INACTIVE;
+               account_event_cpu(event, dst_cpu);
+               perf_install_in_context(dst_ctx, event, dst_cpu);
+               get_ctx(dst_ctx);
+       }
+
+       /*
+        * Once all the siblings are set up properly, install the group leaders
+        * to make it go.
+        */
        list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
                list_del(&event->migrate_entry);
                if (event->state >= PERF_EVENT_STATE_OFF)
@@ -7617,6 +7854,7 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
                get_ctx(dst_ctx);
        }
        mutex_unlock(&dst_ctx->mutex);
+       mutex_unlock(&src_ctx->mutex);
 }
 EXPORT_SYMBOL_GPL(perf_pmu_migrate_context);
 
@@ -7803,14 +8041,19 @@ static void perf_free_event(struct perf_event *event,
 
        put_event(parent);
 
+       raw_spin_lock_irq(&ctx->lock);
        perf_group_detach(event);
        list_del_event(event, ctx);
+       raw_spin_unlock_irq(&ctx->lock);
        free_event(event);
 }
 
 /*
- * free an unexposed, unused context as created by inheritance by
+ * Free an unexposed, unused context as created by inheritance by
  * perf_event_init_task below, used by fork() in case of fail.
+ *
+ * Not all locks are strictly required, but take them anyway to be nice and
+ * help out with the lockdep assertions.
  */
 void perf_event_free_task(struct task_struct *task)
 {
@@ -8129,7 +8372,7 @@ static void __init perf_event_init_all_cpus(void)
        for_each_possible_cpu(cpu) {
                swhash = &per_cpu(swevent_htable, cpu);
                mutex_init(&swhash->hlist_mutex);
-               INIT_LIST_HEAD(&per_cpu(rotation_list, cpu));
+               INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu));
        }
 }
 
@@ -8150,22 +8393,11 @@ static void perf_event_init_cpu(int cpu)
 }
 
 #if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC
-static void perf_pmu_rotate_stop(struct pmu *pmu)
-{
-       struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
-
-       WARN_ON(!irqs_disabled());
-
-       list_del_init(&cpuctx->rotation_list);
-}
-
 static void __perf_event_exit_context(void *__info)
 {
        struct remove_event re = { .detach_group = true };
        struct perf_event_context *ctx = __info;
 
-       perf_pmu_rotate_stop(ctx->pmu);
-
        rcu_read_lock();
        list_for_each_entry_rcu(re.event, &ctx->event_list, event_entry)
                __perf_remove_from_context(&re);