KVM: PPC: Book3S HV: Make tbacct_lock irq-safe
author Paul Mackerras <paulus@samba.org>
Sat, 16 Nov 2013 06:46:04 +0000 (17:46 +1100)
committer Alexander Graf <agraf@suse.de>
Mon, 18 Nov 2013 21:39:23 +0000 (22:39 +0100)
Lockdep reported a potential deadlock: vcpu->arch.tbacct_lock is not
irq-safe, yet it is sometimes taken inside rq->lock (the run-queue
lock) in the scheduler, and rq->lock is itself taken in interrupt
context.  The lockdep splat looks like:

======================================================
[ INFO: HARDIRQ-safe -> HARDIRQ-unsafe lock order detected ]
3.12.0-rc5-kvm+ #8 Not tainted
------------------------------------------------------
qemu-system-ppc/4803 [HC0[0]:SC0[0]:HE0:SE1] is trying to acquire:
(&(&vcpu->arch.tbacct_lock)->rlock){+.+...}, at: [<c0000000000947ac>] .kvmppc_core_vcpu_put_hv+0x2c/0xa0

and this task is already holding:
(&rq->lock){-.-.-.}, at: [<c000000000ac16c0>] .__schedule+0x180/0xaa0
which would create a new lock dependency:
(&rq->lock){-.-.-.} -> (&(&vcpu->arch.tbacct_lock)->rlock){+.+...}

but this new dependency connects a HARDIRQ-irq-safe lock:
(&rq->lock){-.-.-.}
... which became HARDIRQ-irq-safe at:
 [<c00000000013797c>] .lock_acquire+0xbc/0x190
 [<c000000000ac3c74>] ._raw_spin_lock+0x34/0x60
 [<c0000000000f8564>] .scheduler_tick+0x54/0x180
 [<c0000000000c2610>] .update_process_times+0x70/0xa0
 [<c00000000012cdfc>] .tick_periodic+0x3c/0xe0
 [<c00000000012cec8>] .tick_handle_periodic+0x28/0xb0
 [<c00000000001ef40>] .timer_interrupt+0x120/0x2e0
 [<c000000000002868>] decrementer_common+0x168/0x180
 [<c0000000001c7ca4>] .get_page_from_freelist+0x924/0xc10
 [<c0000000001c8e00>] .__alloc_pages_nodemask+0x200/0xba0
 [<c0000000001c9eb8>] .alloc_pages_exact_nid+0x68/0x110
 [<c000000000f4c3ec>] .page_cgroup_init+0x1e0/0x270
 [<c000000000f24480>] .start_kernel+0x3e0/0x4e4
 [<c000000000009d30>] .start_here_common+0x20/0x70

to a HARDIRQ-irq-unsafe lock:
(&(&vcpu->arch.tbacct_lock)->rlock){+.+...}
... which became HARDIRQ-irq-unsafe at:
...  [<c00000000013797c>] .lock_acquire+0xbc/0x190
 [<c000000000ac3c74>] ._raw_spin_lock+0x34/0x60
 [<c0000000000946ac>] .kvmppc_core_vcpu_load_hv+0x2c/0x100
 [<c00000000008394c>] .kvmppc_core_vcpu_load+0x2c/0x40
 [<c000000000081000>] .kvm_arch_vcpu_load+0x10/0x30
 [<c00000000007afd4>] .vcpu_load+0x64/0xd0
 [<c00000000007b0f8>] .kvm_vcpu_ioctl+0x68/0x730
 [<c00000000025530c>] .do_vfs_ioctl+0x4dc/0x7a0
 [<c000000000255694>] .SyS_ioctl+0xc4/0xe0
 [<c000000000009ee4>] syscall_exit+0x0/0x98

Some users have reported this deadlock occurring in practice, though
the reports have been primarily on 3.10.x-based kernels.

This fixes the problem by making tbacct_lock irq-safe.
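
In outline, the cross-CPU interleaving that lockdep is warning about
looks like this (the call chains are taken from the splat above; the
interleaving itself is an illustrative sketch, and it glosses over
the fact that rq->lock is per run-queue, since lockdep reports on the
whole lock class):

	CPU0 (vcpu ioctl path)             CPU1 (scheduler)
	------------------------------     ------------------------------
	kvmppc_core_vcpu_load_hv()         __schedule()
	  spin_lock(&tbacct_lock);           raw_spin_lock(&rq->lock);
	  <timer interrupt>                  kvmppc_core_vcpu_put_hv()
	  scheduler_tick()                     spin_lock(&tbacct_lock);
	    raw_spin_lock(&rq->lock);          /* spins: CPU0 holds it */
	    /* spins: CPU1 holds it */

Neither CPU can make progress: CPU0 cannot release tbacct_lock until
its interrupt handler obtains rq->lock, and CPU1 cannot release
rq->lock until it obtains tbacct_lock.  Disabling interrupts whenever
tbacct_lock is held closes the window on CPU0's side and breaks the
cycle.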

Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 072287f1c3bc7347b6ff4f959843c97053fa4aa1..31d9cfb64a233808f9b474b550d2cf136e1c4aa8 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -131,8 +131,9 @@ static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu)
 static void kvmppc_core_vcpu_load_hv(struct kvm_vcpu *vcpu, int cpu)
 {
        struct kvmppc_vcore *vc = vcpu->arch.vcore;
+       unsigned long flags;
 
-       spin_lock(&vcpu->arch.tbacct_lock);
+       spin_lock_irqsave(&vcpu->arch.tbacct_lock, flags);
        if (vc->runner == vcpu && vc->vcore_state != VCORE_INACTIVE &&
            vc->preempt_tb != TB_NIL) {
                vc->stolen_tb += mftb() - vc->preempt_tb;
@@ -143,19 +144,20 @@ static void kvmppc_core_vcpu_load_hv(struct kvm_vcpu *vcpu, int cpu)
                vcpu->arch.busy_stolen += mftb() - vcpu->arch.busy_preempt;
                vcpu->arch.busy_preempt = TB_NIL;
        }
-       spin_unlock(&vcpu->arch.tbacct_lock);
+       spin_unlock_irqrestore(&vcpu->arch.tbacct_lock, flags);
 }
 
 static void kvmppc_core_vcpu_put_hv(struct kvm_vcpu *vcpu)
 {
        struct kvmppc_vcore *vc = vcpu->arch.vcore;
+       unsigned long flags;
 
-       spin_lock(&vcpu->arch.tbacct_lock);
+       spin_lock_irqsave(&vcpu->arch.tbacct_lock, flags);
        if (vc->runner == vcpu && vc->vcore_state != VCORE_INACTIVE)
                vc->preempt_tb = mftb();
        if (vcpu->arch.state == KVMPPC_VCPU_BUSY_IN_HOST)
                vcpu->arch.busy_preempt = mftb();
-       spin_unlock(&vcpu->arch.tbacct_lock);
+       spin_unlock_irqrestore(&vcpu->arch.tbacct_lock, flags);
 }
 
 static void kvmppc_set_msr_hv(struct kvm_vcpu *vcpu, u64 msr)
@@ -486,11 +488,11 @@ static u64 vcore_stolen_time(struct kvmppc_vcore *vc, u64 now)
         */
        if (vc->vcore_state != VCORE_INACTIVE &&
            vc->runner->arch.run_task != current) {
-               spin_lock(&vc->runner->arch.tbacct_lock);
+               spin_lock_irq(&vc->runner->arch.tbacct_lock);
                p = vc->stolen_tb;
                if (vc->preempt_tb != TB_NIL)
                        p += now - vc->preempt_tb;
-               spin_unlock(&vc->runner->arch.tbacct_lock);
+               spin_unlock_irq(&vc->runner->arch.tbacct_lock);
        } else {
                p = vc->stolen_tb;
        }
@@ -512,10 +514,10 @@ static void kvmppc_create_dtl_entry(struct kvm_vcpu *vcpu,
        core_stolen = vcore_stolen_time(vc, now);
        stolen = core_stolen - vcpu->arch.stolen_logged;
        vcpu->arch.stolen_logged = core_stolen;
-       spin_lock(&vcpu->arch.tbacct_lock);
+       spin_lock_irq(&vcpu->arch.tbacct_lock);
        stolen += vcpu->arch.busy_stolen;
        vcpu->arch.busy_stolen = 0;
-       spin_unlock(&vcpu->arch.tbacct_lock);
+       spin_unlock_irq(&vcpu->arch.tbacct_lock);
        if (!dt || !vpa)
                return;
        memset(dt, 0, sizeof(struct dtl_entry));
@@ -1115,13 +1117,13 @@ static void kvmppc_remove_runnable(struct kvmppc_vcore *vc,
 
        if (vcpu->arch.state != KVMPPC_VCPU_RUNNABLE)
                return;
-       spin_lock(&vcpu->arch.tbacct_lock);
+       spin_lock_irq(&vcpu->arch.tbacct_lock);
        now = mftb();
        vcpu->arch.busy_stolen += vcore_stolen_time(vc, now) -
                vcpu->arch.stolen_logged;
        vcpu->arch.busy_preempt = now;
        vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
-       spin_unlock(&vcpu->arch.tbacct_lock);
+       spin_unlock_irq(&vcpu->arch.tbacct_lock);
        --vc->n_runnable;
        list_del(&vcpu->arch.run_list);
 }
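
A note on the two locking primitives used above (this reasoning is
inferred from the call sites; it is not spelled out in the message):
kvmppc_core_vcpu_load_hv() and kvmppc_core_vcpu_put_hv() can be
reached from the scheduler with interrupts already disabled, so they
must use spin_lock_irqsave()/spin_unlock_irqrestore() to preserve the
caller's interrupt state.  The remaining sites run in process context
with interrupts enabled, so the cheaper unconditional
spin_lock_irq()/spin_unlock_irq() pair is sufficient.  A minimal
sketch of the distinction, using a hypothetical lock rather than the
actual tbacct_lock:

	#include <linux/spinlock.h>

	static DEFINE_SPINLOCK(example_lock);	/* hypothetical */

	/* Safe in any context: the caller's IRQ state is saved on
	 * lock and restored on unlock, so a caller that already has
	 * interrupts disabled keeps them disabled afterwards. */
	static void touch_any_context(void)
	{
		unsigned long flags;

		spin_lock_irqsave(&example_lock, flags);
		/* ... critical section ... */
		spin_unlock_irqrestore(&example_lock, flags);
	}

	/* Only correct when interrupts are known to be enabled on
	 * entry: unlock unconditionally re-enables them. */
	static void touch_process_context(void)
	{
		spin_lock_irq(&example_lock);
		/* ... critical section ... */
		spin_unlock_irq(&example_lock);
	}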