KVM: PPC: Book3s HV: Maintain separate guest and host views of R and C bits
Author:     Paul Mackerras <paulus@samba.org>
AuthorDate: Thu, 15 Dec 2011 02:02:02 +0000
Commit:     Avi Kivity <avi@redhat.com>
CommitDate: Mon, 5 Mar 2012 12:52:39 +0000 (14:52 +0200)
This allows both the guest and the host to use the referenced (R) and
changed (C) bits in the guest hashed page table.  The guest has a view
of R and C that is maintained in the guest_rpte field of the revmap
entry for the HPTE, and the host has a view that is maintained in the
rmap entry for the associated gfn.
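
As a rough sketch of where the two views live (names are taken from the
patch below; the struct is abbreviated, not the full definition):

	/* Guest view: R and C kept in the reverse-map entry for the HPTE */
	struct revmap_entry {
		unsigned long guest_rpte;	/* guest view of HPTE second dword */
		unsigned int forw, back;	/* rmap chain links */
	};

	/* Host view: R and C kept in the per-gfn rmap word, at the
	 * HPTE R/C bit positions shifted up by 32 */
	#define KVMPPC_RMAP_RC_SHIFT	32
	#define KVMPPC_RMAP_REFERENCED	(HPTE_R_R << KVMPPC_RMAP_RC_SHIFT)
	#define KVMPPC_RMAP_CHANGED	(HPTE_R_C << KVMPPC_RMAP_RC_SHIFT)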

Both views are updated from the guest HPT.  If a bit (R or C) is zero
in either view, it is initially set to zero in the HPTE (or HPTEs)
until the hardware sets it to 1.  When an HPTE is removed for any
reason, the R and C bits from the HPTE are ORed into both views.  We
have to be careful to read the R and C bits from the HPTE after
invalidating it, but before unlocking it, so as not to lose any late
updates made by the hardware.
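
A minimal sketch of the two update paths, condensed from the hunks
below (error handling and the locking of the HPTE itself are omitted):

	/* Insert: set R/C in the real HPTE only where the host view
	 * also has them set (the guest view is already folded into ptel;
	 * only the R/C bits of rcbits matter after the mask) */
	rcbits = *rmap >> KVMPPC_RMAP_RC_SHIFT;
	ptel &= rcbits | ~(HPTE_R_R | HPTE_R_C);

	/* Remove: invalidate first, then harvest R/C into both views
	 * while the HPTE is still locked */
	hptep[0] |= HPTE_V_ABSENT;
	kvmppc_invalidate_hpte(kvm, hptep, index);
	rcbits = hptep[1] & (HPTE_R_R | HPTE_R_C);	/* read after tlbie */
	*rmap |= rcbits << KVMPPC_RMAP_RC_SHIFT;	/* host view */
	rev->guest_rpte |= rcbits;			/* guest view */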

Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>
arch/powerpc/include/asm/kvm_host.h
arch/powerpc/kvm/book3s_64_mmu_hv.c
arch/powerpc/kvm/book3s_hv_rm_mmu.c

diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 968f3aa61cd1bb872d4bb391eeafef6eabe6caab..1cb6e522485bf20e3997e4c3a3995ac9d789b277 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -200,8 +200,9 @@ struct revmap_entry {
  * index in the guest HPT of a HPTE that points to the page.
  */
 #define KVMPPC_RMAP_LOCK_BIT   63
-#define KVMPPC_RMAP_REF_BIT    33
-#define KVMPPC_RMAP_REFERENCED (1ul << KVMPPC_RMAP_REF_BIT)
+#define KVMPPC_RMAP_RC_SHIFT   32
+#define KVMPPC_RMAP_REFERENCED (HPTE_R_R << KVMPPC_RMAP_RC_SHIFT)
+#define KVMPPC_RMAP_CHANGED    (HPTE_R_C << KVMPPC_RMAP_RC_SHIFT)
 #define KVMPPC_RMAP_PRESENT    0x100000000ul
 #define KVMPPC_RMAP_INDEX      0xfffffffful
 
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 66d6452c1081fa2badbbf37b19332d765d0e53d8..aa51ddef468e9d9fe582fdcb05ec420e0d60eef5 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -505,6 +505,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
        unsigned long is_io;
        unsigned int writing, write_ok;
        struct vm_area_struct *vma;
+       unsigned long rcbits;
 
        /*
         * Real-mode code has already searched the HPT and found the
@@ -640,11 +641,17 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
                goto out_unlock;
        }
 
+       /* Only set R/C in real HPTE if set in both *rmap and guest_rpte */
+       rcbits = *rmap >> KVMPPC_RMAP_RC_SHIFT;
+       r &= rcbits | ~(HPTE_R_R | HPTE_R_C);
+
        if (hptep[0] & HPTE_V_VALID) {
                /* HPTE was previously valid, so we need to invalidate it */
                unlock_rmap(rmap);
                hptep[0] |= HPTE_V_ABSENT;
                kvmppc_invalidate_hpte(kvm, hptep, index);
+               /* don't lose previous R and C bits */
+               r |= hptep[1] & (HPTE_R_R | HPTE_R_C);
        } else {
                kvmppc_add_revmap_chain(kvm, rev, rmap, index, 0);
        }
@@ -701,50 +708,55 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
        struct revmap_entry *rev = kvm->arch.revmap;
        unsigned long h, i, j;
        unsigned long *hptep;
-       unsigned long ptel, psize;
+       unsigned long ptel, psize, rcbits;
 
        for (;;) {
-               while (test_and_set_bit_lock(KVMPPC_RMAP_LOCK_BIT, rmapp))
-                       cpu_relax();
+               lock_rmap(rmapp);
                if (!(*rmapp & KVMPPC_RMAP_PRESENT)) {
-                       __clear_bit_unlock(KVMPPC_RMAP_LOCK_BIT, rmapp);
+                       unlock_rmap(rmapp);
                        break;
                }
 
                /*
                 * To avoid an ABBA deadlock with the HPTE lock bit,
-                * we have to unlock the rmap chain before locking the HPTE.
-                * Thus we remove the first entry, unlock the rmap chain,
-                * lock the HPTE and then check that it is for the
-                * page we're unmapping before changing it to non-present.
+                * we can't spin on the HPTE lock while holding the
+                * rmap chain lock.
                 */
                i = *rmapp & KVMPPC_RMAP_INDEX;
+               hptep = (unsigned long *) (kvm->arch.hpt_virt + (i << 4));
+               if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) {
+                       /* unlock rmap before spinning on the HPTE lock */
+                       unlock_rmap(rmapp);
+                       while (hptep[0] & HPTE_V_HVLOCK)
+                               cpu_relax();
+                       continue;
+               }
                j = rev[i].forw;
                if (j == i) {
                        /* chain is now empty */
-                       j = 0;
+                       *rmapp &= ~(KVMPPC_RMAP_PRESENT | KVMPPC_RMAP_INDEX);
                } else {
                        /* remove i from chain */
                        h = rev[i].back;
                        rev[h].forw = j;
                        rev[j].back = h;
                        rev[i].forw = rev[i].back = i;
-                       j |= KVMPPC_RMAP_PRESENT;
+                       *rmapp = (*rmapp & ~KVMPPC_RMAP_INDEX) | j;
                }
-               smp_wmb();
-               *rmapp = j | (1ul << KVMPPC_RMAP_REF_BIT);
 
-               /* Now lock, check and modify the HPTE */
-               hptep = (unsigned long *) (kvm->arch.hpt_virt + (i << 4));
-               while (!try_lock_hpte(hptep, HPTE_V_HVLOCK))
-                       cpu_relax();
+               /* Now check and modify the HPTE */
                ptel = rev[i].guest_rpte;
                psize = hpte_page_size(hptep[0], ptel);
                if ((hptep[0] & HPTE_V_VALID) &&
                    hpte_rpn(ptel, psize) == gfn) {
-                       kvmppc_invalidate_hpte(kvm, hptep, i);
                        hptep[0] |= HPTE_V_ABSENT;
+                       kvmppc_invalidate_hpte(kvm, hptep, i);
+                       /* Harvest R and C */
+                       rcbits = hptep[1] & (HPTE_R_R | HPTE_R_C);
+                       *rmapp |= rcbits << KVMPPC_RMAP_RC_SHIFT;
+                       rev[i].guest_rpte = ptel | rcbits;
                }
+               unlock_rmap(rmapp);
                hptep[0] &= ~HPTE_V_HVLOCK;
        }
        return 0;
@@ -767,7 +779,7 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
        kvm_unmap_rmapp(kvm, rmapp, gfn);
        while (test_and_set_bit_lock(KVMPPC_RMAP_LOCK_BIT, rmapp))
                cpu_relax();
-       __clear_bit(KVMPPC_RMAP_REF_BIT, rmapp);
+       *rmapp &= ~KVMPPC_RMAP_REFERENCED;
        __clear_bit_unlock(KVMPPC_RMAP_LOCK_BIT, rmapp);
        return 1;
 }
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index ba4a1376b3311a6983b0bac6684dca9c5dc5be18..91b45a03f4388998544550de21fc2b3dc9fb86b5 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -87,15 +87,17 @@ EXPORT_SYMBOL_GPL(kvmppc_add_revmap_chain);
 
 /* Remove this HPTE from the chain for a real page */
 static void remove_revmap_chain(struct kvm *kvm, long pte_index,
-                               unsigned long hpte_v)
+                               struct revmap_entry *rev,
+                               unsigned long hpte_v, unsigned long hpte_r)
 {
-       struct revmap_entry *rev, *next, *prev;
+       struct revmap_entry *next, *prev;
        unsigned long gfn, ptel, head;
        struct kvm_memory_slot *memslot;
        unsigned long *rmap;
+       unsigned long rcbits;
 
-       rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
-       ptel = rev->guest_rpte;
+       rcbits = hpte_r & (HPTE_R_R | HPTE_R_C);
+       ptel = rev->guest_rpte |= rcbits;
        gfn = hpte_rpn(ptel, hpte_page_size(hpte_v, ptel));
        memslot = builtin_gfn_to_memslot(kvm, gfn);
        if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID))
@@ -116,6 +118,7 @@ static void remove_revmap_chain(struct kvm *kvm, long pte_index,
                else
                        *rmap = (*rmap & ~KVMPPC_RMAP_INDEX) | head;
        }
+       *rmap |= rcbits << KVMPPC_RMAP_RC_SHIFT;
        unlock_rmap(rmap);
 }
 
@@ -162,6 +165,7 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
        pte_t pte;
        unsigned int writing;
        unsigned long mmu_seq;
+       unsigned long rcbits;
        bool realmode = vcpu->arch.vcore->vcore_state == VCORE_RUNNING;
 
        psize = hpte_page_size(pteh, ptel);
@@ -320,6 +324,9 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
                } else {
                        kvmppc_add_revmap_chain(kvm, rev, rmap, pte_index,
                                                realmode);
+                       /* Only set R/C in real HPTE if already set in *rmap */
+                       rcbits = *rmap >> KVMPPC_RMAP_RC_SHIFT;
+                       ptel &= rcbits | ~(HPTE_R_R | HPTE_R_C);
                }
        }
 
@@ -394,7 +401,8 @@ long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags,
                        asm volatile("tlbiel %0" : : "r" (rb));
                        asm volatile("ptesync" : : : "memory");
                }
-               remove_revmap_chain(kvm, pte_index, v);
+               /* Read PTE low word after tlbie to get final R/C values */
+               remove_revmap_chain(kvm, pte_index, rev, v, hpte[1]);
        }
        r = rev->guest_rpte;
        unlock_hpte(hpte, 0);
@@ -469,12 +477,13 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu)
 
                        args[j] = ((0x80 | flags) << 56) + pte_index;
                        rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
-                       /* insert R and C bits from guest PTE */
-                       rcbits = rev->guest_rpte & (HPTE_R_R|HPTE_R_C);
-                       args[j] |= rcbits << (56 - 5);
 
-                       if (!(hp[0] & HPTE_V_VALID))
+                       if (!(hp[0] & HPTE_V_VALID)) {
+                               /* insert R and C bits from PTE */
+                               rcbits = rev->guest_rpte & (HPTE_R_R|HPTE_R_C);
+                               args[j] |= rcbits << (56 - 5);
                                continue;
+                       }
 
                        hp[0] &= ~HPTE_V_VALID;         /* leave it locked */
                        tlbrb[n] = compute_tlbie_rb(hp[0], hp[1], pte_index);
@@ -505,13 +514,16 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu)
                        asm volatile("ptesync" : : : "memory");
                }
 
+               /* Read PTE low words after tlbie to get final R/C values */
                for (k = 0; k < n; ++k) {
                        j = indexes[k];
                        pte_index = args[j] & ((1ul << 56) - 1);
                        hp = hptes[k];
                        rev = revs[k];
-                       remove_revmap_chain(kvm, pte_index, hp[0]);
-                       unlock_hpte(hp, 0);
+                       remove_revmap_chain(kvm, pte_index, rev, hp[0], hp[1]);
+                       rcbits = rev->guest_rpte & (HPTE_R_R|HPTE_R_C);
+                       args[j] |= rcbits << (56 - 5);
+                       hp[0] = 0;
                }
        }
 
@@ -595,8 +607,7 @@ long kvmppc_h_read(struct kvm_vcpu *vcpu, unsigned long flags,
                pte_index &= ~3;
                n = 4;
        }
-       if (flags & H_R_XLATE)
-               rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
+       rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
        for (i = 0; i < n; ++i, ++pte_index) {
                hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
                v = hpte[0] & ~HPTE_V_HVLOCK;
@@ -605,12 +616,8 @@ long kvmppc_h_read(struct kvm_vcpu *vcpu, unsigned long flags,
                        v &= ~HPTE_V_ABSENT;
                        v |= HPTE_V_VALID;
                }
-               if (v & HPTE_V_VALID) {
-                       if (rev)
-                               r = rev[i].guest_rpte;
-                       else
-                               r = hpte[1] | HPTE_R_RPN;
-               }
+               if (v & HPTE_V_VALID)
+                       r = rev[i].guest_rpte | (r & (HPTE_R_R | HPTE_R_C));
                vcpu->arch.gpr[4 + i * 2] = v;
                vcpu->arch.gpr[5 + i * 2] = r;
        }