KVM: PPC: Book3S: Add kernel emulation for the XICS interrupt controller
authorBenjamin Herrenschmidt <benh@kernel.crashing.org>
Wed, 17 Apr 2013 20:30:26 +0000 (20:30 +0000)
committerAlexander Graf <agraf@suse.de>
Fri, 26 Apr 2013 18:27:30 +0000 (20:27 +0200)
This adds in-kernel emulation of the XICS (eXternal Interrupt
Controller Specification) interrupt controller specified by PAPR, for
both HV and PR KVM guests.

The XICS emulation supports up to 1048560 interrupt sources.
Interrupt source numbers below 16 are reserved; 0 is used to mean no
interrupt and 2 is used for IPIs.  Internally these are represented in
blocks of 1024, called ICS (interrupt controller source) entities, but
that is not visible to userspace.

Each vcpu gets one ICP (interrupt controller presentation) entity,
used to store the per-vcpu state such as vcpu priority, pending
interrupt state, IPI request, etc.

This does not include any API or any way to connect vcpus to their
ICP state; that will be added in later patches.

This is based on an initial implementation by Michael Ellerman
<michael@ellerman.id.au> reworked by Benjamin Herrenschmidt and
Paul Mackerras.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Paul Mackerras <paulus@samba.org>
[agraf: fix typo, add dependency on !KVM_MPIC]
Signed-off-by: Alexander Graf <agraf@suse.de>
12 files changed:
arch/powerpc/include/asm/kvm_book3s.h
arch/powerpc/include/asm/kvm_host.h
arch/powerpc/include/asm/kvm_ppc.h
arch/powerpc/kvm/Kconfig
arch/powerpc/kvm/Makefile
arch/powerpc/kvm/book3s.c
arch/powerpc/kvm/book3s_hv.c
arch/powerpc/kvm/book3s_pr_papr.c
arch/powerpc/kvm/book3s_rtas.c
arch/powerpc/kvm/book3s_xics.c [new file with mode: 0644]
arch/powerpc/kvm/book3s_xics.h [new file with mode: 0644]
arch/powerpc/kvm/powerpc.c

index c55f7e6affaa2100fececb964dcfcc523f16adbe..349ed85c7d61e00dff7358e70933197704f9b9db 100644 (file)
@@ -142,6 +142,8 @@ extern int kvmppc_mmu_hv_init(void);
 extern int kvmppc_ld(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data);
 extern int kvmppc_st(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data);
 extern void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec);
+extern void kvmppc_book3s_dequeue_irqprio(struct kvm_vcpu *vcpu,
+                                         unsigned int vec);
 extern void kvmppc_inject_interrupt(struct kvm_vcpu *vcpu, int vec, u64 flags);
 extern void kvmppc_set_bat(struct kvm_vcpu *vcpu, struct kvmppc_bat *bat,
                           bool upper, u32 val);
index 311f7e6f09e99b078a590f56ce1c4541e60cb14b..af326cde7cb62bf2f07c70e6d0992e154504d5e0 100644 (file)
@@ -192,6 +192,10 @@ struct kvmppc_linear_info {
        int              type;
 };
 
+/* XICS components, defined in book3s_xics.c */
+struct kvmppc_xics;
+struct kvmppc_icp;
+
 /*
  * The reverse mapping array has one entry for each HPTE,
  * which stores the guest's view of the second word of the HPTE
@@ -264,6 +268,9 @@ struct kvm_arch {
 #ifdef CONFIG_KVM_MPIC
        struct openpic *mpic;
 #endif
+#ifdef CONFIG_KVM_XICS
+       struct kvmppc_xics *xics;
+#endif
 };
 
 /*
@@ -387,6 +394,7 @@ struct kvmppc_booke_debug_reg {
 
 #define KVMPPC_IRQ_DEFAULT     0
 #define KVMPPC_IRQ_MPIC                1
+#define KVMPPC_IRQ_XICS                2
 
 struct openpic;
 
@@ -574,6 +582,9 @@ struct kvm_vcpu_arch {
        int irq_type;           /* one of KVM_IRQ_* */
        int irq_cpu_id;
        struct openpic *mpic;   /* KVM_IRQ_MPIC */
+#ifdef CONFIG_KVM_XICS
+       struct kvmppc_icp *icp; /* XICS presentation controller */
+#endif
 
 #ifdef CONFIG_KVM_BOOK3S_64_HV
        struct kvm_vcpu_arch_shared shregs;
index 8a30eb7f2becf67e3adcc3f7ef2f78c97f79fd77..6582eed321bac0f8c1df4459f13c1ef31caf3943 100644 (file)
@@ -130,6 +130,7 @@ extern long kvmppc_prepare_vrma(struct kvm *kvm,
 extern void kvmppc_map_vrma(struct kvm_vcpu *vcpu,
                        struct kvm_memory_slot *memslot, unsigned long porder);
 extern int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu);
+
 extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
                                struct kvm_create_spapr_tce *args);
 extern long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
@@ -169,6 +170,10 @@ int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq);
 extern int kvm_vm_ioctl_rtas_define_token(struct kvm *kvm, void __user *argp);
 extern int kvmppc_rtas_hcall(struct kvm_vcpu *vcpu);
 extern void kvmppc_rtas_tokens_free(struct kvm *kvm);
+extern int kvmppc_xics_set_xive(struct kvm *kvm, u32 irq, u32 server,
+                               u32 priority);
+extern int kvmppc_xics_get_xive(struct kvm *kvm, u32 irq, u32 *server,
+                               u32 *priority);
 
 /*
  * Cuts out inst bits with ordering according to spec.
@@ -267,6 +272,30 @@ static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr)
 
 static inline void kvm_linear_init(void)
 {}
+
+#endif
+
+#ifdef CONFIG_KVM_XICS
+/* Non-zero when this vcpu's irq_type selects the in-kernel XICS emulation */
+static inline int kvmppc_xics_enabled(struct kvm_vcpu *vcpu)
+{
+       return vcpu->arch.irq_type == KVMPPC_IRQ_XICS;
+}
+/* Entry points of the XICS emulation; see book3s_xics.c for their behaviour */
+extern void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu);
+extern int kvmppc_xics_create_icp(struct kvm_vcpu *vcpu, unsigned long server);
+extern int kvm_vm_ioctl_xics_irq(struct kvm *kvm, struct kvm_irq_level *args);
+extern int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd);
+#else
+/*
+ * Stubs for kernels built without CONFIG_KVM_XICS: XICS is never reported as
+ * enabled, ICP creation fails with -EINVAL and the irq ioctl with -ENOTTY.
+ */
+static inline int kvmppc_xics_enabled(struct kvm_vcpu *vcpu)
+       { return 0; }
+static inline void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu) { }
+static inline int kvmppc_xics_create_icp(struct kvm_vcpu *vcpu,
+                                        unsigned long server)
+       { return -EINVAL; }
+static inline int kvm_vm_ioctl_xics_irq(struct kvm *kvm,
+                                       struct kvm_irq_level *args)
+       { return -ENOTTY; }
+static inline int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd)
+       { return 0; }
 #endif
 
 static inline void kvmppc_set_epr(struct kvm_vcpu *vcpu, u32 epr)
index 656e0bc29fe880eb713c04ed4dc4218bad44ef4f..eb643f8625796711f93fbad6d92022f957517df9 100644 (file)
@@ -163,6 +163,14 @@ config KVM_MPIC
           Currently, support is limited to certain versions of
           Freescale's MPIC implementation.
 
+config KVM_XICS
+       bool "KVM in-kernel XICS emulation"
+       depends on KVM_BOOK3S_64 && !KVM_MPIC
+       ---help---
+         Include support for the XICS (eXternal Interrupt Controller
+         Specification) interrupt controller architecture used on
+         IBM POWER (pSeries) servers.
+
 source drivers/vhost/Kconfig
 
 endif # VIRTUALIZATION
index 3faf5c07329c98f9d9f68ec071eba07ad09be042..f9b87b540450e57cb87d645602965952ea366669 100644 (file)
@@ -79,6 +79,9 @@ kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \
        book3s_hv_ras.o \
        book3s_hv_builtin.o
 
+kvm-book3s_64-objs-$(CONFIG_KVM_XICS) += \
+       book3s_xics.o
+
 kvm-book3s_64-module-objs := \
        ../../../virt/kvm/kvm_main.o \
        ../../../virt/kvm/eventfd.o \
index 128ed3a856b93adb1bc8548a7b89f6a142554209..1a4d787df507f4bb5d21d2c8e11cf54fe989f50f 100644 (file)
@@ -104,7 +104,7 @@ static int kvmppc_book3s_vec2irqprio(unsigned int vec)
        return prio;
 }
 
-static void kvmppc_book3s_dequeue_irqprio(struct kvm_vcpu *vcpu,
+void kvmppc_book3s_dequeue_irqprio(struct kvm_vcpu *vcpu,
                                          unsigned int vec)
 {
        unsigned long old_pending = vcpu->arch.pending_exceptions;
index f3d7af7981c7d457945b102dc1892be398104a4a..82ba00f68b074392a886a9de191fd227297e97cb 100644 (file)
@@ -532,6 +532,15 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
 
                /* Send the error out to userspace via KVM_RUN */
                return rc;
+
+       case H_XIRR:
+       case H_CPPR:
+       case H_EOI:
+       case H_IPI:
+               if (kvmppc_xics_enabled(vcpu)) {
+                       ret = kvmppc_xics_hcall(vcpu, req);
+                       break;
+               } /* fallthrough */
        default:
                return RESUME_HOST;
        }
index 4efa4a4f3722a430f1cf95a46eee4fa8c5b09f90..b24309c6c2d507d3f2c008358c3c0728d450a7cd 100644 (file)
@@ -227,6 +227,13 @@ static int kvmppc_h_pr_put_tce(struct kvm_vcpu *vcpu)
        return EMULATE_DONE;
 }
 
+static int kvmppc_h_pr_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd)
+{
+       /* Run the XICS hcall and store its result in guest GPR 3 */
+       kvmppc_set_gpr(vcpu, 3, (long)kvmppc_xics_hcall(vcpu, cmd));
+       return EMULATE_DONE;
+}
+
 int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd)
 {
        switch (cmd) {
@@ -246,6 +253,13 @@ int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd)
                clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
                vcpu->stat.halt_wakeup++;
                return EMULATE_DONE;
+       case H_XIRR:
+       case H_CPPR:
+       case H_EOI:
+       case H_IPI:
+               if (kvmppc_xics_enabled(vcpu))
+                       return kvmppc_h_pr_xics_hcall(vcpu, cmd);
+               break;
        case H_RTAS:
                if (list_empty(&vcpu->kvm->arch.rtas_tokens))
                        return RESUME_HOST;
index 6ad7050eb67d5a7485f443eb404ca42d13904585..77f9aa5f4ba578495ac8ffff562059865ed729ad 100644 (file)
 #include <asm/hvcall.h>
 #include <asm/rtas.h>
 
+#ifdef CONFIG_KVM_XICS
+/*
+ * RTAS "ibm,set-xive": takes (irq, server, priority) and returns one status
+ * word.  The status is 0 on success and -3 when the argument/return counts
+ * are wrong or kvmppc_xics_set_xive() reports any error.
+ */
+static void kvm_rtas_set_xive(struct kvm_vcpu *vcpu, struct rtas_args *args)
+{
+       int status = -3;
+
+       if (args->nargs == 3 && args->nret == 1) {
+               u32 irq = args->args[0];
+               u32 server = args->args[1];
+               u32 priority = args->args[2];
+
+               if (!kvmppc_xics_set_xive(vcpu->kvm, irq, server, priority))
+                       status = 0;
+       }
+
+       args->rets[0] = status;
+}
+
+/*
+ * RTAS "ibm,get-xive": takes one irq number and returns (status, server,
+ * priority).  The status is -3 when the argument/return counts are wrong or
+ * the lookup fails; in that case rets[1] and rets[2] are left untouched.
+ */
+static void kvm_rtas_get_xive(struct kvm_vcpu *vcpu, struct rtas_args *args)
+{
+       u32 server = 0, priority = 0;
+       int status = -3;
+
+       if (args->nargs == 1 && args->nret == 3) {
+               u32 irq = args->args[0];
+
+               if (!kvmppc_xics_get_xive(vcpu->kvm, irq, &server, &priority)) {
+                       args->rets[1] = server;
+                       args->rets[2] = priority;
+                       status = 0;
+               }
+       }
+
+       args->rets[0] = status;
+}
+#endif /* CONFIG_KVM_XICS */
 
 struct rtas_handler {
        void (*handler)(struct kvm_vcpu *vcpu, struct rtas_args *args);
        char *name;
 };
 
-static struct rtas_handler rtas_handlers[] = { };
+/*
+ * RTAS service names paired with their handler functions.  The XICS entries
+ * are only present when CONFIG_KVM_XICS is set; without it the table is empty.
+ */
+static struct rtas_handler rtas_handlers[] = {
+#ifdef CONFIG_KVM_XICS
+       { .name = "ibm,set-xive", .handler = kvm_rtas_set_xive },
+       { .name = "ibm,get-xive", .handler = kvm_rtas_get_xive },
+#endif
+};
 
 struct rtas_token_definition {
        struct list_head list;
diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c
new file mode 100644 (file)
index 0000000..53af848
--- /dev/null
@@ -0,0 +1,946 @@
+/*
+ * Copyright 2012 Michael Ellerman, IBM Corporation.
+ * Copyright 2012 Benjamin Herrenschmidt, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/kvm_host.h>
+#include <linux/err.h>
+#include <linux/gfp.h>
+
+#include <asm/uaccess.h>
+#include <asm/kvm_book3s.h>
+#include <asm/kvm_ppc.h>
+#include <asm/hvcall.h>
+#include <asm/xics.h>
+#include <asm/debug.h>
+
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+
+#include "book3s_xics.h"
+
+/*
+ * Debug tracing hook.  With "#if 1" XICS_DBG() expands to an empty statement;
+ * change the condition to 0 to route the messages through trace_printk().
+ */
+#if 1
+#define XICS_DBG(fmt...) do { } while (0)
+#else
+#define XICS_DBG(fmt...) trace_printk(fmt)
+#endif
+
+/*
+ * LOCKING
+ * =======
+ *
+ * Each ICS has a mutex protecting the information about the IRQ
+ * sources and avoiding simultaneous deliveries of the same interrupt.
+ *
+ * ICP operations are done via a single compare & swap transaction
+ * (most ICP state fits in the union kvmppc_icp_state)
+ */
+
+/*
+ * TODO
+ * ====
+ *
+ * - To speed up resends, keep a bitmap of "resend" set bits in the
+ *   ICS
+ *
+ * - Speed up server# -> ICP lookup (array ? hash table ?)
+ *
+ * - Make ICS lockless as well, or at least a per-interrupt lock or hashed
+ *   locks array to improve scalability
+ *
+ * - ioctl's to save/restore the entire state for snapshot & migration
+ */
+
+/* -- ICS routines -- */
+
+static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
+                           u32 new_irq);
+
+/*
+ * Record a level change on interrupt source "irq" and, unless the line is
+ * being de-asserted, attempt to deliver it.
+ *
+ * Returns 0 on success, or -EINVAL if the number maps to no ICS or to a
+ * source that is not marked as existing.
+ */
+static int ics_deliver_irq(struct kvmppc_xics *xics, u32 irq, u32 level)
+{
+       struct kvmppc_ics *ics;
+       struct ics_irq_state *source;
+       u16 idx;
+
+       XICS_DBG("ics deliver %#x (level: %d)\n", irq, level);
+
+       /* Translate the global interrupt number into an ICS and a slot */
+       ics = kvmppc_xics_find_ics(xics, irq, &idx);
+       if (!ics) {
+               XICS_DBG("ics_deliver_irq: IRQ 0x%06x not found !\n", irq);
+               return -EINVAL;
+       }
+
+       source = &ics->irq_state[idx];
+       if (!source->exists)
+               return -EINVAL;
+
+       /*
+        * The asserted flag is written without taking the ICS lock.  This
+        * function is its only writer, so concurrent callers racing on the
+        * same source have no defined ordering anyway.
+        */
+       if (level == KVM_INTERRUPT_SET_LEVEL) {
+               source->asserted = 1;
+       } else if (level == KVM_INTERRUPT_UNSET) {
+               /* Line dropped: remember that, there is nothing to deliver */
+               source->asserted = 0;
+               return 0;
+       }
+
+       /* No ICP hint here; icp_deliver_irq() looks up the server itself */
+       icp_deliver_irq(xics, NULL, irq);
+       return 0;
+}
+
+/*
+ * Scan every source of this ICS and retry delivery of those flagged
+ * for resend, passing "icp" along as the delivery hint.
+ */
+static void ics_check_resend(struct kvmppc_xics *xics, struct kvmppc_ics *ics,
+                            struct kvmppc_icp *icp)
+{
+       int i;
+
+       mutex_lock(&ics->lock);
+
+       for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
+               struct ics_irq_state *state = &ics->irq_state[i];
+
+               if (!state->resend)
+                       continue;
+
+               XICS_DBG("resend %#x prio %#x\n", state->number,
+                             state->priority);
+
+               /*
+                * icp_deliver_irq() takes an ICS mutex itself, so ours is
+                * dropped around the call and re-taken before the scan
+                * continues.
+                */
+               mutex_unlock(&ics->lock);
+               icp_deliver_irq(xics, icp, state->number);
+               mutex_lock(&ics->lock);
+       }
+
+       mutex_unlock(&ics->lock);
+}
+
+/*
+ * Set the target server and priority of interrupt source "irq".
+ *
+ * If the source was latched as masked-pending or was waiting for a resend,
+ * and the new priority is not MASKED, a delivery is attempted once the ICS
+ * lock has been released.
+ *
+ * Returns 0 on success, -ENODEV if the VM has no XICS, or -EINVAL if either
+ * the interrupt or the server cannot be found.
+ */
+int kvmppc_xics_set_xive(struct kvm *kvm, u32 irq, u32 server, u32 priority)
+{
+       struct kvmppc_xics *xics = kvm->arch.xics;
+       struct ics_irq_state *source;
+       struct kvmppc_ics *ics;
+       struct kvmppc_icp *target;
+       bool deliver;
+       u16 idx;
+
+       if (!xics)
+               return -ENODEV;
+
+       ics = kvmppc_xics_find_ics(xics, irq, &idx);
+       if (!ics)
+               return -EINVAL;
+       source = &ics->irq_state[idx];
+
+       target = kvmppc_xics_find_server(kvm, server);
+       if (!target)
+               return -EINVAL;
+
+       mutex_lock(&ics->lock);
+
+       XICS_DBG("set_xive %#x server %#x prio %#x MP:%d RS:%d\n",
+                irq, server, priority,
+                source->masked_pending, source->resend);
+
+       source->server = server;
+       source->priority = priority;
+
+       /* An unmasked source with something outstanding must be (re)sent */
+       deliver = (source->masked_pending || source->resend) &&
+                 priority != MASKED;
+       if (deliver)
+               source->masked_pending = 0;
+
+       mutex_unlock(&ics->lock);
+
+       if (deliver)
+               icp_deliver_irq(xics, target, irq);
+
+       return 0;
+}
+
+/*
+ * Read back the server and priority currently configured for interrupt
+ * source "irq".  Both values are sampled under the ICS lock.
+ *
+ * Returns 0 on success, -ENODEV if the VM has no XICS, or -EINVAL if the
+ * interrupt number maps to no ICS.
+ */
+int kvmppc_xics_get_xive(struct kvm *kvm, u32 irq, u32 *server, u32 *priority)
+{
+       struct kvmppc_xics *xics = kvm->arch.xics;
+       struct ics_irq_state *source;
+       struct kvmppc_ics *ics;
+       u16 idx;
+
+       if (!xics)
+               return -ENODEV;
+
+       ics = kvmppc_xics_find_ics(xics, irq, &idx);
+       if (!ics)
+               return -EINVAL;
+
+       source = &ics->irq_state[idx];
+
+       mutex_lock(&ics->lock);
+       *server = source->server;
+       *priority = source->priority;
+       mutex_unlock(&ics->lock);
+
+       return 0;
+}
+
+/* -- ICP routines, including hcalls -- */
+
+/*
+ * Atomically replace the ICP state "old" with "new".
+ *
+ * The interrupt output bit of "new" is recomputed before the swap.  Returns
+ * true if the compare-and-swap succeeded; false means the state changed
+ * underneath us, and callers re-read the state and retry.  On success with
+ * the output asserted, the external interrupt is queued on the vcpu, which
+ * is also kicked unless change_self is set.
+ */
+static inline bool icp_try_update(struct kvmppc_icp *icp,
+                                 union kvmppc_icp_state old,
+                                 union kvmppc_icp_state new,
+                                 bool change_self)
+{
+       bool success;
+
+       /* Calculate new output value */
+       new.out_ee = (new.xisr && (new.pending_pri < new.cppr));
+
+       /* Attempt atomic update */
+       success = cmpxchg64(&icp->state.raw, old.raw, new.raw) == old.raw;
+       if (!success)
+               goto bail;
+
+       XICS_DBG("UPD [%04x] - C:%02x M:%02x PP: %02x PI:%06x R:%d O:%d\n",
+                icp->server_num,
+                old.cppr, old.mfrr, old.pending_pri, old.xisr,
+                old.need_resend, old.out_ee);
+       XICS_DBG("UPD        - C:%02x M:%02x PP: %02x PI:%06x R:%d O:%d\n",
+                new.cppr, new.mfrr, new.pending_pri, new.xisr,
+                new.need_resend, new.out_ee);
+       /*
+        * Check for output state update
+        *
+        * Note that this is racy since another processor could be updating
+        * the state already. This is why we never clear the interrupt output
+        * here, we only ever set it. The clear only happens prior to doing
+        * an update and only by the processor itself. Currently we do it
+        * in Accept (H_XIRR) and Up_Cppr (H_CPPR).
+        *
+        * We also do not try to figure out whether the EE state has changed,
+        * we unconditionally set it if the new state calls for it for the
+        * same reason.
+        */
+       if (new.out_ee) {
+               kvmppc_book3s_queue_irqprio(icp->vcpu,
+                                           BOOK3S_INTERRUPT_EXTERNAL_LEVEL);
+               if (!change_self)
+                       kvm_vcpu_kick(icp->vcpu);
+       }
+ bail:
+       return success;
+}
+
+/*
+ * Walk the ICP's resend_map and, for every ICS whose bit we manage to
+ * clear, rescan that ICS for sources that need a resend.
+ */
+static void icp_check_resend(struct kvmppc_xics *xics,
+                            struct kvmppc_icp *icp)
+{
+       u32 icsid;
+
+       /* Order this load with the test for need_resend in the caller */
+       smp_rmb();
+       for_each_set_bit(icsid, icp->resend_map, xics->max_icsid + 1) {
+               struct kvmppc_ics *ics = xics->ics[icsid];
+
+               /* Skip the ICS if its bit was already cleared elsewhere */
+               if (!test_and_clear_bit(icsid, icp->resend_map))
+                       continue;
+               if (!ics)
+                       continue;
+               ics_check_resend(xics, ics, icp);
+       }
+}
+
+/*
+ * Try to make "irq" the pending interrupt of this ICP.
+ *
+ * Delivery succeeds only if "priority" is numerically lower than the ICP's
+ * CPPR, MFRR and current pending priority.  On success *reject receives the
+ * XISR value that was displaced (0 if there was none).  On failure *reject
+ * is 0 and need_resend is set in the ICP state.  Returns whether the
+ * delivery succeeded.
+ */
+static bool icp_try_to_deliver(struct kvmppc_icp *icp, u32 irq, u8 priority,
+                              u32 *reject)
+{
+       union kvmppc_icp_state old_state, new_state;
+       bool success;
+
+       XICS_DBG("try deliver %#x(P:%#x) to server %#x\n", irq, priority,
+                icp->server_num);
+
+       /* Re-evaluate from a fresh snapshot until the atomic update lands */
+       do {
+               old_state = new_state = ACCESS_ONCE(icp->state);
+
+               *reject = 0;
+
+               /* See if we can deliver */
+               success = new_state.cppr > priority &&
+                       new_state.mfrr > priority &&
+                       new_state.pending_pri > priority;
+
+               /*
+                * If we can, check for a rejection and perform the
+                * delivery
+                */
+               if (success) {
+                       *reject = new_state.xisr;
+                       new_state.xisr = irq;
+                       new_state.pending_pri = priority;
+               } else {
+                       /*
+                        * If we failed to deliver we set need_resend
+                        * so a subsequent CPPR state change causes us
+                        * to try a new delivery.
+                        */
+                       new_state.need_resend = true;
+               }
+
+       } while (!icp_try_update(icp, old_state, new_state, false));
+
+       return success;
+}
+
+/*
+ * Deliver (or re-deliver) interrupt new_irq to the ICP of its current server.
+ *
+ * "icp" is only a hint and may be NULL: the server is looked up again
+ * whenever the hint is missing or does not match the server configured for
+ * the source.  The function loops until the interrupt, and any interrupt it
+ * displaced along the way, has either been presented, latched as
+ * masked-pending, or marked for resend.
+ */
+static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
+                           u32 new_irq)
+{
+       struct ics_irq_state *state;
+       struct kvmppc_ics *ics;
+       u32 reject;
+       u16 src;
+
+       /*
+        * This is used both for initial delivery of an interrupt and
+        * for subsequent rejection.
+        *
+        * Rejection can be racy vs. resends. We have evaluated the
+        * rejection in an atomic ICP transaction which is now complete,
+        * so potentially the ICP can already accept the interrupt again.
+        *
+        * So we need to retry the delivery. Essentially the reject path
+        * boils down to a failed delivery. Always.
+        *
+        * Now the interrupt could also have moved to a different target,
+        * thus we may need to re-do the ICP lookup as well
+        */
+
+ again:
+       /* Get the ICS state and lock it */
+       ics = kvmppc_xics_find_ics(xics, new_irq, &src);
+       if (!ics) {
+               XICS_DBG("icp_deliver_irq: IRQ 0x%06x not found !\n", new_irq);
+               return;
+       }
+       state = &ics->irq_state[src];
+
+       /* Get a lock on the ICS */
+       mutex_lock(&ics->lock);
+
+       /* Get our server */
+       if (!icp || state->server != icp->server_num) {
+               icp = kvmppc_xics_find_server(xics->kvm, state->server);
+               if (!icp) {
+                       pr_warn("icp_deliver_irq: IRQ 0x%06x server 0x%x not found !\n",
+                               new_irq, state->server);
+                       goto out;
+               }
+       }
+
+       /* Clear the resend bit of that interrupt */
+       state->resend = 0;
+
+       /*
+        * If masked, bail out
+        *
+        * Note: PAPR doesn't mention anything about masked pending
+        * when doing a resend, only when doing a delivery.
+        *
+        * However that would have the effect of losing a masked
+        * interrupt that was rejected and isn't consistent with
+        * the whole masked_pending business which is about not
+        * losing interrupts that occur while masked.
+        *
+        * I don't differentiate normal deliveries and resends, this
+        * implementation will differ from PAPR and not lose such
+        * interrupts.
+        */
+       if (state->priority == MASKED) {
+               XICS_DBG("irq %#x masked pending\n", new_irq);
+               state->masked_pending = 1;
+               goto out;
+       }
+
+       /*
+        * Try the delivery, this will set the need_resend flag
+        * in the ICP as part of the atomic transaction if the
+        * delivery is not possible.
+        *
+        * Note that if successful, the new delivery might have itself
+        * rejected an interrupt that was "delivered" before we took the
+        * ICS mutex.
+        *
+        * In this case we do the whole sequence all over again for the
+        * new guy. We cannot assume that the rejected interrupt is less
+        * favored than the new one, and thus doesn't need to be delivered,
+        * because by the time we exit icp_try_to_deliver() the target
+        * processor may well have already consumed & completed it, and thus
+        * the rejected interrupt might actually be already acceptable.
+        */
+       if (icp_try_to_deliver(icp, new_irq, state->priority, &reject)) {
+               /*
+                * Delivery was successful, did we reject somebody else ?
+                */
+               if (reject && reject != XICS_IPI) {
+                       mutex_unlock(&ics->lock);
+                       new_irq = reject;
+                       goto again;
+               }
+       } else {
+               /*
+                * We failed to deliver the interrupt we need to set the
+                * resend map bit and mark the ICS state as needing a resend
+                */
+               set_bit(ics->icsid, icp->resend_map);
+               state->resend = 1;
+
+               /*
+                * If the need_resend flag got cleared in the ICP some time
+                * between icp_try_to_deliver() atomic update and now, then
+                * we know it might have missed the resend_map bit. So we
+                * retry
+                */
+               smp_mb();
+               if (!icp->state.need_resend) {
+                       mutex_unlock(&ics->lock);
+                       goto again;
+               }
+       }
+ out:
+       mutex_unlock(&ics->lock);
+}
+
+/*
+ * Load new_cppr into the ICP's CPPR, present a latched IPI if the MFRR now
+ * qualifies, and run the resend scan if the ICP had need_resend set.
+ *
+ * Every update is made with change_self set, so the vcpu is never kicked
+ * from here.
+ */
+static void icp_down_cppr(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
+                         u8 new_cppr)
+{
+       union kvmppc_icp_state old_state, new_state;
+       bool resend;
+
+       /*
+        * This handles several related states in one operation:
+        *
+        * ICP State: Down_CPPR
+        *
+        * Load CPPR with new value and if the XISR is 0
+        * then check for resends:
+        *
+        * ICP State: Resend
+        *
+        * If MFRR is more favored than CPPR, check for IPIs
+        * and notify ICS of a potential resend. This is done
+        * asynchronously (when used in real mode, we will have
+        * to exit here).
+        *
+        * We do not handle the complete Check_IPI as documented
+        * here. In the PAPR, this state will be used for both
+        * Set_MFRR and Down_CPPR. However, we know that we aren't
+        * changing the MFRR state here so we don't need to handle
+        * the case of an MFRR causing a reject of a pending irq,
+        * this will have been handled when the MFRR was set in the
+        * first place.
+        *
+        * Thus we don't have to handle rejects, only resends.
+        *
+        * When implementing real mode for HV KVM, resend will lead to
+        * a H_TOO_HARD return and the whole transaction will be handled
+        * in virtual mode.
+        */
+       do {
+               old_state = new_state = ACCESS_ONCE(icp->state);
+
+               /* Down_CPPR */
+               new_state.cppr = new_cppr;
+
+               /*
+                * Cut down Resend / Check_IPI / IPI
+                *
+                * The logic is that we cannot have a pending interrupt
+                * trumped by an IPI at this point (see above), so we
+                * know that either the pending interrupt is already an
+                * IPI (in which case we don't care to override it) or
+                * it's either more favored than us or non existent
+                */
+               if (new_state.mfrr < new_cppr &&
+                   new_state.mfrr <= new_state.pending_pri) {
+                       WARN_ON(new_state.xisr != XICS_IPI &&
+                               new_state.xisr != 0);
+                       new_state.pending_pri = new_state.mfrr;
+                       new_state.xisr = XICS_IPI;
+               }
+
+               /* Latch/clear resend bit */
+               resend = new_state.need_resend;
+               new_state.need_resend = 0;
+
+       } while (!icp_try_update(icp, old_state, new_state, true));
+
+       /*
+        * Now handle resend checks. Those are asynchronous to the ICP
+        * state update in HW (ie bus transactions) so we can handle them
+        * separately here too
+        */
+       if (resend)
+               icp_check_resend(xics, icp);
+}
+
+/*
+ * H_XIRR: accept the pending interrupt of the calling vcpu.
+ *
+ * Returns the XIRR value: the pending interrupt source (XISR) in the low
+ * 24 bits and the CPPR in effect before the call in the top 8 bits.  When
+ * nothing is pending the ICP state is left unchanged.
+ */
+static noinline unsigned long h_xirr(struct kvm_vcpu *vcpu)
+{
+       union kvmppc_icp_state old_state, new_state;
+       struct kvmppc_icp *icp = vcpu->arch.icp;
+       u32 xirr;
+
+       /* First, remove EE from the processor */
+       kvmppc_book3s_dequeue_irqprio(icp->vcpu,
+                                     BOOK3S_INTERRUPT_EXTERNAL_LEVEL);
+
+       /*
+        * ICP State: Accept_Interrupt
+        *
+        * Return the pending interrupt (if any) along with the
+        * current CPPR, then clear the XISR & set CPPR to the
+        * pending priority
+        */
+       do {
+               old_state = new_state = ACCESS_ONCE(icp->state);
+
+               xirr = old_state.xisr | (((u32)old_state.cppr) << 24);
+               /* Nothing pending: report the CPPR only and update nothing */
+               if (!old_state.xisr)
+                       break;
+               new_state.cppr = new_state.pending_pri;
+               new_state.pending_pri = 0xff;
+               new_state.xisr = 0;
+
+       } while (!icp_try_update(icp, old_state, new_state, true));
+
+       XICS_DBG("h_xirr vcpu %d xirr %#x\n", vcpu->vcpu_id, xirr);
+
+       return xirr;
+}
+
+/*
+ * H_IPI: set the MFRR of the ICP serving "server".
+ *
+ * Writing an MFRR that is more favored than the target's CPPR raises an IPI,
+ * provided no strictly more-favored interrupt is already pending there.  An
+ * interrupt displaced by the IPI is handed back to icp_deliver_irq(), and a
+ * less favored MFRR may trigger a resend scan.
+ *
+ * Returns H_SUCCESS, or H_PARAMETER if no ICP exists for "server".
+ */
+static noinline int h_ipi(struct kvm_vcpu *vcpu, unsigned long server,
+                         unsigned long mfrr)
+{
+       union kvmppc_icp_state old_state, new_state;
+       struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
+       struct kvmppc_icp *icp;
+       u32 reject;
+       bool resend;
+       bool local;
+
+       XICS_DBG("h_ipi vcpu %d to server %lu mfrr %#lx\n",
+                vcpu->vcpu_id, server, mfrr);
+
+       /* Use our own ICP when targeting ourselves, else look the target up */
+       icp = vcpu->arch.icp;
+       local = icp->server_num == server;
+       if (!local) {
+               icp = kvmppc_xics_find_server(vcpu->kvm, server);
+               if (!icp)
+                       return H_PARAMETER;
+       }
+
+       /*
+        * ICP state: Set_MFRR
+        *
+        * If the CPPR is more favored than the new MFRR, then
+        * nothing needs to be rejected as there can be no XISR to
+        * reject.  If the MFRR is being made less favored then
+        * there might be a previously-rejected interrupt needing
+        * to be resent.
+        *
+        * If the CPPR is less favored, then we might be replacing
+        * an interrupt, and thus need to possibly reject it as in
+        *
+        * ICP state: Check_IPI
+        */
+       do {
+               old_state = new_state = ACCESS_ONCE(icp->state);
+
+               /* Set_MFRR */
+               new_state.mfrr = mfrr;
+
+               /* Check_IPI */
+               reject = 0;
+               resend = false;
+               if (mfrr < new_state.cppr) {
+                       /*
+                        * Only install the IPI when it is at least as
+                        * favored as whatever is pending, rejecting that
+                        * interrupt if it is not an IPI.  A strictly more
+                        * favored pending interrupt must be left in place:
+                        * overwriting it without a reject would lose it.
+                        * The MFRR stays latched, and icp_down_cppr()
+                        * presents the IPI once it qualifies.
+                        */
+                       if (mfrr <= new_state.pending_pri) {
+                               reject = new_state.xisr;
+                               new_state.pending_pri = mfrr;
+                               new_state.xisr = XICS_IPI;
+                       }
+               }
+
+               /* MFRR became less favored: latch and clear need_resend */
+               if (mfrr > old_state.mfrr && mfrr > new_state.cppr) {
+                       resend = new_state.need_resend;
+                       new_state.need_resend = 0;
+               }
+       } while (!icp_try_update(icp, old_state, new_state, local));
+
+       /* Handle reject */
+       if (reject && reject != XICS_IPI)
+               icp_deliver_irq(xics, icp, reject);
+
+       /* Handle resend */
+       if (resend)
+               icp_check_resend(xics, icp);
+
+       return H_SUCCESS;
+}
+
+/*
+ * H_CPPR: set the CPPR of the calling vcpu's ICP to "cppr".
+ *
+ * A numerically larger value goes through icp_down_cppr().  An unchanged
+ * value returns immediately.  Otherwise the Up_CPPR sequence below runs and
+ * may reject the currently pending interrupt, which is then re-delivered.
+ */
+static noinline void h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr)
+{
+       union kvmppc_icp_state old_state, new_state;
+       struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
+       struct kvmppc_icp *icp = vcpu->arch.icp;
+       u32 reject;
+
+       XICS_DBG("h_cppr vcpu %d cppr %#lx\n", vcpu->vcpu_id, cppr);
+
+       /*
+        * ICP State: Set_CPPR
+        *
+        * We can safely compare the new value with the current
+        * value outside of the transaction as the CPPR is only
+        * ever changed by the processor on itself
+        *
+        * NOTE(review): after icp_down_cppr() execution continues into
+        * the Up_CPPR sequence below rather than returning - confirm
+        * that this is intended.
+        */
+       if (cppr > icp->state.cppr)
+               icp_down_cppr(xics, icp, cppr);
+       else if (cppr == icp->state.cppr)
+               return;
+
+       /*
+        * ICP State: Up_CPPR
+        *
+        * The processor is raising its priority, this can result
+        * in a rejection of a pending interrupt:
+        *
+        * ICP State: Reject_Current
+        *
+        * We can remove EE from the current processor, the update
+        * transaction will set it again if needed
+        */
+       kvmppc_book3s_dequeue_irqprio(icp->vcpu,
+                                     BOOK3S_INTERRUPT_EXTERNAL_LEVEL);
+
+       do {
+               old_state = new_state = ACCESS_ONCE(icp->state);
+
+               reject = 0;
+               new_state.cppr = cppr;
+
+               /* Pending interrupt is no longer more favored than the CPPR */
+               if (cppr <= new_state.pending_pri) {
+                       reject = new_state.xisr;
+                       new_state.xisr = 0;
+                       new_state.pending_pri = 0xff;
+               }
+
+       } while (!icp_try_update(icp, old_state, new_state, true));
+
+       /*
+        * Check for rejects. They are handled by doing a new delivery
+        * attempt (see comments in icp_deliver_irq).
+        */
+       if (reject && reject != XICS_IPI)
+               icp_deliver_irq(xics, icp, reject);
+}
+
+/*
+ * Handle the H_EOI hypercall for @vcpu.
+ *
+ * @xirr carries the CPPR value to restore in its top byte (bits 24 and up)
+ * and the source number being EOI'd in its low 24 bits.
+ *
+ * Returns H_SUCCESS, or H_PARAMETER when the source number does not map to
+ * an existing ICS.
+ */
+static noinline int h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
+{
+       struct kvmppc_icp *icp = vcpu->arch.icp;
+       struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
+       u32 source_nr = xirr & 0x00ffffff;
+       struct kvmppc_ics *ics;
+       u16 idx;
+
+       XICS_DBG("h_eoi vcpu %d eoi %#lx\n", vcpu->vcpu_id, xirr);
+
+       /*
+        * ICP State: EOI
+        *
+        * The CPPR part is handled as Down_CPPR, in the helper shared
+        * with H_CPPR.  If SW incorrectly uses EOI to lower the CPPR
+        * value (ie make it more favored), no check for rejection of a
+        * pending interrupt is made: that is a SW error and PAPR
+        * specifies that we don't have to deal with it.
+        *
+        * The EOI towards the ICS is only looked at after the CPPR
+        * update.
+        */
+       icp_down_cppr(xics, icp, xirr >> 24);
+
+       /* IPIs have no EOI, everything below is for real sources only */
+       if (source_nr != XICS_IPI) {
+               /*
+                * A lockless "peek" at the ICS state is good enough here.
+                * "Message" interrupts will never have "asserted" set.
+                */
+               ics = kvmppc_xics_find_ics(xics, source_nr, &idx);
+               if (!ics) {
+                       XICS_DBG("h_eoi: IRQ 0x%06x not found !\n", source_nr);
+                       return H_PARAMETER;
+               }
+
+               /* Source still asserted: it has to be presented again */
+               if (ics->irq_state[idx].asserted)
+                       icp_deliver_irq(xics, icp, source_nr);
+       }
+
+       return H_SUCCESS;
+}
+
+/*
+ * Dispatch one of the XICS hypercalls (H_XIRR, H_CPPR, H_EOI, H_IPI) for
+ * @vcpu.  Arguments are taken from GPR4 (and GPR5 for H_IPI); the H_XIRR
+ * result is written back to GPR4.
+ *
+ * Returns H_HARDWARE when the vcpu has no ICP or the VM has no XICS,
+ * otherwise the status of the handler.  Any other @req value is not
+ * acted upon and yields H_SUCCESS.
+ */
+int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 req)
+{
+       unsigned long xirr;
+
+       /* Both the per-vcpu ICP and the per-VM XICS must exist */
+       if (!(vcpu->arch.icp && vcpu->kvm->arch.xics))
+               return H_HARDWARE;
+
+       switch (req) {
+       case H_XIRR:
+               xirr = h_xirr(vcpu);
+               kvmppc_set_gpr(vcpu, 4, xirr);
+               return H_SUCCESS;
+       case H_CPPR:
+               h_cppr(vcpu, kvmppc_get_gpr(vcpu, 4));
+               return H_SUCCESS;
+       case H_EOI:
+               return h_eoi(vcpu, kvmppc_get_gpr(vcpu, 4));
+       case H_IPI:
+               return h_ipi(vcpu, kvmppc_get_gpr(vcpu, 4),
+                            kvmppc_get_gpr(vcpu, 5));
+       }
+
+       return H_SUCCESS;
+}
+
+
+/* -- Initialisation code etc. -- */
+
+/*
+ * seq_file show callback for the XICS debugfs file.
+ *
+ * Prints one line per vcpu that has an ICP, followed by one section per
+ * allocated ICS listing every source of that ICS.  Always returns 0.
+ */
+static int xics_debug_show(struct seq_file *m, void *private)
+{
+       struct kvmppc_xics *xics = m->private;
+       struct kvm *kvm = xics->kvm;
+       struct kvm_vcpu *vcpu;
+       int id, n;
+
+       if (!kvm)
+               return 0;
+
+       seq_printf(m, "=========\nICP state\n=========\n");
+
+       kvm_for_each_vcpu(n, vcpu, kvm) {
+               union kvmppc_icp_state snap;
+               struct kvmppc_icp *icp = vcpu->arch.icp;
+
+               if (icp) {
+                       /* One read of the raw word so all fields agree */
+                       snap.raw = ACCESS_ONCE(icp->state.raw);
+                       seq_printf(m, "cpu server %#lx XIRR:%#x PPRI:%#x CPPR:%#x MFRR:%#x OUT:%d NR:%d\n",
+                                  icp->server_num, snap.xisr,
+                                  snap.pending_pri, snap.cppr, snap.mfrr,
+                                  snap.out_ee, snap.need_resend);
+               }
+       }
+
+       for (id = 0; id <= KVMPPC_XICS_MAX_ICS_ID; id++) {
+               struct kvmppc_ics *ics = xics->ics[id];
+               struct ics_irq_state *src, *end;
+
+               /* Slots without an ICS are simply not reported */
+               if (!ics)
+                       continue;
+
+               seq_printf(m, "=========\nICS state for ICS 0x%x\n=========\n",
+                          id);
+
+               /* The per-ICS lock is held while its sources are dumped */
+               mutex_lock(&ics->lock);
+
+               end = &ics->irq_state[KVMPPC_XICS_IRQ_PER_ICS];
+               for (src = ics->irq_state; src < end; src++)
+                       seq_printf(m, "irq 0x%06x: server %#x prio %#x save prio %#x asserted %d resend %d masked pending %d\n",
+                                  src->number, src->server, src->priority,
+                                  src->saved_priority, src->asserted,
+                                  src->resend, src->masked_pending);
+
+               mutex_unlock(&ics->lock);
+       }
+
+       return 0;
+}
+
+/*
+ * open() for the debugfs file: bind xics_debug_show() to the pointer that
+ * was stored in the inode's i_private when the file was created.
+ */
+static int xics_debug_open(struct inode *inode, struct file *file)
+{
+       void *show_data = inode->i_private;
+
+       return single_open(file, xics_debug_show, show_data);
+}
+
+/* File operations of the XICS debugfs file (single_open() based seq_file) */
+static const struct file_operations xics_debug_fops = {
+       /* set up and torn down through the single_*() seq_file helpers */
+       .open    = xics_debug_open,
+       .release = single_release,
+       /* data access is left entirely to the generic seq_file code */
+       .read    = seq_read,
+       .llseek  = seq_lseek,
+};
+
+/*
+ * Create the debugfs file "kvm-xics-<address of xics>" below
+ * powerpc_debugfs_root and remember its dentry in @xics.
+ *
+ * If the name cannot be allocated an error is logged and no file is
+ * created.
+ */
+static void xics_debugfs_init(struct kvmppc_xics *xics)
+{
+       char *fname = kasprintf(GFP_KERNEL, "kvm-xics-%p", xics);
+
+       if (fname == NULL) {
+               pr_err("%s: no memory for name\n", __func__);
+               return;
+       }
+
+       xics->dentry = debugfs_create_file(fname, S_IRUGO,
+                                          powerpc_debugfs_root,
+                                          xics, &xics_debug_fops);
+       pr_debug("%s: created %s\n", __func__, fname);
+
+       /* The temporary name buffer is released right after the call */
+       kfree(fname);
+}
+
+/*
+ * Return the ICS that covers source number @irq, allocating and
+ * initialising it first if it does not exist yet.
+ *
+ * Creation is serialised by kvm->lock; if another caller created the ICS
+ * in the meantime, the existing one is returned.  Every source of a new
+ * ICS starts with its priority and saved priority set to MASKED.
+ *
+ * Returns NULL when @irq is outside the range the ICS table can hold or
+ * when the allocation fails.
+ */
+struct kvmppc_ics *kvmppc_xics_create_ics(struct kvm *kvm,
+                                         struct kvmppc_xics *xics, int irq)
+{
+       struct kvmppc_ics *ics;
+       int i, icsid;
+
+       /* A negative number can never be a valid source */
+       if (irq < 0)
+               return NULL;
+
+       /*
+        * Same bound as kvmppc_xics_find_ics(): never index xics->ics[]
+        * past its last slot.
+        */
+       icsid = irq >> KVMPPC_XICS_ICS_SHIFT;
+       if (icsid > KVMPPC_XICS_MAX_ICS_ID)
+               return NULL;
+
+       mutex_lock(&kvm->lock);
+
+       /* ICS already exists - somebody else got here first */
+       ics = xics->ics[icsid];
+       if (ics)
+               goto out;
+
+       /* Create the ICS */
+       ics = kzalloc(sizeof(*ics), GFP_KERNEL);
+       if (!ics)
+               goto out;
+
+       mutex_init(&ics->lock);
+       ics->icsid = icsid;
+
+       for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
+               ics->irq_state[i].number = (icsid << KVMPPC_XICS_ICS_SHIFT) | i;
+               ics->irq_state[i].priority = MASKED;
+               ics->irq_state[i].saved_priority = MASKED;
+       }
+
+       /*
+        * Order the initialisation above before the store that makes the
+        * ICS reachable through xics->ics[].
+        */
+       smp_wmb();
+       xics->ics[icsid] = ics;
+
+       if (icsid > xics->max_icsid)
+               xics->max_icsid = icsid;
+
+ out:
+       mutex_unlock(&kvm->lock);
+
+       /* Hand back the pointer obtained while the lock was held */
+       return ics;
+}
+
+/*
+ * Allocate the ICP for @vcpu and register it under @server_num.
+ *
+ * The new ICP starts with both its MFRR and its pending priority set to
+ * MASKED.
+ *
+ * Returns 0 on success, -ENODEV if the VM has no XICS, -EEXIST if some
+ * vcpu already owns @server_num, -ENOMEM if the allocation fails.
+ */
+int kvmppc_xics_create_icp(struct kvm_vcpu *vcpu, unsigned long server_num)
+{
+       struct kvm *kvm = vcpu->kvm;
+       struct kvmppc_icp *new_icp;
+
+       if (!kvm->arch.xics)
+               return -ENODEV;
+
+       /* Server numbers are unique within a VM */
+       if (kvmppc_xics_find_server(kvm, server_num) != NULL)
+               return -EEXIST;
+
+       new_icp = kzalloc(sizeof(*new_icp), GFP_KERNEL);
+       if (new_icp == NULL)
+               return -ENOMEM;
+
+       new_icp->state.pending_pri = MASKED;
+       new_icp->state.mfrr = MASKED;
+       new_icp->server_num = server_num;
+       new_icp->vcpu = vcpu;
+
+       vcpu->arch.icp = new_icp;
+
+       XICS_DBG("created server for vcpu %d\n", vcpu->vcpu_id);
+
+       return 0;
+}
+
+/* -- ioctls -- */
+
+/*
+ * VM ioctl backend that changes the level of one XICS source.
+ *
+ * @args->irq is the source number; @args->level must be one of
+ * KVM_INTERRUPT_SET, KVM_INTERRUPT_SET_LEVEL or KVM_INTERRUPT_UNSET.
+ *
+ * Returns the result of ics_deliver_irq(), -ENODEV if the VM has no XICS,
+ * or -EINVAL for any other level value.
+ */
+int kvm_vm_ioctl_xics_irq(struct kvm *kvm, struct kvm_irq_level *args)
+{
+       struct kvmppc_xics *xics = kvm->arch.xics;
+
+       /* locking against multiple callers? */
+
+       if (!xics)
+               return -ENODEV;
+
+       switch (args->level) {
+       case KVM_INTERRUPT_SET:
+       case KVM_INTERRUPT_SET_LEVEL:
+       case KVM_INTERRUPT_UNSET:
+               return ics_deliver_irq(xics, args->irq, args->level);
+       default:
+               return -EINVAL;
+       }
+}
+
+/*
+ * Tear down @xics: remove its debugfs file, detach it from the owning VM
+ * (if one is recorded), then free every ICS up to max_icsid and finally
+ * the XICS structure itself.
+ */
+void kvmppc_xics_free(struct kvmppc_xics *xics)
+{
+       struct kvm *owner = xics->kvm;
+       int icsid = 0;
+
+       debugfs_remove(xics->dentry);
+
+       if (owner != NULL)
+               owner->arch.xics = NULL;
+
+       /* Slots that were never populated are NULL, which kfree() accepts */
+       while (icsid <= xics->max_icsid)
+               kfree(xics->ics[icsid++]);
+
+       kfree(xics);
+}
+
+/*
+ * Create the in-kernel XICS for @kvm and expose it through debugfs.
+ *
+ * @type is accepted but not looked at by this function.
+ *
+ * Returns 0 on success, -ENOMEM if the allocation fails, or -EEXIST if
+ * the VM already has an XICS; in both failure cases nothing stays
+ * allocated.
+ */
+int kvm_xics_create(struct kvm *kvm, u32 type)
+{
+       struct kvmppc_xics *xics;
+       int ret = 0;
+
+       xics = kzalloc(sizeof(*xics), GFP_KERNEL);
+       if (!xics)
+               return -ENOMEM;
+
+       xics->kvm = kvm;
+
+       /* Already there ? Check and install under kvm->lock */
+       mutex_lock(&kvm->lock);
+       if (kvm->arch.xics)
+               ret = -EEXIST;
+       else
+               kvm->arch.xics = xics;
+       mutex_unlock(&kvm->lock);
+
+       if (ret) {
+               /* Our copy was never installed, so release it here */
+               kfree(xics);
+               return ret;
+       }
+
+       xics_debugfs_init(xics);
+
+       return 0;
+}
+
+/*
+ * Release the ICP attached to @vcpu, if any, and switch the vcpu back to
+ * KVMPPC_IRQ_DEFAULT.  Does nothing when the vcpu has no ICP.
+ */
+void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu)
+{
+       struct kvmppc_icp *icp = vcpu->arch.icp;
+
+       if (icp) {
+               kfree(icp);
+               vcpu->arch.icp = NULL;
+               vcpu->arch.irq_type = KVMPPC_IRQ_DEFAULT;
+       }
+}
diff --git a/arch/powerpc/kvm/book3s_xics.h b/arch/powerpc/kvm/book3s_xics.h
new file mode 100644 (file)
index 0000000..58ee190
--- /dev/null
@@ -0,0 +1,113 @@
+/*
+ * Copyright 2012 Michael Ellerman, IBM Corporation.
+ * Copyright 2012 Benjamin Herrenschmidt, IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _KVM_PPC_BOOK3S_XICS_H
+#define _KVM_PPC_BOOK3S_XICS_H
+
+/*
+ * We use a two-level tree to store interrupt source information.
+ * There are up to 1024 ICS nodes, each of which can represent
+ * 1024 sources.
+ *
+ * A source number is split accordingly: the bits from
+ * KVMPPC_XICS_ICS_SHIFT upwards select the ICS, the bits covered by
+ * KVMPPC_XICS_SRC_MASK select the source within that ICS.
+ */
+#define KVMPPC_XICS_MAX_ICS_ID 1023
+#define KVMPPC_XICS_ICS_SHIFT  10
+#define KVMPPC_XICS_IRQ_PER_ICS        (1 << KVMPPC_XICS_ICS_SHIFT)
+#define KVMPPC_XICS_SRC_MASK   (KVMPPC_XICS_IRQ_PER_ICS - 1)
+
+/*
+ * Interrupt source numbers below this are reserved, for example
+ * 0 is "no interrupt", and 2 is used for IPIs.
+ *
+ * KVMPPC_XICS_NR_IRQS is the size of the whole source number space
+ * (number of ICS nodes times sources per ICS).
+ */
+#define KVMPPC_XICS_FIRST_IRQ  16
+#define KVMPPC_XICS_NR_IRQS    ((KVMPPC_XICS_MAX_ICS_ID + 1) * \
+                                KVMPPC_XICS_IRQ_PER_ICS)
+
+/* Priority value to use for disabling an interrupt */
+#define MASKED 0xff
+
+/*
+ * State for one irq source
+ *
+ * NOTE(review): the exact roles of server, resend, masked_pending and
+ * exists are defined by code outside this header - confirm there.
+ */
+struct ics_irq_state {
+       u32 number; /* full source number: (icsid << KVMPPC_XICS_ICS_SHIFT) | index */
+       u32 server; /* presumably the server number this source targets */
+       u8  priority; /* MASKED right after ICS creation */
+       u8  saved_priority; /* currently unused */
+       u8  resend;
+       u8  masked_pending;
+       u8  asserted; /* Only for LSI */
+       u8  exists;
+};
+
+/*
+ * Atomic ICP state, updated with a single compare & swap
+ *
+ * "raw" overlays all of the fields so that the whole state can be read
+ * or replaced as one word.
+ */
+union kvmppc_icp_state {
+       unsigned long raw;
+       struct {
+               u8 out_ee:1; /* shown as "OUT" in the debugfs dump */
+               u8 need_resend:1; /* shown as "NR" in the debugfs dump */
+               u8 cppr; /* priority the processor currently runs at */
+               u8 mfrr; /* MASKED right after ICP creation */
+               u8 pending_pri; /* priority of the interrupt in xisr */
+               u32 xisr; /* pending source number, 0 when there is none */
+       };
+};
+
+/* One bit per ICS */
+#define ICP_RESEND_MAP_SIZE    (KVMPPC_XICS_MAX_ICS_ID / BITS_PER_LONG + 1)
+
+/* Per-vcpu interrupt presentation state */
+struct kvmppc_icp {
+       struct kvm_vcpu *vcpu; /* vcpu this ICP belongs to */
+       unsigned long server_num; /* number kvmppc_xics_find_server() matches on */
+       union kvmppc_icp_state state;
+       unsigned long resend_map[ICP_RESEND_MAP_SIZE]; /* bitmap, one bit per ICS */
+};
+
+/* One ICS node: a block of KVMPPC_XICS_IRQ_PER_ICS interrupt sources */
+struct kvmppc_ics {
+       struct mutex lock; /* held while irq_state[] is dumped to debugfs */
+       u16 icsid; /* index of this node in kvmppc_xics.ics[] */
+       struct ics_irq_state irq_state[KVMPPC_XICS_IRQ_PER_ICS];
+};
+
+/* Per-VM XICS state: the top level of the two-level source tree */
+struct kvmppc_xics {
+       struct kvm *kvm; /* VM that owns this XICS */
+       struct dentry *dentry; /* debugfs file of this XICS */
+       u32 max_icsid; /* highest ICS index created so far */
+       struct kvmppc_ics *ics[KVMPPC_XICS_MAX_ICS_ID + 1]; /* NULL until created */
+};
+
+/*
+ * Look up the ICP whose server number is @nr among the vcpus of @kvm.
+ * Returns NULL when no vcpu with an ICP uses that server number.
+ */
+static inline struct kvmppc_icp *kvmppc_xics_find_server(struct kvm *kvm,
+                                                        u32 nr)
+{
+       struct kvm_vcpu *vcpu = NULL;
+       int i;
+
+       kvm_for_each_vcpu(i, vcpu, kvm) {
+               struct kvmppc_icp *icp = vcpu->arch.icp;
+
+               /* vcpus without an ICP cannot match */
+               if (!icp)
+                       continue;
+               if (icp->server_num == nr)
+                       return icp;
+       }
+
+       return NULL;
+}
+
+/*
+ * Map source number @irq to the ICS that holds it.
+ *
+ * When @source is not NULL it receives the index of the source inside
+ * its ICS; this happens even if the lookup itself fails.
+ *
+ * Returns NULL if @irq lies beyond the last possible ICS or if that ICS
+ * has not been created.
+ */
+static inline struct kvmppc_ics *kvmppc_xics_find_ics(struct kvmppc_xics *xics,
+                                                     u32 irq, u16 *source)
+{
+       u32 icsid = irq >> KVMPPC_XICS_ICS_SHIFT;
+
+       if (source)
+               *source = irq & KVMPPC_XICS_SRC_MASK;
+
+       if (icsid > KVMPPC_XICS_MAX_ICS_ID)
+               return NULL;
+
+       /* An ICS that was never created is a NULL slot */
+       return xics->ics[icsid];
+}
+
+
+#endif /* _KVM_PPC_BOOK3S_XICS_H */
index d4fd443ae7bdc4bc1d5bc20588e8fbe6410e5c89..31084c6335c9e1d16ab1294ea62f99948fcf63be 100644 (file)
@@ -471,6 +471,9 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
        case KVMPPC_IRQ_MPIC:
                kvmppc_mpic_disconnect_vcpu(vcpu->arch.mpic, vcpu);
                break;
+       case KVMPPC_IRQ_XICS:
+               kvmppc_xics_free_icp(vcpu);
+               break;
        }
 
        kvmppc_core_vcpu_free(vcpu);