drivers/kvm/kvm_main.c
1 /*
2  * Kernel-based Virtual Machine driver for Linux
3  *
4  * This module enables machines with Intel VT-x extensions to run virtual
5  * machines without emulation or binary translation.
6  *
7  * Copyright (C) 2006 Qumranet, Inc.
8  *
9  * Authors:
10  *   Avi Kivity   <avi@qumranet.com>
11  *   Yaniv Kamay  <yaniv@qumranet.com>
12  *
13  * This work is licensed under the terms of the GNU GPL, version 2.  See
14  * the COPYING file in the top-level directory.
15  *
16  */
17
18 #include "kvm.h"
19
20 #include <linux/kvm.h>
21 #include <linux/module.h>
22 #include <linux/errno.h>
23 #include <asm/processor.h>
24 #include <linux/percpu.h>
25 #include <linux/gfp.h>
26 #include <asm/msr.h>
27 #include <linux/mm.h>
28 #include <linux/miscdevice.h>
29 #include <linux/vmalloc.h>
30 #include <asm/uaccess.h>
31 #include <linux/reboot.h>
32 #include <asm/io.h>
33 #include <linux/debugfs.h>
34 #include <linux/highmem.h>
35 #include <linux/file.h>
36 #include <asm/desc.h>
37 #include <linux/sysdev.h>
38 #include <linux/cpu.h>
39
40 #include "x86_emulate.h"
41 #include "segment_descriptor.h"
42
43 MODULE_AUTHOR("Qumranet");
44 MODULE_LICENSE("GPL");
45
46 static DEFINE_SPINLOCK(kvm_lock);
47 static LIST_HEAD(vm_list);
48
49 struct kvm_arch_ops *kvm_arch_ops;
50 struct kvm_stat kvm_stat;
51 EXPORT_SYMBOL_GPL(kvm_stat);
52
53 static struct kvm_stats_debugfs_item {
54         const char *name;
55         u32 *data;
56         struct dentry *dentry;
57 } debugfs_entries[] = {
58         { "pf_fixed", &kvm_stat.pf_fixed },
59         { "pf_guest", &kvm_stat.pf_guest },
60         { "tlb_flush", &kvm_stat.tlb_flush },
61         { "invlpg", &kvm_stat.invlpg },
62         { "exits", &kvm_stat.exits },
63         { "io_exits", &kvm_stat.io_exits },
64         { "mmio_exits", &kvm_stat.mmio_exits },
65         { "signal_exits", &kvm_stat.signal_exits },
66         { "irq_window", &kvm_stat.irq_window_exits },
67         { "halt_exits", &kvm_stat.halt_exits },
68         { "request_irq", &kvm_stat.request_irq_exits },
69         { "irq_exits", &kvm_stat.irq_exits },
70         { NULL, NULL }
71 };
72
73 static struct dentry *debugfs_dir;
74
75 #define MAX_IO_MSRS 256
76
77 #define CR0_RESEVED_BITS 0xffffffff1ffaffc0ULL
78 #define LMSW_GUEST_MASK 0x0eULL
79 #define CR4_RESEVED_BITS (~((1ULL << 11) - 1))
80 #define CR8_RESEVED_BITS (~0x0fULL)
81 #define EFER_RESERVED_BITS 0xfffffffffffff2fe
82
83 #ifdef CONFIG_X86_64
84 /* LDT or TSS descriptor in the GDT. 16 bytes. */
85 struct segment_descriptor_64 {
86         struct segment_descriptor s;
87         u32 base_higher;
88         u32 pad_zero;
89 };
90
91 #endif
92
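/*
 * Return the linear base address of the segment referenced by @selector,
 * reading the descriptor from the host GDT (or from the current LDT when the
 * selector's TI bit is set).  A null selector yields 0.  On 64-bit hosts the
 * extra base_higher dword of system descriptors (LDT/TSS) is folded in.
 */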
93 unsigned long segment_base(u16 selector)
94 {
95         struct descriptor_table gdt;
96         struct segment_descriptor *d;
97         unsigned long table_base;
98         typedef unsigned long ul;
99         unsigned long v;
100
101         if (selector == 0)
102                 return 0;
103
104         asm ("sgdt %0" : "=m"(gdt));
105         table_base = gdt.base;
106
107         if (selector & 4) {           /* from ldt */
108                 u16 ldt_selector;
109
110                 asm ("sldt %0" : "=g"(ldt_selector));
111                 table_base = segment_base(ldt_selector);
112         }
113         d = (struct segment_descriptor *)(table_base + (selector & ~7));
114         v = d->base_low | ((ul)d->base_mid << 16) | ((ul)d->base_high << 24);
115 #ifdef CONFIG_X86_64
116         if (d->system == 0
117             && (d->type == 2 || d->type == 9 || d->type == 11))
118                 v |= ((ul)((struct segment_descriptor_64 *)d)->base_higher) << 32;
119 #endif
120         return v;
121 }
122 EXPORT_SYMBOL_GPL(segment_base);
123
124 static inline int valid_vcpu(int n)
125 {
126         return likely(n >= 0 && n < KVM_MAX_VCPUS);
127 }
128
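/*
 * Copy up to @size bytes from guest virtual address @addr into the host
 * buffer @dest, one page at a time.  Translation failures stop the copy
 * early; the return value is the number of bytes actually copied.
 */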
129 int kvm_read_guest(struct kvm_vcpu *vcpu,
130                              gva_t addr,
131                              unsigned long size,
132                              void *dest)
133 {
134         unsigned char *host_buf = dest;
135         unsigned long req_size = size;
136
137         while (size) {
138                 hpa_t paddr;
139                 unsigned now;
140                 unsigned offset;
141                 hva_t guest_buf;
142
143                 paddr = gva_to_hpa(vcpu, addr);
144
145                 if (is_error_hpa(paddr))
146                         break;
147
148                 guest_buf = (hva_t)kmap_atomic(
149                                         pfn_to_page(paddr >> PAGE_SHIFT),
150                                         KM_USER0);
151                 offset = addr & ~PAGE_MASK;
152                 guest_buf |= offset;
153                 now = min(size, PAGE_SIZE - offset);
154                 memcpy(host_buf, (void*)guest_buf, now);
155                 host_buf += now;
156                 addr += now;
157                 size -= now;
158                 kunmap_atomic((void *)(guest_buf & PAGE_MASK), KM_USER0);
159         }
160         return req_size - size;
161 }
162 EXPORT_SYMBOL_GPL(kvm_read_guest);
163
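/*
 * Copy up to @size bytes from the host buffer @data to guest virtual address
 * @addr.  Like kvm_read_guest(), the copy stops at the first guest address
 * that cannot be translated and the number of bytes written is returned.
 */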
164 int kvm_write_guest(struct kvm_vcpu *vcpu,
165                              gva_t addr,
166                              unsigned long size,
167                              void *data)
168 {
169         unsigned char *host_buf = data;
170         unsigned long req_size = size;
171
172         while (size) {
173                 hpa_t paddr;
174                 unsigned now;
175                 unsigned offset;
176                 hva_t guest_buf;
177
178                 paddr = gva_to_hpa(vcpu, addr);
179
180                 if (is_error_hpa(paddr))
181                         break;
182
183                 guest_buf = (hva_t)kmap_atomic(
184                                 pfn_to_page(paddr >> PAGE_SHIFT), KM_USER0);
185                 offset = addr & ~PAGE_MASK;
186                 guest_buf |= offset;
187                 now = min(size, PAGE_SIZE - offset);
188                 memcpy((void*)guest_buf, host_buf, now);
189                 host_buf += now;
190                 addr += now;
191                 size -= now;
192                 kunmap_atomic((void *)(guest_buf & PAGE_MASK), KM_USER0);
193         }
194         return req_size - size;
195 }
196 EXPORT_SYMBOL_GPL(kvm_write_guest);
197
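/* Index of @vcpu within its kvm's vcpus[] array, by pointer arithmetic. */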
198 static int vcpu_slot(struct kvm_vcpu *vcpu)
199 {
200         return vcpu - vcpu->kvm->vcpus;
201 }
202
203 /*
204  * Switches to specified vcpu, until a matching vcpu_put()
205  */
206 static struct kvm_vcpu *vcpu_load(struct kvm *kvm, int vcpu_slot)
207 {
208         struct kvm_vcpu *vcpu = &kvm->vcpus[vcpu_slot];
209
210         mutex_lock(&vcpu->mutex);
211         if (unlikely(!vcpu->vmcs)) {
212                 mutex_unlock(&vcpu->mutex);
213                 return NULL;
214         }
215         return kvm_arch_ops->vcpu_load(vcpu);
216 }
217
218 static void vcpu_put(struct kvm_vcpu *vcpu)
219 {
220         kvm_arch_ops->vcpu_put(vcpu);
221         mutex_unlock(&vcpu->mutex);
222 }
223
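/*
 * Open of /dev/kvm creates a new virtual machine: allocate the struct kvm,
 * initialize the per-vcpu mutexes and mmu state, publish the vm on the
 * global vm_list and stash the pointer in the file's private_data.
 */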
224 static int kvm_dev_open(struct inode *inode, struct file *filp)
225 {
226         struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
227         int i;
228
229         if (!kvm)
230                 return -ENOMEM;
231
232         spin_lock_init(&kvm->lock);
233         INIT_LIST_HEAD(&kvm->active_mmu_pages);
234         for (i = 0; i < KVM_MAX_VCPUS; ++i) {
235                 struct kvm_vcpu *vcpu = &kvm->vcpus[i];
236
237                 mutex_init(&vcpu->mutex);
238                 vcpu->cpu = -1;
239                 vcpu->kvm = kvm;
240                 vcpu->mmu.root_hpa = INVALID_PAGE;
241                 INIT_LIST_HEAD(&vcpu->free_pages);
242         }
243         spin_lock(&kvm_lock);
244         list_add(&kvm->vm_list, &vm_list);
245         spin_unlock(&kvm_lock);
246         filp->private_data = kvm;
247         return 0;
248 }
249
250 /*
251  * Free any memory in @free but not in @dont.
252  */
253 static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
254                                   struct kvm_memory_slot *dont)
255 {
256         int i;
257
258         if (!dont || free->phys_mem != dont->phys_mem)
259                 if (free->phys_mem) {
260                         for (i = 0; i < free->npages; ++i)
261                                 if (free->phys_mem[i])
262                                         __free_page(free->phys_mem[i]);
263                         vfree(free->phys_mem);
264                 }
265
266         if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
267                 vfree(free->dirty_bitmap);
268
269         free->phys_mem = NULL;
270         free->npages = 0;
271         free->dirty_bitmap = NULL;
272 }
273
274 static void kvm_free_physmem(struct kvm *kvm)
275 {
276         int i;
277
278         for (i = 0; i < kvm->nmemslots; ++i)
279                 kvm_free_physmem_slot(&kvm->memslots[i], NULL);
280 }
281
282 static void kvm_free_vcpu(struct kvm_vcpu *vcpu)
283 {
284         if (!vcpu_load(vcpu->kvm, vcpu_slot(vcpu)))
285                 return;
286
287         kvm_mmu_destroy(vcpu);
288         vcpu_put(vcpu);
289         kvm_arch_ops->vcpu_free(vcpu);
290 }
291
292 static void kvm_free_vcpus(struct kvm *kvm)
293 {
294         unsigned int i;
295
296         for (i = 0; i < KVM_MAX_VCPUS; ++i)
297                 kvm_free_vcpu(&kvm->vcpus[i]);
298 }
299
300 static int kvm_dev_release(struct inode *inode, struct file *filp)
301 {
302         struct kvm *kvm = filp->private_data;
303
304         spin_lock(&kvm_lock);
305         list_del(&kvm->vm_list);
306         spin_unlock(&kvm_lock);
307         kvm_free_vcpus(kvm);
308         kvm_free_physmem(kvm);
309         kfree(kvm);
310         return 0;
311 }
312
313 static void inject_gp(struct kvm_vcpu *vcpu)
314 {
315         kvm_arch_ops->inject_gp(vcpu, 0);
316 }
317
318 /*
319  * Load the pae pdptrs.  Return true if they are all valid.
320  */
321 static int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
322 {
323         gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
324         unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
325         int i;
326         u64 pdpte;
327         u64 *pdpt;
328         int ret;
329         struct kvm_memory_slot *memslot;
330
331         spin_lock(&vcpu->kvm->lock);
332         memslot = gfn_to_memslot(vcpu->kvm, pdpt_gfn);
333         /* FIXME: !memslot - emulate? 0xff? */
334         pdpt = kmap_atomic(gfn_to_page(memslot, pdpt_gfn), KM_USER0);
335
336         ret = 1;
337         for (i = 0; i < 4; ++i) {
338                 pdpte = pdpt[offset + i];
339                 if ((pdpte & 1) && (pdpte & 0xfffffff0000001e6ull)) {
340                         ret = 0;
341                         goto out;
342                 }
343         }
344
345         for (i = 0; i < 4; ++i)
346                 vcpu->pdptrs[i] = pdpt[offset + i];
347
348 out:
349         kunmap_atomic(pdpt, KM_USER0);
350         spin_unlock(&vcpu->kvm->lock);
351
352         return ret;
353 }
354
355 void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
356 {
357         if (cr0 & CR0_RESEVED_BITS) {
358                 printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
359                        cr0, vcpu->cr0);
360                 inject_gp(vcpu);
361                 return;
362         }
363
364         if ((cr0 & CR0_NW_MASK) && !(cr0 & CR0_CD_MASK)) {
365                 printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n");
366                 inject_gp(vcpu);
367                 return;
368         }
369
370         if ((cr0 & CR0_PG_MASK) && !(cr0 & CR0_PE_MASK)) {
371                 printk(KERN_DEBUG "set_cr0: #GP, set PG flag "
372                        "and a clear PE flag\n");
373                 inject_gp(vcpu);
374                 return;
375         }
376
377         if (!is_paging(vcpu) && (cr0 & CR0_PG_MASK)) {
378 #ifdef CONFIG_X86_64
379                 if ((vcpu->shadow_efer & EFER_LME)) {
380                         int cs_db, cs_l;
381
382                         if (!is_pae(vcpu)) {
383                                 printk(KERN_DEBUG "set_cr0: #GP, start paging "
384                                        "in long mode while PAE is disabled\n");
385                                 inject_gp(vcpu);
386                                 return;
387                         }
388                         kvm_arch_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
389                         if (cs_l) {
390                                 printk(KERN_DEBUG "set_cr0: #GP, start paging "
391                                        "in long mode while CS.L == 1\n");
392                                 inject_gp(vcpu);
393                                 return;
394
395                         }
396                 } else
397 #endif
398                 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->cr3)) {
399                         printk(KERN_DEBUG "set_cr0: #GP, pdptrs "
400                                "reserved bits\n");
401                         inject_gp(vcpu);
402                         return;
403                 }
404
405         }
406
407         kvm_arch_ops->set_cr0(vcpu, cr0);
408         vcpu->cr0 = cr0;
409
410         spin_lock(&vcpu->kvm->lock);
411         kvm_mmu_reset_context(vcpu);
412         spin_unlock(&vcpu->kvm->lock);
413         return;
414 }
415 EXPORT_SYMBOL_GPL(set_cr0);
416
417 void lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
418 {
419         kvm_arch_ops->decache_cr0_cr4_guest_bits(vcpu);
420         set_cr0(vcpu, (vcpu->cr0 & ~0x0ful) | (msw & 0x0f));
421 }
422 EXPORT_SYMBOL_GPL(lmsw);
423
424 void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
425 {
426         if (cr4 & CR4_RESEVED_BITS) {
427                 printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n");
428                 inject_gp(vcpu);
429                 return;
430         }
431
432         if (is_long_mode(vcpu)) {
433                 if (!(cr4 & CR4_PAE_MASK)) {
434                         printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while "
435                                "in long mode\n");
436                         inject_gp(vcpu);
437                         return;
438                 }
439         } else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & CR4_PAE_MASK)
440                    && !load_pdptrs(vcpu, vcpu->cr3)) {
441                 printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n");
442                 inject_gp(vcpu);
443                 return;
444         }
444
445         if (cr4 & CR4_VMXE_MASK) {
446                 printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n");
447                 inject_gp(vcpu);
448                 return;
449         }
450         kvm_arch_ops->set_cr4(vcpu, cr4);
451         spin_lock(&vcpu->kvm->lock);
452         kvm_mmu_reset_context(vcpu);
453         spin_unlock(&vcpu->kvm->lock);
454 }
455 EXPORT_SYMBOL_GPL(set_cr4);
456
457 void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
458 {
459         if (is_long_mode(vcpu)) {
460                 if (cr3 & CR3_L_MODE_RESEVED_BITS) {
461                         printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
462                         inject_gp(vcpu);
463                         return;
464                 }
465         } else {
466                 if (cr3 & CR3_RESEVED_BITS) {
467                         printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
468                         inject_gp(vcpu);
469                         return;
470                 }
471                 if (is_paging(vcpu) && is_pae(vcpu) &&
472                     !load_pdptrs(vcpu, cr3)) {
473                         printk(KERN_DEBUG "set_cr3: #GP, pdptrs "
474                                "reserved bits\n");
475                         inject_gp(vcpu);
476                         return;
477                 }
478         }
479
480         vcpu->cr3 = cr3;
481         spin_lock(&vcpu->kvm->lock);
482         /*
483          * Does the new cr3 value map to physical memory? (Note, we
484          * catch an invalid cr3 even in real-mode, because it would
485          * cause trouble later on when we turn on paging anyway.)
486          *
487          * A real CPU would silently accept an invalid cr3 and would
488          * attempt to use it - with largely undefined (and often hard
489          * to debug) behavior on the guest side.
490          */
491         if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
492                 inject_gp(vcpu);
493         else
494                 vcpu->mmu.new_cr3(vcpu);
495         spin_unlock(&vcpu->kvm->lock);
496 }
497 EXPORT_SYMBOL_GPL(set_cr3);
498
499 void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
500 {
501         if (cr8 & CR8_RESEVED_BITS) {
502                 printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8);
503                 inject_gp(vcpu);
504                 return;
505         }
506         vcpu->cr8 = cr8;
507 }
508 EXPORT_SYMBOL_GPL(set_cr8);
509
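/*
 * Initialize the guest FPU image: save the host FPU state, reinitialize the
 * FPU to capture a clean image for the guest, restore the host state, then
 * set a default MXCSR (0x1f80) and zero the remainder of the guest image.
 */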
510 void fx_init(struct kvm_vcpu *vcpu)
511 {
512         struct __attribute__ ((__packed__)) fx_image_s {
513                 u16 control;    /* fcw */
514                 u16 status;     /* fsw */
515                 u16 tag;        /* ftw */
516                 u16 opcode;     /* fop */
517                 u64 ip;         /* fpu ip */
518                 u64 operand;    /* fpu dp */
519                 u32 mxcsr;
520                 u32 mxcsr_mask;
521
522         } *fx_image;
523
524         fx_save(vcpu->host_fx_image);
525         fpu_init();
526         fx_save(vcpu->guest_fx_image);
527         fx_restore(vcpu->host_fx_image);
528
529         fx_image = (struct fx_image_s *)vcpu->guest_fx_image;
530         fx_image->mxcsr = 0x1f80;
531         memset(vcpu->guest_fx_image + sizeof(struct fx_image_s),
532                0, FX_IMAGE_SIZE - sizeof(struct fx_image_s));
533 }
534 EXPORT_SYMBOL_GPL(fx_init);
535
536 /*
537  * Creates some virtual cpus.  Good luck creating more than one.
538  */
539 static int kvm_dev_ioctl_create_vcpu(struct kvm *kvm, int n)
540 {
541         int r;
542         struct kvm_vcpu *vcpu;
543
544         r = -EINVAL;
545         if (!valid_vcpu(n))
546                 goto out;
547
548         vcpu = &kvm->vcpus[n];
549
550         mutex_lock(&vcpu->mutex);
551
552         if (vcpu->vmcs) {
553                 mutex_unlock(&vcpu->mutex);
554                 return -EEXIST;
555         }
556
557         vcpu->host_fx_image = (char*)ALIGN((hva_t)vcpu->fx_buf,
558                                            FX_IMAGE_ALIGN);
559         vcpu->guest_fx_image = vcpu->host_fx_image + FX_IMAGE_SIZE;
560
561         r = kvm_arch_ops->vcpu_create(vcpu);
562         if (r < 0)
563                 goto out_free_vcpus;
564
565         r = kvm_mmu_create(vcpu);
566         if (r < 0)
567                 goto out_free_vcpus;
568
569         kvm_arch_ops->vcpu_load(vcpu);
570         r = kvm_mmu_setup(vcpu);
571         if (r >= 0)
572                 r = kvm_arch_ops->vcpu_setup(vcpu);
573         vcpu_put(vcpu);
574
575         if (r < 0)
576                 goto out_free_vcpus;
577
578         return 0;
579
580 out_free_vcpus:
581         kvm_free_vcpu(vcpu);
582         mutex_unlock(&vcpu->mutex);
583 out:
584         return r;
585 }
586
587 /*
588  * Allocate some memory and give it an address in the guest physical address
589  * space.
590  *
591  * Discontiguous memory is allowed, mostly for framebuffers.
592  */
593 static int kvm_dev_ioctl_set_memory_region(struct kvm *kvm,
594                                            struct kvm_memory_region *mem)
595 {
596         int r;
597         gfn_t base_gfn;
598         unsigned long npages;
599         unsigned long i;
600         struct kvm_memory_slot *memslot;
601         struct kvm_memory_slot old, new;
602         int memory_config_version;
603
604         r = -EINVAL;
605         /* General sanity checks */
606         if (mem->memory_size & (PAGE_SIZE - 1))
607                 goto out;
608         if (mem->guest_phys_addr & (PAGE_SIZE - 1))
609                 goto out;
610         if (mem->slot >= KVM_MEMORY_SLOTS)
611                 goto out;
612         if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
613                 goto out;
614
615         memslot = &kvm->memslots[mem->slot];
616         base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
617         npages = mem->memory_size >> PAGE_SHIFT;
618
619         if (!npages)
620                 mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES;
621
622 raced:
623         spin_lock(&kvm->lock);
624
625         memory_config_version = kvm->memory_config_version;
626         new = old = *memslot;
627
628         new.base_gfn = base_gfn;
629         new.npages = npages;
630         new.flags = mem->flags;
631
632         /* Disallow changing a memory slot's size. */
633         r = -EINVAL;
634         if (npages && old.npages && npages != old.npages)
635                 goto out_unlock;
636
637         /* Check for overlaps */
638         r = -EEXIST;
639         for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
640                 struct kvm_memory_slot *s = &kvm->memslots[i];
641
642                 if (s == memslot)
643                         continue;
644                 if (!((base_gfn + npages <= s->base_gfn) ||
645                       (base_gfn >= s->base_gfn + s->npages)))
646                         goto out_unlock;
647         }
648         /*
649          * Do memory allocations outside lock.  memory_config_version will
650          * detect any races.
651          */
652         spin_unlock(&kvm->lock);
653
654         /* Deallocate if slot is being removed */
655         if (!npages)
656                 new.phys_mem = NULL;
657
658         /* Free page dirty bitmap if unneeded */
659         if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES))
660                 new.dirty_bitmap = NULL;
661
662         r = -ENOMEM;
663
664         /* Allocate if a slot is being created */
665         if (npages && !new.phys_mem) {
666                 new.phys_mem = vmalloc(npages * sizeof(struct page *));
667
668                 if (!new.phys_mem)
669                         goto out_free;
670
671                 memset(new.phys_mem, 0, npages * sizeof(struct page *));
672                 for (i = 0; i < npages; ++i) {
673                         new.phys_mem[i] = alloc_page(GFP_HIGHUSER
674                                                      | __GFP_ZERO);
675                         if (!new.phys_mem[i])
676                                 goto out_free;
677                         new.phys_mem[i]->private = 0;
678                 }
679         }
680
681         /* Allocate page dirty bitmap if needed */
682         if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
683                 unsigned dirty_bytes = ALIGN(npages, BITS_PER_LONG) / 8;
684
685                 new.dirty_bitmap = vmalloc(dirty_bytes);
686                 if (!new.dirty_bitmap)
687                         goto out_free;
688                 memset(new.dirty_bitmap, 0, dirty_bytes);
689         }
690
691         spin_lock(&kvm->lock);
692
693         if (memory_config_version != kvm->memory_config_version) {
694                 spin_unlock(&kvm->lock);
695                 kvm_free_physmem_slot(&new, &old);
696                 goto raced;
697         }
698
699         r = -EAGAIN;
700         if (kvm->busy)
701                 goto out_unlock;
702
703         if (mem->slot >= kvm->nmemslots)
704                 kvm->nmemslots = mem->slot + 1;
705
706         *memslot = new;
707         ++kvm->memory_config_version;
708
709         spin_unlock(&kvm->lock);
710
711         for (i = 0; i < KVM_MAX_VCPUS; ++i) {
712                 struct kvm_vcpu *vcpu;
713
714                 vcpu = vcpu_load(kvm, i);
715                 if (!vcpu)
716                         continue;
717                 kvm_mmu_reset_context(vcpu);
718                 vcpu_put(vcpu);
719         }
720
721         kvm_free_physmem_slot(&old, &new);
722         return 0;
723
724 out_unlock:
725         spin_unlock(&kvm->lock);
726 out_free:
727         kvm_free_physmem_slot(&new, &old);
728 out:
729         return r;
730 }
731
732 static void do_remove_write_access(struct kvm_vcpu *vcpu, int slot)
733 {
734         spin_lock(&vcpu->kvm->lock);
735         kvm_mmu_slot_remove_write_access(vcpu, slot);
736         spin_unlock(&vcpu->kvm->lock);
737 }
738
739 /*
740  * Get (and clear) the dirty memory log for a memory slot.
741  */
742 static int kvm_dev_ioctl_get_dirty_log(struct kvm *kvm,
743                                        struct kvm_dirty_log *log)
744 {
745         struct kvm_memory_slot *memslot;
746         int r, i;
747         int n;
748         int cleared;
749         unsigned long any = 0;
750
751         spin_lock(&kvm->lock);
752
753         /*
754          * Prevent changes to guest memory configuration even while the lock
755          * is not taken.
756          */
757         ++kvm->busy;
758         spin_unlock(&kvm->lock);
759         r = -EINVAL;
760         if (log->slot >= KVM_MEMORY_SLOTS)
761                 goto out;
762
763         memslot = &kvm->memslots[log->slot];
764         r = -ENOENT;
765         if (!memslot->dirty_bitmap)
766                 goto out;
767
768         n = ALIGN(memslot->npages, 8) / 8;
769
770         for (i = 0; !any && i < n; ++i)
771                 any = memslot->dirty_bitmap[i];
772
773         r = -EFAULT;
774         if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
775                 goto out;
776
777
778         if (any) {
779                 cleared = 0;
780                 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
781                         struct kvm_vcpu *vcpu = vcpu_load(kvm, i);
782
783                         if (!vcpu)
784                                 continue;
785                         if (!cleared) {
786                                 do_remove_write_access(vcpu, log->slot);
787                                 memset(memslot->dirty_bitmap, 0, n);
788                                 cleared = 1;
789                         }
790                         kvm_arch_ops->tlb_flush(vcpu);
791                         vcpu_put(vcpu);
792                 }
793         }
794
795         r = 0;
796
797 out:
798         spin_lock(&kvm->lock);
799         --kvm->busy;
800         spin_unlock(&kvm->lock);
801         return r;
802 }
803
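/*
 * Return the memory slot whose guest-frame range contains @gfn, or NULL if
 * the frame is not backed by any registered slot.
 */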
804 struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
805 {
806         int i;
807
808         for (i = 0; i < kvm->nmemslots; ++i) {
809                 struct kvm_memory_slot *memslot = &kvm->memslots[i];
810
811                 if (gfn >= memslot->base_gfn
812                     && gfn < memslot->base_gfn + memslot->npages)
813                         return memslot;
814         }
815         return NULL;
816 }
817 EXPORT_SYMBOL_GPL(gfn_to_memslot);
818
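/*
 * Set the dirty bit for @gfn in the owning slot's dirty bitmap, if dirty
 * logging is enabled for that slot.  The test_bit() check avoids a needless
 * atomic read-modify-write when the page is already marked dirty.
 */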
819 void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
820 {
821         int i;
822         struct kvm_memory_slot *memslot = NULL;
823         unsigned long rel_gfn;
824
825         for (i = 0; i < kvm->nmemslots; ++i) {
826                 memslot = &kvm->memslots[i];
827
828                 if (gfn >= memslot->base_gfn
829                     && gfn < memslot->base_gfn + memslot->npages) {
830
831                         if (!memslot || !memslot->dirty_bitmap)
832                                 return;
833
834                         rel_gfn = gfn - memslot->base_gfn;
835
836                         /* avoid RMW */
837                         if (!test_bit(rel_gfn, memslot->dirty_bitmap))
838                                 set_bit(rel_gfn, memslot->dirty_bitmap);
839                         return;
840                 }
841         }
842 }
843
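/*
 * Read @bytes from guest virtual address @addr on behalf of the instruction
 * emulator, translating and mapping one guest page at a time.  An unmapped
 * virtual address propagates a fault; a physical address with no backing
 * slot is unhandleable here and is routed through the MMIO path instead.
 */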
844 static int emulator_read_std(unsigned long addr,
845                              unsigned long *val,
846                              unsigned int bytes,
847                              struct x86_emulate_ctxt *ctxt)
848 {
849         struct kvm_vcpu *vcpu = ctxt->vcpu;
850         void *data = val;
851
852         while (bytes) {
853                 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
854                 unsigned offset = addr & (PAGE_SIZE-1);
855                 unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset);
856                 unsigned long pfn;
857                 struct kvm_memory_slot *memslot;
858                 void *page;
859
860                 if (gpa == UNMAPPED_GVA)
861                         return X86EMUL_PROPAGATE_FAULT;
862                 pfn = gpa >> PAGE_SHIFT;
863                 memslot = gfn_to_memslot(vcpu->kvm, pfn);
864                 if (!memslot)
865                         return X86EMUL_UNHANDLEABLE;
866                 page = kmap_atomic(gfn_to_page(memslot, pfn), KM_USER0);
867
868                 memcpy(data, page + offset, tocopy);
869
870                 kunmap_atomic(page, KM_USER0);
871
872                 bytes -= tocopy;
873                 data += tocopy;
874                 addr += tocopy;
875         }
876
877         return X86EMUL_CONTINUE;
878 }
879
880 static int emulator_write_std(unsigned long addr,
881                               unsigned long val,
882                               unsigned int bytes,
883                               struct x86_emulate_ctxt *ctxt)
884 {
885         printk(KERN_ERR "emulator_write_std: addr %lx n %d\n",
886                addr, bytes);
887         return X86EMUL_UNHANDLEABLE;
888 }
889
890 static int emulator_read_emulated(unsigned long addr,
891                                   unsigned long *val,
892                                   unsigned int bytes,
893                                   struct x86_emulate_ctxt *ctxt)
894 {
895         struct kvm_vcpu *vcpu = ctxt->vcpu;
896
897         if (vcpu->mmio_read_completed) {
898                 memcpy(val, vcpu->mmio_data, bytes);
899                 vcpu->mmio_read_completed = 0;
900                 return X86EMUL_CONTINUE;
901         } else if (emulator_read_std(addr, val, bytes, ctxt)
902                    == X86EMUL_CONTINUE)
903                 return X86EMUL_CONTINUE;
904         else {
905                 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
906                 if (gpa == UNMAPPED_GVA)
907                         return vcpu_printf(vcpu, "not present\n"), X86EMUL_PROPAGATE_FAULT;
908                 vcpu->mmio_needed = 1;
909                 vcpu->mmio_phys_addr = gpa;
910                 vcpu->mmio_size = bytes;
911                 vcpu->mmio_is_write = 0;
912
913                 return X86EMUL_UNHANDLEABLE;
914         }
915 }
916
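/*
 * Write @bytes at guest physical address @gpa directly into guest RAM,
 * bracketed by the mmu pre/post write hooks.  Returns 1 on success, 0 if
 * the write crosses a page boundary or hits an address with no memory slot,
 * in which case the caller falls back to an MMIO exit to userspace.
 */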
917 static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
918                                unsigned long val, int bytes)
919 {
920         struct kvm_memory_slot *m;
921         struct page *page;
922         void *virt;
923
924         if (((gpa + bytes - 1) >> PAGE_SHIFT) != (gpa >> PAGE_SHIFT))
925                 return 0;
926         m = gfn_to_memslot(vcpu->kvm, gpa >> PAGE_SHIFT);
927         if (!m)
928                 return 0;
929         page = gfn_to_page(m, gpa >> PAGE_SHIFT);
930         kvm_mmu_pre_write(vcpu, gpa, bytes);
931         virt = kmap_atomic(page, KM_USER0);
932         memcpy(virt + offset_in_page(gpa), &val, bytes);
933         kunmap_atomic(virt, KM_USER0);
934         kvm_mmu_post_write(vcpu, gpa, bytes);
935         return 1;
936 }
937
938 static int emulator_write_emulated(unsigned long addr,
939                                    unsigned long val,
940                                    unsigned int bytes,
941                                    struct x86_emulate_ctxt *ctxt)
942 {
943         struct kvm_vcpu *vcpu = ctxt->vcpu;
944         gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
945
946         if (gpa == UNMAPPED_GVA)
947                 return X86EMUL_PROPAGATE_FAULT;
948
949         if (emulator_write_phys(vcpu, gpa, val, bytes))
950                 return X86EMUL_CONTINUE;
951
952         vcpu->mmio_needed = 1;
953         vcpu->mmio_phys_addr = gpa;
954         vcpu->mmio_size = bytes;
955         vcpu->mmio_is_write = 1;
956         memcpy(vcpu->mmio_data, &val, bytes);
957
958         return X86EMUL_CONTINUE;
959 }
960
961 static int emulator_cmpxchg_emulated(unsigned long addr,
962                                      unsigned long old,
963                                      unsigned long new,
964                                      unsigned int bytes,
965                                      struct x86_emulate_ctxt *ctxt)
966 {
967         static int reported;
968
969         if (!reported) {
970                 reported = 1;
971                 printk(KERN_WARNING "kvm: emulating exchange as write\n");
972         }
973         return emulator_write_emulated(addr, new, bytes, ctxt);
974 }
975
976 #ifdef CONFIG_X86_32
977
978 static int emulator_cmpxchg8b_emulated(unsigned long addr,
979                                        unsigned long old_lo,
980                                        unsigned long old_hi,
981                                        unsigned long new_lo,
982                                        unsigned long new_hi,
983                                        struct x86_emulate_ctxt *ctxt)
984 {
985         static int reported;
986         int r;
987
988         if (!reported) {
989                 reported = 1;
990                 printk(KERN_WARNING "kvm: emulating exchange8b as write\n");
991         }
992         r = emulator_write_emulated(addr, new_lo, 4, ctxt);
993         if (r != X86EMUL_CONTINUE)
994                 return r;
995         return emulator_write_emulated(addr+4, new_hi, 4, ctxt);
996 }
997
998 #endif
999
1000 static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
1001 {
1002         return kvm_arch_ops->get_segment_base(vcpu, seg);
1003 }
1004
1005 int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
1006 {
1007         return X86EMUL_CONTINUE;
1008 }
1009
1010 int emulate_clts(struct kvm_vcpu *vcpu)
1011 {
1012         unsigned long cr0;
1013
1014         kvm_arch_ops->decache_cr0_cr4_guest_bits(vcpu);
1015         cr0 = vcpu->cr0 & ~CR0_TS_MASK;
1016         kvm_arch_ops->set_cr0(vcpu, cr0);
1017         return X86EMUL_CONTINUE;
1018 }
1019
1020 int emulator_get_dr(struct x86_emulate_ctxt* ctxt, int dr, unsigned long *dest)
1021 {
1022         struct kvm_vcpu *vcpu = ctxt->vcpu;
1023
1024         switch (dr) {
1025         case 0 ... 3:
1026                 *dest = kvm_arch_ops->get_dr(vcpu, dr);
1027                 return X86EMUL_CONTINUE;
1028         default:
1029                 printk(KERN_DEBUG "%s: unexpected dr %u\n",
1030                        __FUNCTION__, dr);
1031                 return X86EMUL_UNHANDLEABLE;
1032         }
1033 }
1034
1035 int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
1036 {
1037         unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U;
1038         int exception;
1039
1040         kvm_arch_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception);
1041         if (exception) {
1042                 /* FIXME: better handling */
1043                 return X86EMUL_UNHANDLEABLE;
1044         }
1045         return X86EMUL_CONTINUE;
1046 }
1047
1048 static void report_emulation_failure(struct x86_emulate_ctxt *ctxt)
1049 {
1050         static int reported;
1051         u8 opcodes[4];
1052         unsigned long rip = ctxt->vcpu->rip;
1053         unsigned long rip_linear;
1054
1055         rip_linear = rip + get_segment_base(ctxt->vcpu, VCPU_SREG_CS);
1056
1057         if (reported)
1058                 return;
1059
1060         emulator_read_std(rip_linear, (void *)opcodes, 4, ctxt);
1061
1062         printk(KERN_ERR "emulation failed but !mmio_needed?"
1063                " rip %lx %02x %02x %02x %02x\n",
1064                rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
1065         reported = 1;
1066 }
1067
1068 struct x86_emulate_ops emulate_ops = {
1069         .read_std            = emulator_read_std,
1070         .write_std           = emulator_write_std,
1071         .read_emulated       = emulator_read_emulated,
1072         .write_emulated      = emulator_write_emulated,
1073         .cmpxchg_emulated    = emulator_cmpxchg_emulated,
1074 #ifdef CONFIG_X86_32
1075         .cmpxchg8b_emulated  = emulator_cmpxchg8b_emulated,
1076 #endif
1077 };
1078
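/*
 * Emulate the instruction at the current guest rip, typically after a fault
 * on an MMIO or write-protected address.  Builds an x86_emulate_ctxt from
 * the vcpu's segment and flag state, runs x86_emulate_memop(), and maps the
 * outcome to EMULATE_DONE, EMULATE_DO_MMIO or EMULATE_FAIL.
 */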
1079 int emulate_instruction(struct kvm_vcpu *vcpu,
1080                         struct kvm_run *run,
1081                         unsigned long cr2,
1082                         u16 error_code)
1083 {
1084         struct x86_emulate_ctxt emulate_ctxt;
1085         int r;
1086         int cs_db, cs_l;
1087
1088         kvm_arch_ops->cache_regs(vcpu);
1089
1090         kvm_arch_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
1091
1092         emulate_ctxt.vcpu = vcpu;
1093         emulate_ctxt.eflags = kvm_arch_ops->get_rflags(vcpu);
1094         emulate_ctxt.cr2 = cr2;
1095         emulate_ctxt.mode = (emulate_ctxt.eflags & X86_EFLAGS_VM)
1096                 ? X86EMUL_MODE_REAL : cs_l
1097                 ? X86EMUL_MODE_PROT64 : cs_db
1098                 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
1099
1100         if (emulate_ctxt.mode == X86EMUL_MODE_PROT64) {
1101                 emulate_ctxt.cs_base = 0;
1102                 emulate_ctxt.ds_base = 0;
1103                 emulate_ctxt.es_base = 0;
1104                 emulate_ctxt.ss_base = 0;
1105         } else {
1106                 emulate_ctxt.cs_base = get_segment_base(vcpu, VCPU_SREG_CS);
1107                 emulate_ctxt.ds_base = get_segment_base(vcpu, VCPU_SREG_DS);
1108                 emulate_ctxt.es_base = get_segment_base(vcpu, VCPU_SREG_ES);
1109                 emulate_ctxt.ss_base = get_segment_base(vcpu, VCPU_SREG_SS);
1110         }
1111
1112         emulate_ctxt.gs_base = get_segment_base(vcpu, VCPU_SREG_GS);
1113         emulate_ctxt.fs_base = get_segment_base(vcpu, VCPU_SREG_FS);
1114
1115         vcpu->mmio_is_write = 0;
1116         r = x86_emulate_memop(&emulate_ctxt, &emulate_ops);
1117
1118         if ((r || vcpu->mmio_is_write) && run) {
1119                 run->mmio.phys_addr = vcpu->mmio_phys_addr;
1120                 memcpy(run->mmio.data, vcpu->mmio_data, 8);
1121                 run->mmio.len = vcpu->mmio_size;
1122                 run->mmio.is_write = vcpu->mmio_is_write;
1123         }
1124
1125         if (r) {
1126                 if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
1127                         return EMULATE_DONE;
1128                 if (!vcpu->mmio_needed) {
1129                         report_emulation_failure(&emulate_ctxt);
1130                         return EMULATE_FAIL;
1131                 }
1132                 return EMULATE_DO_MMIO;
1133         }
1134
1135         kvm_arch_ops->decache_regs(vcpu);
1136         kvm_arch_ops->set_rflags(vcpu, emulate_ctxt.eflags);
1137
1138         if (vcpu->mmio_is_write)
1139                 return EMULATE_DO_MMIO;
1140
1141         return EMULATE_DONE;
1142 }
1143 EXPORT_SYMBOL_GPL(emulate_instruction);
1144
1145 static u64 mk_cr_64(u64 curr_cr, u32 new_val)
1146 {
1147         return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
1148 }
1149
1150 void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
1151 {
1152         struct descriptor_table dt = { limit, base };
1153
1154         kvm_arch_ops->set_gdt(vcpu, &dt);
1155 }
1156
1157 void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
1158 {
1159         struct descriptor_table dt = { limit, base };
1160
1161         kvm_arch_ops->set_idt(vcpu, &dt);
1162 }
1163
1164 void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
1165                    unsigned long *rflags)
1166 {
1167         lmsw(vcpu, msw);
1168         *rflags = kvm_arch_ops->get_rflags(vcpu);
1169 }
1170
1171 unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
1172 {
1173         kvm_arch_ops->decache_cr0_cr4_guest_bits(vcpu);
1174         switch (cr) {
1175         case 0:
1176                 return vcpu->cr0;
1177         case 2:
1178                 return vcpu->cr2;
1179         case 3:
1180                 return vcpu->cr3;
1181         case 4:
1182                 return vcpu->cr4;
1183         default:
1184                 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
1185                 return 0;
1186         }
1187 }
1188
1189 void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
1190                      unsigned long *rflags)
1191 {
1192         switch (cr) {
1193         case 0:
1194                 set_cr0(vcpu, mk_cr_64(vcpu->cr0, val));
1195                 *rflags = kvm_arch_ops->get_rflags(vcpu);
1196                 break;
1197         case 2:
1198                 vcpu->cr2 = val;
1199                 break;
1200         case 3:
1201                 set_cr3(vcpu, val);
1202                 break;
1203         case 4:
1204                 set_cr4(vcpu, mk_cr_64(vcpu->cr4, val));
1205                 break;
1206         default:
1207                 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
1208         }
1209 }
1210
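/*
 * Handle rdmsr for MSRs common to the VMX and SVM implementations: machine
 * check, microcode and MTRR registers read as zero, the APIC base and
 * MISC_ENABLE (and EFER on 64-bit) are read from the vcpu, and unknown MSRs
 * are logged and reported as a failure to the caller.
 */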
1211 int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1212 {
1213         u64 data;
1214
1215         switch (msr) {
1216         case 0xc0010010: /* SYSCFG */
1217         case 0xc0010015: /* HWCR */
1218         case MSR_IA32_PLATFORM_ID:
1219         case MSR_IA32_P5_MC_ADDR:
1220         case MSR_IA32_P5_MC_TYPE:
1221         case MSR_IA32_MC0_CTL:
1222         case MSR_IA32_MCG_STATUS:
1223         case MSR_IA32_MCG_CAP:
1224         case MSR_IA32_MC0_MISC:
1225         case MSR_IA32_MC0_MISC+4:
1226         case MSR_IA32_MC0_MISC+8:
1227         case MSR_IA32_MC0_MISC+12:
1228         case MSR_IA32_MC0_MISC+16:
1229         case MSR_IA32_UCODE_REV:
1230         case MSR_IA32_PERF_STATUS:
1231                 /* MTRR registers */
1232         case 0xfe:
1233         case 0x200 ... 0x2ff:
1234                 data = 0;
1235                 break;
1236         case 0xcd: /* fsb frequency */
1237                 data = 3;
1238                 break;
1239         case MSR_IA32_APICBASE:
1240                 data = vcpu->apic_base;
1241                 break;
1242         case MSR_IA32_MISC_ENABLE:
1243                 data = vcpu->ia32_misc_enable_msr;
1244                 break;
1245 #ifdef CONFIG_X86_64
1246         case MSR_EFER:
1247                 data = vcpu->shadow_efer;
1248                 break;
1249 #endif
1250         default:
1251                 printk(KERN_ERR "kvm: unhandled rdmsr: 0x%x\n", msr);
1252                 return 1;
1253         }
1254         *pdata = data;
1255         return 0;
1256 }
1257 EXPORT_SYMBOL_GPL(kvm_get_msr_common);
1258
1259 /*
1260  * Reads an msr value (of 'msr_index') into 'pdata'.
1261  * Returns 0 on success, non-0 otherwise.
1262  * Assumes vcpu_load() was already called.
1263  */
1264 static int get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
1265 {
1266         return kvm_arch_ops->get_msr(vcpu, msr_index, pdata);
1267 }
1268
1269 #ifdef CONFIG_X86_64
1270
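/*
 * Guest write to EFER: reject reserved bits, and reject toggling LME while
 * paging is enabled, with #GP.  The current LMA bit is preserved since only
 * the processor transitions it.
 */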
1271 static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
1272 {
1273         if (efer & EFER_RESERVED_BITS) {
1274                 printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n",
1275                        efer);
1276                 inject_gp(vcpu);
1277                 return;
1278         }
1279
1280         if (is_paging(vcpu)
1281             && (vcpu->shadow_efer & EFER_LME) != (efer & EFER_LME)) {
1282                 printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n");
1283                 inject_gp(vcpu);
1284                 return;
1285         }
1286
1287         kvm_arch_ops->set_efer(vcpu, efer);
1288
1289         efer &= ~EFER_LMA;
1290         efer |= vcpu->shadow_efer & EFER_LMA;
1291
1292         vcpu->shadow_efer = efer;
1293 }
1294
1295 #endif
1296
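/*
 * Handle wrmsr for MSRs common to the VMX and SVM implementations.  EFER,
 * the APIC base and MISC_ENABLE are tracked in the vcpu; microcode and MTRR
 * writes are accepted and ignored; unknown MSRs are logged and reported as
 * a failure to the caller.
 */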
1297 int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1298 {
1299         switch (msr) {
1300 #ifdef CONFIG_X86_64
1301         case MSR_EFER:
1302                 set_efer(vcpu, data);
1303                 break;
1304 #endif
1305         case MSR_IA32_MC0_STATUS:
1306                 printk(KERN_WARNING "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n",
1307                        __FUNCTION__, data);
1308                 break;
1309         case MSR_IA32_UCODE_REV:
1310         case MSR_IA32_UCODE_WRITE:
1311         case 0x200 ... 0x2ff: /* MTRRs */
1312                 break;
1313         case MSR_IA32_APICBASE:
1314                 vcpu->apic_base = data;
1315                 break;
1316         case MSR_IA32_MISC_ENABLE:
1317                 vcpu->ia32_misc_enable_msr = data;
1318                 break;
1319         default:
1320                 printk(KERN_ERR "kvm: unhandled wrmsr: 0x%x\n", msr);
1321                 return 1;
1322         }
1323         return 0;
1324 }
1325 EXPORT_SYMBOL_GPL(kvm_set_msr_common);
1326
1327 /*
1328  * Writes msr value into the appropriate "register".
1329  * Returns 0 on success, non-0 otherwise.
1330  * Assumes vcpu_load() was already called.
1331  */
1332 static int set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
1333 {
1334         return kvm_arch_ops->set_msr(vcpu, msr_index, data);
1335 }
1336
1337 void kvm_resched(struct kvm_vcpu *vcpu)
1338 {
1339         vcpu_put(vcpu);
1340         cond_resched();
1341         /* Cannot fail -  no vcpu unplug yet. */
1342         vcpu_load(vcpu->kvm, vcpu_slot(vcpu));
1343 }
1344 EXPORT_SYMBOL_GPL(kvm_resched);
1345
1346 void load_msrs(struct vmx_msr_entry *e, int n)
1347 {
1348         int i;
1349
1350         for (i = 0; i < n; ++i)
1351                 wrmsrl(e[i].index, e[i].data);
1352 }
1353 EXPORT_SYMBOL_GPL(load_msrs);
1354
1355 void save_msrs(struct vmx_msr_entry *e, int n)
1356 {
1357         int i;
1358
1359         for (i = 0; i < n; ++i)
1360                 rdmsrl(e[i].index, e[i].data);
1361 }
1362 EXPORT_SYMBOL_GPL(save_msrs);
1363
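/*
 * KVM_RUN: enter the guest on the requested vcpu.  Re-syncs the TPR (cr8)
 * from userspace, completes any pending emulated instruction or MMIO read,
 * and then hands off to the arch-specific run loop.
 */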
1364 static int kvm_dev_ioctl_run(struct kvm *kvm, struct kvm_run *kvm_run)
1365 {
1366         struct kvm_vcpu *vcpu;
1367         int r;
1368
1369         if (!valid_vcpu(kvm_run->vcpu))
1370                 return -EINVAL;
1371
1372         vcpu = vcpu_load(kvm, kvm_run->vcpu);
1373         if (!vcpu)
1374                 return -ENOENT;
1375
1376         /* re-sync apic's tpr */
1377         vcpu->cr8 = kvm_run->cr8;
1378
1379         if (kvm_run->emulated) {
1380                 kvm_arch_ops->skip_emulated_instruction(vcpu);
1381                 kvm_run->emulated = 0;
1382         }
1383
1384         if (kvm_run->mmio_completed) {
1385                 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
1386                 vcpu->mmio_read_completed = 1;
1387         }
1388
1389         vcpu->mmio_needed = 0;
1390
1391         r = kvm_arch_ops->run(vcpu, kvm_run);
1392
1393         vcpu_put(vcpu);
1394         return r;
1395 }
1396
1397 static int kvm_dev_ioctl_get_regs(struct kvm *kvm, struct kvm_regs *regs)
1398 {
1399         struct kvm_vcpu *vcpu;
1400
1401         if (!valid_vcpu(regs->vcpu))
1402                 return -EINVAL;
1403
1404         vcpu = vcpu_load(kvm, regs->vcpu);
1405         if (!vcpu)
1406                 return -ENOENT;
1407
1408         kvm_arch_ops->cache_regs(vcpu);
1409
1410         regs->rax = vcpu->regs[VCPU_REGS_RAX];
1411         regs->rbx = vcpu->regs[VCPU_REGS_RBX];
1412         regs->rcx = vcpu->regs[VCPU_REGS_RCX];
1413         regs->rdx = vcpu->regs[VCPU_REGS_RDX];
1414         regs->rsi = vcpu->regs[VCPU_REGS_RSI];
1415         regs->rdi = vcpu->regs[VCPU_REGS_RDI];
1416         regs->rsp = vcpu->regs[VCPU_REGS_RSP];
1417         regs->rbp = vcpu->regs[VCPU_REGS_RBP];
1418 #ifdef CONFIG_X86_64
1419         regs->r8 = vcpu->regs[VCPU_REGS_R8];
1420         regs->r9 = vcpu->regs[VCPU_REGS_R9];
1421         regs->r10 = vcpu->regs[VCPU_REGS_R10];
1422         regs->r11 = vcpu->regs[VCPU_REGS_R11];
1423         regs->r12 = vcpu->regs[VCPU_REGS_R12];
1424         regs->r13 = vcpu->regs[VCPU_REGS_R13];
1425         regs->r14 = vcpu->regs[VCPU_REGS_R14];
1426         regs->r15 = vcpu->regs[VCPU_REGS_R15];
1427 #endif
1428
1429         regs->rip = vcpu->rip;
1430         regs->rflags = kvm_arch_ops->get_rflags(vcpu);
1431
1432         /*
1433          * Don't leak debug flags in case they were set for guest debugging
1434          */
1435         if (vcpu->guest_debug.enabled && vcpu->guest_debug.singlestep)
1436                 regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
1437
1438         vcpu_put(vcpu);
1439
1440         return 0;
1441 }
1442
1443 static int kvm_dev_ioctl_set_regs(struct kvm *kvm, struct kvm_regs *regs)
1444 {
1445         struct kvm_vcpu *vcpu;
1446
1447         if (!valid_vcpu(regs->vcpu))
1448                 return -EINVAL;
1449
1450         vcpu = vcpu_load(kvm, regs->vcpu);
1451         if (!vcpu)
1452                 return -ENOENT;
1453
1454         vcpu->regs[VCPU_REGS_RAX] = regs->rax;
1455         vcpu->regs[VCPU_REGS_RBX] = regs->rbx;
1456         vcpu->regs[VCPU_REGS_RCX] = regs->rcx;
1457         vcpu->regs[VCPU_REGS_RDX] = regs->rdx;
1458         vcpu->regs[VCPU_REGS_RSI] = regs->rsi;
1459         vcpu->regs[VCPU_REGS_RDI] = regs->rdi;
1460         vcpu->regs[VCPU_REGS_RSP] = regs->rsp;
1461         vcpu->regs[VCPU_REGS_RBP] = regs->rbp;
1462 #ifdef CONFIG_X86_64
1463         vcpu->regs[VCPU_REGS_R8] = regs->r8;
1464         vcpu->regs[VCPU_REGS_R9] = regs->r9;
1465         vcpu->regs[VCPU_REGS_R10] = regs->r10;
1466         vcpu->regs[VCPU_REGS_R11] = regs->r11;
1467         vcpu->regs[VCPU_REGS_R12] = regs->r12;
1468         vcpu->regs[VCPU_REGS_R13] = regs->r13;
1469         vcpu->regs[VCPU_REGS_R14] = regs->r14;
1470         vcpu->regs[VCPU_REGS_R15] = regs->r15;
1471 #endif
1472
1473         vcpu->rip = regs->rip;
1474         kvm_arch_ops->set_rflags(vcpu, regs->rflags);
1475
1476         kvm_arch_ops->decache_regs(vcpu);
1477
1478         vcpu_put(vcpu);
1479
1480         return 0;
1481 }
1482
1483 static void get_segment(struct kvm_vcpu *vcpu,
1484                         struct kvm_segment *var, int seg)
1485 {
1486         return kvm_arch_ops->get_segment(vcpu, var, seg);
1487 }
1488
1489 static int kvm_dev_ioctl_get_sregs(struct kvm *kvm, struct kvm_sregs *sregs)
1490 {
1491         struct kvm_vcpu *vcpu;
1492         struct descriptor_table dt;
1493
1494         if (!valid_vcpu(sregs->vcpu))
1495                 return -EINVAL;
1496         vcpu = vcpu_load(kvm, sregs->vcpu);
1497         if (!vcpu)
1498                 return -ENOENT;
1499
1500         get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
1501         get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
1502         get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
1503         get_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
1504         get_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
1505         get_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
1506
1507         get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
1508         get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
1509
1510         kvm_arch_ops->get_idt(vcpu, &dt);
1511         sregs->idt.limit = dt.limit;
1512         sregs->idt.base = dt.base;
1513         kvm_arch_ops->get_gdt(vcpu, &dt);
1514         sregs->gdt.limit = dt.limit;
1515         sregs->gdt.base = dt.base;
1516
1517         kvm_arch_ops->decache_cr0_cr4_guest_bits(vcpu);
1518         sregs->cr0 = vcpu->cr0;
1519         sregs->cr2 = vcpu->cr2;
1520         sregs->cr3 = vcpu->cr3;
1521         sregs->cr4 = vcpu->cr4;
1522         sregs->cr8 = vcpu->cr8;
1523         sregs->efer = vcpu->shadow_efer;
1524         sregs->apic_base = vcpu->apic_base;
1525
1526         memcpy(sregs->interrupt_bitmap, vcpu->irq_pending,
1527                sizeof sregs->interrupt_bitmap);
1528
1529         vcpu_put(vcpu);
1530
1531         return 0;
1532 }
1533
1534 static void set_segment(struct kvm_vcpu *vcpu,
1535                         struct kvm_segment *var, int seg)
1536 {
1537         return kvm_arch_ops->set_segment(vcpu, var, seg);
1538 }
1539
1540 static int kvm_dev_ioctl_set_sregs(struct kvm *kvm, struct kvm_sregs *sregs)
1541 {
1542         struct kvm_vcpu *vcpu;
1543         int mmu_reset_needed = 0;
1544         int i;
1545         struct descriptor_table dt;
1546
1547         if (!valid_vcpu(sregs->vcpu))
1548                 return -EINVAL;
1549         vcpu = vcpu_load(kvm, sregs->vcpu);
1550         if (!vcpu)
1551                 return -ENOENT;
1552
1553         set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
1554         set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
1555         set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
1556         set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
1557         set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
1558         set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
1559
1560         set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
1561         set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
1562
1563         dt.limit = sregs->idt.limit;
1564         dt.base = sregs->idt.base;
1565         kvm_arch_ops->set_idt(vcpu, &dt);
1566         dt.limit = sregs->gdt.limit;
1567         dt.base = sregs->gdt.base;
1568         kvm_arch_ops->set_gdt(vcpu, &dt);
1569
1570         vcpu->cr2 = sregs->cr2;
1571         mmu_reset_needed |= vcpu->cr3 != sregs->cr3;
1572         vcpu->cr3 = sregs->cr3;
1573
1574         vcpu->cr8 = sregs->cr8;
1575
1576         mmu_reset_needed |= vcpu->shadow_efer != sregs->efer;
1577 #ifdef CONFIG_X86_64
1578         kvm_arch_ops->set_efer(vcpu, sregs->efer);
1579 #endif
1580         vcpu->apic_base = sregs->apic_base;
1581
1582         kvm_arch_ops->decache_cr0_cr4_guest_bits(vcpu);
1583
1584         mmu_reset_needed |= vcpu->cr0 != sregs->cr0;
1585         kvm_arch_ops->set_cr0_no_modeswitch(vcpu, sregs->cr0);
1586
1587         mmu_reset_needed |= vcpu->cr4 != sregs->cr4;
1588         kvm_arch_ops->set_cr4(vcpu, sregs->cr4);
1589         if (!is_long_mode(vcpu) && is_pae(vcpu))
1590                 load_pdptrs(vcpu, vcpu->cr3);
1591
1592         if (mmu_reset_needed)
1593                 kvm_mmu_reset_context(vcpu);
1594
1595         memcpy(vcpu->irq_pending, sregs->interrupt_bitmap,
1596                sizeof vcpu->irq_pending);
1597         vcpu->irq_summary = 0;
1598         for (i = 0; i < NR_IRQ_WORDS; ++i)
1599                 if (vcpu->irq_pending[i])
1600                         __set_bit(i, &vcpu->irq_summary);
1601
1602         vcpu_put(vcpu);
1603
1604         return 0;
1605 }
1606
1607 /*
1608  * List of msr numbers which we expose to userspace through KVM_GET_MSRS
1609  * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
1610  *
1611  * This list is modified at module load time to reflect the
1612  * capabilities of the host cpu.
1613  */
1614 static u32 msrs_to_save[] = {
1615         MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
1616         MSR_K6_STAR,
1617 #ifdef CONFIG_X86_64
1618         MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
1619 #endif
1620         MSR_IA32_TIME_STAMP_COUNTER,
1621 };
1622
1623 static unsigned num_msrs_to_save;
1624
1625 static u32 emulated_msrs[] = {
1626         MSR_IA32_MISC_ENABLE,
1627 };
1628
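/*
 * At module load, probe each entry of msrs_to_save[] with rdmsr_safe() and
 * compact the list down to the MSRs that actually exist on this host cpu.
 */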
1629 static __init void kvm_init_msr_list(void)
1630 {
1631         u32 dummy[2];
1632         unsigned i, j;
1633
1634         for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) {
1635                 if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
1636                         continue;
1637                 if (j < i)
1638                         msrs_to_save[j] = msrs_to_save[i];
1639                 j++;
1640         }
1641         num_msrs_to_save = j;
1642 }
1643
1644 /*
1645  * Adapt set_msr() to msr_io()'s calling convention
1646  */
1647 static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
1648 {
1649         return set_msr(vcpu, index, *data);
1650 }
1651
1652 /*
1653  * Read or write a bunch of msrs. All parameters are kernel addresses.
1654  *
1655  * @return number of msrs set successfully.
1656  */
1657 static int __msr_io(struct kvm *kvm, struct kvm_msrs *msrs,
1658                     struct kvm_msr_entry *entries,
1659                     int (*do_msr)(struct kvm_vcpu *vcpu,
1660                                   unsigned index, u64 *data))
1661 {
1662         struct kvm_vcpu *vcpu;
1663         int i;
1664
1665         if (!valid_vcpu(msrs->vcpu))
1666                 return -EINVAL;
1667
1668         vcpu = vcpu_load(kvm, msrs->vcpu);
1669         if (!vcpu)
1670                 return -ENOENT;
1671
1672         for (i = 0; i < msrs->nmsrs; ++i)
1673                 if (do_msr(vcpu, entries[i].index, &entries[i].data))
1674                         break;
1675
1676         vcpu_put(vcpu);
1677
1678         return i;
1679 }
1680
1681 /*
1682  * Read or write a bunch of msrs. Parameters are user addresses; a
1683  * userspace usage sketch follows this function.
1684  * @return number of msrs processed successfully.
1685  */
1686 static int msr_io(struct kvm *kvm, struct kvm_msrs __user *user_msrs,
1687                   int (*do_msr)(struct kvm_vcpu *vcpu,
1688                                 unsigned index, u64 *data),
1689                   int writeback)
1690 {
1691         struct kvm_msrs msrs;
1692         struct kvm_msr_entry *entries;
1693         int r, n;
1694         unsigned size;
1695
1696         r = -EFAULT;
1697         if (copy_from_user(&msrs, user_msrs, sizeof msrs))
1698                 goto out;
1699
1700         r = -E2BIG;
1701         if (msrs.nmsrs >= MAX_IO_MSRS)
1702                 goto out;
1703
1704         r = -ENOMEM;
1705         size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
1706         entries = vmalloc(size);
1707         if (!entries)
1708                 goto out;
1709
1710         r = -EFAULT;
1711         if (copy_from_user(entries, user_msrs->entries, size))
1712                 goto out_free;
1713
1714         r = n = __msr_io(kvm, &msrs, entries, do_msr);
1715         if (r < 0)
1716                 goto out_free;
1717
1718         r = -EFAULT;
1719         if (writeback && copy_to_user(user_msrs->entries, entries, size))
1720                 goto out_free;
1721
1722         r = n;
1723
1724 out_free:
1725         vfree(entries);
1726 out:
1727         return r;
1728 }
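
/*
 * A minimal userspace sketch of KVM_GET_MSRS going through msr_io()
 * above.  Illustrative only; struct layouts follow this tree's
 * <linux/kvm.h> and error handling is omitted.
 *
 *	struct kvm_msrs *m = malloc(sizeof(*m) + sizeof(struct kvm_msr_entry));
 *
 *	m->vcpu = 0;
 *	m->nmsrs = 1;
 *	m->entries[0].index = MSR_IA32_SYSENTER_CS;
 *	// The return value is the number of msrs processed, not 0/-1.
 *	if (ioctl(fd, KVM_GET_MSRS, m) == 1)
 *		printf("SYSENTER_CS = %#llx\n",
 *		       (unsigned long long)m->entries[0].data);
 */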
1729
1730 /*
1731  * Translate a guest virtual address to a guest physical address.
1732  */
1733 static int kvm_dev_ioctl_translate(struct kvm *kvm, struct kvm_translation *tr)
1734 {
1735         unsigned long vaddr = tr->linear_address;
1736         struct kvm_vcpu *vcpu;
1737         gpa_t gpa;
1738
1739         vcpu = vcpu_load(kvm, tr->vcpu);
1740         if (!vcpu)
1741                 return -ENOENT;
1742         spin_lock(&kvm->lock);
1743         gpa = vcpu->mmu.gva_to_gpa(vcpu, vaddr);
1744         tr->physical_address = gpa;
1745         tr->valid = gpa != UNMAPPED_GVA;
1746         tr->writeable = 1;
1747         tr->usermode = 0;
1748         spin_unlock(&kvm->lock);
1749         vcpu_put(vcpu);
1750
1751         return 0;
1752 }
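
/*
 * A minimal userspace sketch of the KVM_TRANSLATE ioctl implemented by
 * kvm_dev_ioctl_translate() above.  Illustrative only; `gva' stands for
 * a guest-virtual address chosen by the caller.
 *
 *	struct kvm_translation tr = { .vcpu = 0, .linear_address = gva };
 *
 *	if (ioctl(fd, KVM_TRANSLATE, &tr) == 0 && tr.valid)
 *		printf("gva %#llx -> gpa %#llx\n",
 *		       (unsigned long long)gva,
 *		       (unsigned long long)tr.physical_address);
 */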
1753
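/*
 * Queue an external interrupt for a vcpu: after validating the vcpu and
 * irq numbers, mark the irq in the vcpu's irq_pending bitmap and set the
 * corresponding bit in irq_summary so the pending word can be found
 * quickly when the vcpu is next run.
 */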
1754 static int kvm_dev_ioctl_interrupt(struct kvm *kvm, struct kvm_interrupt *irq)
1755 {
1756         struct kvm_vcpu *vcpu;
1757
1758         if (!valid_vcpu(irq->vcpu))
1759                 return -EINVAL;
1760         if (irq->irq < 0 || irq->irq >= 256)
1761                 return -EINVAL;
1762         vcpu = vcpu_load(kvm, irq->vcpu);
1763         if (!vcpu)
1764                 return -ENOENT;
1765
1766         set_bit(irq->irq, vcpu->irq_pending);
1767         set_bit(irq->irq / BITS_PER_LONG, &vcpu->irq_summary);
1768
1769         vcpu_put(vcpu);
1770
1771         return 0;
1772 }
1773
1774 static int kvm_dev_ioctl_debug_guest(struct kvm *kvm,
1775                                      struct kvm_debug_guest *dbg)
1776 {
1777         struct kvm_vcpu *vcpu;
1778         int r;
1779
1780         if (!valid_vcpu(dbg->vcpu))
1781                 return -EINVAL;
1782         vcpu = vcpu_load(kvm, dbg->vcpu);
1783         if (!vcpu)
1784                 return -ENOENT;
1785
1786         r = kvm_arch_ops->set_guest_debug(vcpu, dbg);
1787
1788         vcpu_put(vcpu);
1789
1790         return r;
1791 }
1792
1793 static long kvm_dev_ioctl(struct file *filp,
1794                           unsigned int ioctl, unsigned long arg)
1795 {
1796         struct kvm *kvm = filp->private_data;
1797         void __user *argp = (void __user *)arg;
1798         int r = -EINVAL;
1799
1800         switch (ioctl) {
1801         case KVM_GET_API_VERSION:
1802                 r = KVM_API_VERSION;
1803                 break;
1804         case KVM_CREATE_VCPU: {
1805                 r = kvm_dev_ioctl_create_vcpu(kvm, arg);
1806                 if (r)
1807                         goto out;
1808                 break;
1809         }
1810         case KVM_RUN: {
1811                 struct kvm_run kvm_run;
1812
1813                 r = -EFAULT;
1814                 if (copy_from_user(&kvm_run, argp, sizeof kvm_run))
1815                         goto out;
1816                 r = kvm_dev_ioctl_run(kvm, &kvm_run);
1817                 if (r < 0 && r != -EINTR)
1818                         goto out;
1819                 if (copy_to_user(argp, &kvm_run, sizeof kvm_run)) {
1820                         r = -EFAULT;
1821                         goto out;
1822                 }
1823                 break;
1824         }
1825         case KVM_GET_REGS: {
1826                 struct kvm_regs kvm_regs;
1827
1828                 r = -EFAULT;
1829                 if (copy_from_user(&kvm_regs, argp, sizeof kvm_regs))
1830                         goto out;
1831                 r = kvm_dev_ioctl_get_regs(kvm, &kvm_regs);
1832                 if (r)
1833                         goto out;
1834                 r = -EFAULT;
1835                 if (copy_to_user(argp, &kvm_regs, sizeof kvm_regs))
1836                         goto out;
1837                 r = 0;
1838                 break;
1839         }
1840         case KVM_SET_REGS: {
1841                 struct kvm_regs kvm_regs;
1842
1843                 r = -EFAULT;
1844                 if (copy_from_user(&kvm_regs, argp, sizeof kvm_regs))
1845                         goto out;
1846                 r = kvm_dev_ioctl_set_regs(kvm, &kvm_regs);
1847                 if (r)
1848                         goto out;
1849                 r = 0;
1850                 break;
1851         }
1852         case KVM_GET_SREGS: {
1853                 struct kvm_sregs kvm_sregs;
1854
1855                 r = -EFAULT;
1856                 if (copy_from_user(&kvm_sregs, argp, sizeof kvm_sregs))
1857                         goto out;
1858                 r = kvm_dev_ioctl_get_sregs(kvm, &kvm_sregs);
1859                 if (r)
1860                         goto out;
1861                 r = -EFAULT;
1862                 if (copy_to_user(argp, &kvm_sregs, sizeof kvm_sregs))
1863                         goto out;
1864                 r = 0;
1865                 break;
1866         }
1867         case KVM_SET_SREGS: {
1868                 struct kvm_sregs kvm_sregs;
1869
1870                 r = -EFAULT;
1871                 if (copy_from_user(&kvm_sregs, argp, sizeof kvm_sregs))
1872                         goto out;
1873                 r = kvm_dev_ioctl_set_sregs(kvm, &kvm_sregs);
1874                 if (r)
1875                         goto out;
1876                 r = 0;
1877                 break;
1878         }
1879         case KVM_TRANSLATE: {
1880                 struct kvm_translation tr;
1881
1882                 r = -EFAULT;
1883                 if (copy_from_user(&tr, argp, sizeof tr))
1884                         goto out;
1885                 r = kvm_dev_ioctl_translate(kvm, &tr);
1886                 if (r)
1887                         goto out;
1888                 r = -EFAULT;
1889                 if (copy_to_user(argp, &tr, sizeof tr))
1890                         goto out;
1891                 r = 0;
1892                 break;
1893         }
1894         case KVM_INTERRUPT: {
1895                 struct kvm_interrupt irq;
1896
1897                 r = -EFAULT;
1898                 if (copy_from_user(&irq, argp, sizeof irq))
1899                         goto out;
1900                 r = kvm_dev_ioctl_interrupt(kvm, &irq);
1901                 if (r)
1902                         goto out;
1903                 r = 0;
1904                 break;
1905         }
1906         case KVM_DEBUG_GUEST: {
1907                 struct kvm_debug_guest dbg;
1908
1909                 r = -EFAULT;
1910                 if (copy_from_user(&dbg, argp, sizeof dbg))
1911                         goto out;
1912                 r = kvm_dev_ioctl_debug_guest(kvm, &dbg);
1913                 if (r)
1914                         goto out;
1915                 r = 0;
1916                 break;
1917         }
1918         case KVM_SET_MEMORY_REGION: {
1919                 struct kvm_memory_region kvm_mem;
1920
1921                 r = -EFAULT;
1922                 if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem))
1923                         goto out;
1924                 r = kvm_dev_ioctl_set_memory_region(kvm, &kvm_mem);
1925                 if (r)
1926                         goto out;
1927                 break;
1928         }
1929         case KVM_GET_DIRTY_LOG: {
1930                 struct kvm_dirty_log log;
1931
1932                 r = -EFAULT;
1933                 if (copy_from_user(&log, argp, sizeof log))
1934                         goto out;
1935                 r = kvm_dev_ioctl_get_dirty_log(kvm, &log);
1936                 if (r)
1937                         goto out;
1938                 break;
1939         }
1940         case KVM_GET_MSRS:
1941                 r = msr_io(kvm, argp, get_msr, 1);
1942                 break;
1943         case KVM_SET_MSRS:
1944                 r = msr_io(kvm, argp, do_set_msr, 0);
1945                 break;
1946         case KVM_GET_MSR_INDEX_LIST: {
1947                 struct kvm_msr_list __user *user_msr_list = argp;
1948                 struct kvm_msr_list msr_list;
1949                 unsigned n;
1950
1951                 r = -EFAULT;
1952                 if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
1953                         goto out;
1954                 n = msr_list.nmsrs;
1955                 msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs);
1956                 if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
1957                         goto out;
1958                 r = -E2BIG;
1959                 if (n < num_msrs_to_save + ARRAY_SIZE(emulated_msrs))
1960                         goto out;
1961                 r = -EFAULT;
1962                 if (copy_to_user(user_msr_list->indices, &msrs_to_save,
1963                                  num_msrs_to_save * sizeof(u32)))
1964                         goto out;
1965                 if (copy_to_user(user_msr_list->indices
1966                                  + num_msrs_to_save,
1967                                  &emulated_msrs,
1968                                  ARRAY_SIZE(emulated_msrs) * sizeof(u32)))
1969                         goto out;
1970                 r = 0;
1971                 break;
1972         }
1973         default:
1974                 ;
1975         }
1976 out:
1977         return r;
1978 }
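
/*
 * A minimal userspace sketch of the ioctl sequence dispatched above.
 * Illustrative only: the sizes are arbitrary, error handling is omitted,
 * and (as with the other per-vcpu requests in this API revision) the
 * vcpu number is assumed to be carried inside the argument struct.
 *
 *	int fd = open("/dev/kvm", O_RDWR);
 *	struct kvm_memory_region mem = {
 *		.slot = 0,
 *		.guest_phys_addr = 0,
 *		.memory_size = 16 << 20,
 *	};
 *	struct kvm_run run;
 *
 *	assert(ioctl(fd, KVM_GET_API_VERSION, 0) == KVM_API_VERSION);
 *	ioctl(fd, KVM_SET_MEMORY_REGION, &mem);
 *	ioctl(fd, KVM_CREATE_VCPU, 0);		// argument is the vcpu slot
 *	memset(&run, 0, sizeof run);
 *	run.vcpu = 0;
 *	ioctl(fd, KVM_RUN, &run);		// returns on each guest exit
 */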
1979
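/*
 * Fault handler for mmap()s of /dev/kvm: the page offset into the
 * mapping is interpreted as a guest frame number, so userspace can map
 * guest physical memory directly, e.g. (illustrative; `gpa' must be
 * page aligned):
 *
 *	mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, gpa);
 *
 * Offsets that fall outside every memory slot raise SIGBUS.
 */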
1980 static struct page *kvm_dev_nopage(struct vm_area_struct *vma,
1981                                    unsigned long address,
1982                                    int *type)
1983 {
1984         struct kvm *kvm = vma->vm_file->private_data;
1985         unsigned long pgoff;
1986         struct kvm_memory_slot *slot;
1987         struct page *page;
1988
1989         *type = VM_FAULT_MINOR;
1990         pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
1991         slot = gfn_to_memslot(kvm, pgoff);
1992         if (!slot)
1993                 return NOPAGE_SIGBUS;
1994         page = gfn_to_page(slot, pgoff);
1995         if (!page)
1996                 return NOPAGE_SIGBUS;
1997         get_page(page);
1998         return page;
1999 }
2000
2001 static struct vm_operations_struct kvm_dev_vm_ops = {
2002         .nopage = kvm_dev_nopage,
2003 };
2004
2005 static int kvm_dev_mmap(struct file *file, struct vm_area_struct *vma)
2006 {
2007         vma->vm_ops = &kvm_dev_vm_ops;
2008         return 0;
2009 }
2010
2011 static struct file_operations kvm_chardev_ops = {
2012         .open           = kvm_dev_open,
2013         .release        = kvm_dev_release,
2014         .unlocked_ioctl = kvm_dev_ioctl,
2015         .compat_ioctl   = kvm_dev_ioctl,
2016         .mmap           = kvm_dev_mmap,
2017 };
2018
2019 static struct miscdevice kvm_dev = {
2020         MISC_DYNAMIC_MINOR,
2021         "kvm",
2022         &kvm_chardev_ops,
2023 };
2024
2025 static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
2026                        void *v)
2027 {
2028         if (val == SYS_RESTART) {
2029                 /*
2030                  * Some (well, at least mine) BIOSes hang on reboot if
2031                  * in vmx root mode.
2032                  */
2033                 printk(KERN_INFO "kvm: exiting hardware virtualization\n");
2034                 on_each_cpu(kvm_arch_ops->hardware_disable, NULL, 0, 1);
2035         }
2036         return NOTIFY_OK;
2037 }
2038
2039 static struct notifier_block kvm_reboot_notifier = {
2040         .notifier_call = kvm_reboot,
2041         .priority = 0,
2042 };
2043
2044 /*
2045  * Make sure that a cpu that is being hot-unplugged does not have any vcpus
2046  * cached on it.
2047  */
2048 static void decache_vcpus_on_cpu(int cpu)
2049 {
2050         struct kvm *vm;
2051         struct kvm_vcpu *vcpu;
2052         int i;
2053
2054         spin_lock(&kvm_lock);
2055         list_for_each_entry(vm, &vm_list, vm_list)
2056                 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
2057                         vcpu = &vm->vcpus[i];
2058                         /*
2059                          * If the vcpu is locked, then it is running on some
2060                          * other cpu and therefore it is not cached on the
2061                          * cpu in question.
2062                          *
2063                          * If it's not locked, check the last cpu it executed
2064                          * on.
2065                          */
2066                         if (mutex_trylock(&vcpu->mutex)) {
2067                                 if (vcpu->cpu == cpu) {
2068                                         kvm_arch_ops->vcpu_decache(vcpu);
2069                                         vcpu->cpu = -1;
2070                                 }
2071                                 mutex_unlock(&vcpu->mutex);
2072                         }
2073                 }
2074         spin_unlock(&kvm_lock);
2075 }
2076
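/*
 * CPU hotplug callback: when a cpu is going away (or a failed bring-up
 * is rolled back), flush any vcpu state cached on it and disable the
 * virtualization extensions there; when a cpu is being brought up,
 * enable the extensions so vcpus may run on it.
 */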
2077 static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
2078                            void *v)
2079 {
2080         int cpu = (long)v;
2081
2082         switch (val) {
2083         case CPU_DEAD:
2084         case CPU_UP_CANCELED:
2085                 decache_vcpus_on_cpu(cpu);
2086                 smp_call_function_single(cpu, kvm_arch_ops->hardware_disable,
2087                                          NULL, 0, 1);
2088                 break;
2089         case CPU_UP_PREPARE:
2090                 smp_call_function_single(cpu, kvm_arch_ops->hardware_enable,
2091                                          NULL, 0, 1);
2092                 break;
2093         }
2094         return NOTIFY_OK;
2095 }
2096
2097 static struct notifier_block kvm_cpu_notifier = {
2098         .notifier_call = kvm_cpu_hotplug,
2099         .priority = 20, /* must be > scheduler priority */
2100 };
2101
2102 static __init void kvm_init_debug(void)
2103 {
2104         struct kvm_stats_debugfs_item *p;
2105
2106         debugfs_dir = debugfs_create_dir("kvm", NULL);
2107         for (p = debugfs_entries; p->name; ++p)
2108                 p->dentry = debugfs_create_u32(p->name, 0444, debugfs_dir,
2109                                                p->data);
2110 }
2111
2112 static void kvm_exit_debug(void)
2113 {
2114         struct kvm_stats_debugfs_item *p;
2115
2116         for (p = debugfs_entries; p->name; ++p)
2117                 debugfs_remove(p->dentry);
2118         debugfs_remove(debugfs_dir);
2119 }
2120
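/*
 * Power management hooks: on suspend, flush vcpu state cached on this
 * cpu and disable the virtualization extensions on all cpus; re-enable
 * them on resume.
 */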
2121 static int kvm_suspend(struct sys_device *dev, pm_message_t state)
2122 {
2123         decache_vcpus_on_cpu(raw_smp_processor_id());
2124         on_each_cpu(kvm_arch_ops->hardware_disable, NULL, 0, 1);
2125         return 0;
2126 }
2127
2128 static int kvm_resume(struct sys_device *dev)
2129 {
2130         on_each_cpu(kvm_arch_ops->hardware_enable, NULL, 0, 1);
2131         return 0;
2132 }
2133
2134 static struct sysdev_class kvm_sysdev_class = {
2135         set_kset_name("kvm"),
2136         .suspend = kvm_suspend,
2137         .resume = kvm_resume,
2138 };
2139
2140 static struct sys_device kvm_sysdev = {
2141         .id = 0,
2142         .cls = &kvm_sysdev_class,
2143 };
2144
2145 hpa_t bad_page_address;
2146
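/*
 * Called by an architecture support module (e.g. the VMX or SVM
 * implementation) to register its kvm_arch_ops; only one such module
 * may be loaded at a time.  Bring-up order: hardware_setup(), enable
 * virtualization on every cpu, register the cpu hotplug and reboot
 * notifiers, the sysdev class used for suspend/resume, and finally the
 * /dev/kvm misc device.
 */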
2147 int kvm_init_arch(struct kvm_arch_ops *ops, struct module *module)
2148 {
2149         int r;
2150
2151         if (kvm_arch_ops) {
2152                 printk(KERN_ERR "kvm: already loaded the other module\n");
2153                 return -EEXIST;
2154         }
2155
2156         if (!ops->cpu_has_kvm_support()) {
2157                 printk(KERN_ERR "kvm: no hardware support\n");
2158                 return -EOPNOTSUPP;
2159         }
2160         if (ops->disabled_by_bios()) {
2161                 printk(KERN_ERR "kvm: disabled by bios\n");
2162                 return -EOPNOTSUPP;
2163         }
2164
2165         kvm_arch_ops = ops;
2166
2167         r = kvm_arch_ops->hardware_setup();
2168         if (r < 0)
2169                 return r;
2170
2171         on_each_cpu(kvm_arch_ops->hardware_enable, NULL, 0, 1);
2172         r = register_cpu_notifier(&kvm_cpu_notifier);
2173         if (r)
2174                 goto out_free_1;
2175         register_reboot_notifier(&kvm_reboot_notifier);
2176
2177         r = sysdev_class_register(&kvm_sysdev_class);
2178         if (r)
2179                 goto out_free_2;
2180
2181         r = sysdev_register(&kvm_sysdev);
2182         if (r)
2183                 goto out_free_3;
2184
2185         kvm_chardev_ops.owner = module;
2186
2187         r = misc_register(&kvm_dev);
2188         if (r) {
2189                 printk(KERN_ERR "kvm: misc device register failed\n");
2190                 goto out_free;
2191         }
2192
2193         return r;
2194
2195 out_free:
2196         sysdev_unregister(&kvm_sysdev);
2197 out_free_3:
2198         sysdev_class_unregister(&kvm_sysdev_class);
2199 out_free_2:
2200         unregister_reboot_notifier(&kvm_reboot_notifier);
2201         unregister_cpu_notifier(&kvm_cpu_notifier);
2202 out_free_1:
2203         on_each_cpu(kvm_arch_ops->hardware_disable, NULL, 0, 1);
2204         kvm_arch_ops->hardware_unsetup();
2205         return r;
2206 }
2207
2208 void kvm_exit_arch(void)
2209 {
2210         misc_deregister(&kvm_dev);
2211         sysdev_unregister(&kvm_sysdev);
2212         sysdev_class_unregister(&kvm_sysdev_class);
2213         unregister_reboot_notifier(&kvm_reboot_notifier);
2214         unregister_cpu_notifier(&kvm_cpu_notifier);
2215         on_each_cpu(kvm_arch_ops->hardware_disable, NULL, 0, 1);
2216         kvm_arch_ops->hardware_unsetup();
2217         kvm_arch_ops = NULL;
2218 }
2219
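/*
 * Generic module initialization: create the debugfs statistics, trim
 * the MSR list to what the host supports, and allocate the zeroed page
 * exported as bad_page_address.  The architecture support modules hook
 * in later through kvm_init_arch().
 */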
2220 static __init int kvm_init(void)
2221 {
2222         static struct page *bad_page;
2223         int r = 0;
2224
2225         kvm_init_debug();
2226
2227         kvm_init_msr_list();
2228
2229         if ((bad_page = alloc_page(GFP_KERNEL)) == NULL) {
2230                 r = -ENOMEM;
2231                 goto out;
2232         }
2233
2234         bad_page_address = page_to_pfn(bad_page) << PAGE_SHIFT;
2235         memset(__va(bad_page_address), 0, PAGE_SIZE);
2236
2237         return r;
2238
2239 out:
2240         kvm_exit_debug();
2241         return r;
2242 }
2243
2244 static __exit void kvm_exit(void)
2245 {
2246         kvm_exit_debug();
2247         __free_page(pfn_to_page(bad_page_address >> PAGE_SHIFT));
2248 }
2249
2250 module_init(kvm_init)
2251 module_exit(kvm_exit)
2252
2253 EXPORT_SYMBOL_GPL(kvm_init_arch);
2254 EXPORT_SYMBOL_GPL(kvm_exit_arch);