[原创]QEMU/KVM虚拟机运行核心流程
2022-9-7 12:23:0 Author: bbs.pediy.com(查看原文) 阅读量:15 收藏

这几天学习了虚拟机在创建和运行过程中,QEMU和KVM的核心执行流程。当然只是大概过程,并没有做到流程中的每个函数都分析。
很喜欢侯捷老师的一句话:源码之前,了无秘密。我阅读的源码是qemu-6.2.0和linux-5.15.39。

编译安装qemu的过程很简单,参考官方文档就行。
可以直接用gdb命令行调试qemu,也可以vscode搭配gdb,调试属于基本能力,不多说。
动态调试qemu,并结合qemu源码分析流程。
启动参数如下:

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

/*
 * Abridged excerpt of qemu_init() from qemu-6.2.0 (regions marked //... are
 * elided). It walks argv and records the options of interest (-cpu, -m,
 * -enable-kvm, -M/-machine, -smp), applies the machine options, configures
 * the accelerator, and then leaves the preconfig state via
 * qmp_x_exit_preconfig().
 */
void qemu_init(int argc, char **argv, char **envp)

{

    //...

    // Parse the command-line arguments

    for(;;) {

        if (optind >= argc)

            break;

        if (argv[optind][0] != '-') {

            loc_set_cmdline(argv, optind, 1);

            drive_add(IF_DEFAULT, 0, argv[optind++], HD_OPTS);

        } else {

            const QEMUOption *popt;

            popt = lookup_opt(argc, argv, &optarg, &optind);

            if (!(popt->arch_mask & arch_type)) {

                error_report("Option not supported for this target");

                exit(1);

            }

            switch(popt->index) {

            case QEMU_OPTION_cpu:

                /* hw initialization will check this */

                cpu_option = optarg;

                break;

            //...

            // The options below are the ones this walkthrough focuses on

            case QEMU_OPTION_m:

                opts = qemu_opts_parse_noisily(qemu_find_opts("memory"),

                                               optarg, true);

                if (!opts) {

                    exit(EXIT_FAILURE);

                }

                break;

            case QEMU_OPTION_enable_kvm:

                qdict_put_str(machine_opts_dict, "accel", "kvm");

                break;

            case QEMU_OPTION_M:

            case QEMU_OPTION_machine:

                {

                    bool help;

                    keyval_parse_into(machine_opts_dict, optarg, "type", &help, &error_fatal);

                    if (help) {

                        machine_help_func(machine_opts_dict);

                        exit(EXIT_SUCCESS);

                    }

                    break;

                }

            case QEMU_OPTION_smp:

                machine_parse_property_opt(qemu_find_opts("smp-opts"),

                                           "smp", optarg);

                break;

            }

        }

    }

    //...

    // Based on the "accel" machine option, set accelerators = kvm

    qemu_apply_legacy_machine_options(machine_opts_dict);

    qemu_apply_machine_options(machine_opts_dict);

    // The process name (argv[0]) is also used to decide which accelerator types are usable

    configure_accelerators(argv[0]);

    // Internally calls do_configure_accelerator --> accel_init_machine

    // accel_init_machine --> kvm_init

    // which initializes the concrete accel class (kvm in this case).

    // At qemu-6.2.0/accel/kvm/kvm-all.c line 3629,

    // kvm_accel_class_init installs the actual init function:

    // ac->init_machine = kvm_init;

    //...

    // qmp_x_exit_preconfig is the call involved in creating the virtual CPUs

    if (!preconfig_requested) {

        qmp_x_exit_preconfig(&error_fatal);

    }

    qemu_init_displays();

    // Accelerator post-setup

    accel_setup_post(current_machine);

    os_setup_post();

    resume_mux_open();

}

在2.1节中有提到,qmp_x_exit_preconfig函数与虚拟cpu的创建有关。
动态调试跟踪分析
qmp_x_exit_preconfig qemu-6.2.0\softmmu\vl.c:2740
--> qemu_init_board qemu-6.2.0\softmmu\vl.c:2652
--> machine_run_board_init qemu-6.2.0\hw\core\machine.c:1181
--> pc_q35_init qemu-6.2.0\hw\i386\pc_q35.c:182
--> x86_cpus_init qemu-6.2.0\hw\i386\x86.c:141
--> x86_cpu_new qemu-6.2.0\hw\i386\x86.c:114

在machine_run_board_init函数中,会根据参数中给定的机器类型,通过machine_class->init(machine)调用对应机器类型的初始化函数;
本例中machine_class->init指向的是pc_q35_init。

在x86_cpu_new中继续虚拟cpu的创建
x86_cpu_new
--> qdev_realize qemu-6.2.0\hw\core\qdev.c:333
--> device_set_realized qemu-6.2.0\hw\core\qdev.c:531
--> x86_cpu_realizefn qemu-6.2.0\target\i386\cpu.c:6447
--> qemu_init_vcpu qemu-6.2.0\softmmu\cpus.c:613
在x86_cpu_realizefn中调用qemu_init_vcpu对创建的虚拟cpu进行初始化

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

/*
 * Abridged excerpt of qemu_init_vcpu() (regions marked //... are elided).
 * The visible part hands the vCPU to the accelerator's create_vcpu_thread
 * callback.
 */
void qemu_init_vcpu(CPUState *cpu)

{

    //...

    // With KVM this ends up in kvm_start_vcpu_thread, which creates the vCPU execution thread

    cpus_accel->create_vcpu_thread(cpu);

    //...

}

/*
 * Abridged excerpt of kvm_start_vcpu_thread() (regions marked //... are
 * elided). The visible part creates a joinable thread for this vCPU and
 * passes cpu as its argument.
 */
static void kvm_start_vcpu_thread(CPUState *cpu)

{

    //...

    // The thread entry point is kvm_vcpu_thread_fn

    qemu_thread_create(cpu->thread, thread_name, kvm_vcpu_thread_fn,

                       cpu, QEMU_THREAD_JOINABLE);

    //...

}

/*
 * Abridged excerpt of the vCPU thread function (regions marked //... are
 * elided). It initializes the vCPU, signals that it has been created, and
 * then keeps calling kvm_cpu_exec() while the CPU can run, until the loop
 * condition (!cpu->unplug || cpu_can_run(cpu)) becomes false. Afterwards it
 * destroys the vCPU and returns NULL.
 */
static void *kvm_vcpu_thread_fn(void *arg)

{

    //...

    // Inside kvm_init_vcpu, kvm_vm_ioctl(s, KVM_CREATE_VCPU, (void *)vcpu_id)

    // obtains the vCPU file descriptor: cpu->kvm_fd = ret;

    r = kvm_init_vcpu(cpu, &error_fatal);

    kvm_init_cpu_signals(cpu);

    /* signal CPU creation */

    cpu_thread_signal_created(cpu);

    qemu_guest_random_seed_thread_part2(cpu->random_seed);

    // do-while loop that repeatedly runs kvm_cpu_exec

    do {

        if (cpu_can_run(cpu)) {

            r = kvm_cpu_exec(cpu);

            if (r == EXCP_DEBUG) {

                cpu_handle_guest_debug(cpu);

            }

        }

        qemu_wait_io_event(cpu);

    } while (!cpu->unplug || cpu_can_run(cpu));

    kvm_destroy_vcpu(cpu);

    cpu_thread_signal_destroyed(cpu);

    qemu_mutex_unlock_iothread();

    rcu_unregister_thread();

    return NULL;

}

/*
 * Abridged excerpt of kvm_cpu_exec() (regions marked //... are elided).
 * Each iteration issues the KVM_RUN ioctl on the vCPU and then dispatches on
 * run->exit_reason. The loop repeats while the handler result ret is 0; the
 * first non-zero ret ends the loop and is returned to the caller.
 */
int kvm_cpu_exec(CPUState *cpu)

{

    //...

    do {

        // kvm_vcpu_ioctl(cpu, KVM_RUN, 0)

        // Execution enters the in-kernel KVM phase here and the guest starts running

        run_ret = kvm_vcpu_ioctl(cpu, KVM_RUN, 0);

        //...

        // Dispatch on the exit reason

        switch (run->exit_reason) {

        case KVM_EXIT_IO:

            DPRINTF("handle_io\n");

            /* Called outside BQL */

            kvm_handle_io(run->io.port, attrs,

                          (uint8_t *)run + run->io.data_offset,

                          run->io.direction,

                          run->io.size,

                          run->io.count);

            ret = 0;

            break;

        default:

            DPRINTF("kvm_arch_handle_exit\n");

            ret = kvm_arch_handle_exit(cpu, run);

            break;

        }

    } while (ret == 0);

    cpu_exec_end(cpu);

    //...

    qatomic_set(&cpu->exit_request, 0);

    return ret;

}

虚拟机的运行就是kvm_cpu_exec中的do { ... } while (ret == 0)循环:循环体主要通过KVM_RUN启动虚拟机,进入kvm的内核处理阶段,并等待返回结果。
当虚拟机退出,会根据返回的原因进行相应处理,最后将处理结果返回。
而kvm_cpu_exec自身也处于vcpu线程函数kvm_vcpu_thread_fn的循环当中,所以虚拟机的运行就是在这两个循环中不断进行。

1. 解析参数,创建虚拟机,创建虚拟cpu,并获取三个最主要的描述符kvmfd、vmfd以及vcpufd。
2. 根据vcpu数量创建具体的执行线程。
3. 在线程中通过KVM_RUN启动虚拟机,进入内核KVM的处理流程。
4. 重复循环KVM_RUN阶段。

linux-5.15.39/virt/kvm/kvm_main.c,line 3764
在kvm_main.c文件3764行找到内核中实际的kvm_vcpu_ioctl函数。

那调用流程就是
kvm_vcpu_ioctl --> kvm_arch_vcpu_ioctl_run
--> vcpu_run --> vcpu_enter_guest
--> static_call(kvm_x86_run)(vcpu)

在arch/x86/kvm/vmx/vmx.c line 7584
定义了一系列架构相关的操作函数
关注退出处理相关的
.handle_exit = vmx_handle_exit,

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

/*
 * Abridged excerpt of vmx_handle_exit() (the region marked //... is elided).
 * The visible part forwards to __vmx_handle_exit() and returns its result.
 */
static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)

{

    int ret = __vmx_handle_exit(vcpu, exit_fastpath);

    //...

    return ret;

}

/*
 * Abridged excerpt of __vmx_handle_exit() (the region marked //... is
 * elided). The visible part turns the basic exit reason into an index bounded
 * by kvm_vmx_max_exit_handlers via array_index_nospec(), then calls the
 * matching entry of kvm_vmx_exit_handlers and returns its result.
 */
static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)

{

    //...

    exit_handler_index = array_index_nospec((u16)exit_reason.basic,

                        kvm_vmx_max_exit_handlers);

    return kvm_vmx_exit_handlers[exit_handler_index](vcpu);

}

 // Table of VM-exit handlers, indexed by EXIT_REASON_* value; __vmx_handle_exit calls into it.

 // A handler returning <= 0 means the exit needs further handling in user space (QEMU).

 // A handler returning > 0 means the kernel has fully handled the exit and the guest can continue.

static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {

    [EXIT_REASON_EXCEPTION_NMI]           = handle_exception_nmi,

    [EXIT_REASON_EXTERNAL_INTERRUPT]      = handle_external_interrupt,

    [EXIT_REASON_TRIPLE_FAULT]            = handle_triple_fault,

    [EXIT_REASON_NMI_WINDOW]          = handle_nmi_window,

    [EXIT_REASON_IO_INSTRUCTION]          = handle_io,

    [EXIT_REASON_CR_ACCESS]               = handle_cr,

    [EXIT_REASON_DR_ACCESS]               = handle_dr,

    [EXIT_REASON_CPUID]                   = kvm_emulate_cpuid,

    [EXIT_REASON_MSR_READ]                = kvm_emulate_rdmsr,

    [EXIT_REASON_MSR_WRITE]               = kvm_emulate_wrmsr,

    [EXIT_REASON_INTERRUPT_WINDOW]        = handle_interrupt_window,

    [EXIT_REASON_HLT]                     = kvm_emulate_halt,

    [EXIT_REASON_INVD]              = kvm_emulate_invd,

    [EXIT_REASON_INVLPG]              = handle_invlpg,

    [EXIT_REASON_RDPMC]                   = kvm_emulate_rdpmc,

    [EXIT_REASON_VMCALL]                  = kvm_emulate_hypercall,

    [EXIT_REASON_VMCLEAR]              = handle_vmx_instruction,

    [EXIT_REASON_VMLAUNCH]              = handle_vmx_instruction,

    [EXIT_REASON_VMPTRLD]              = handle_vmx_instruction,

    [EXIT_REASON_VMPTRST]              = handle_vmx_instruction,

    [EXIT_REASON_VMREAD]              = handle_vmx_instruction,

    [EXIT_REASON_VMRESUME]              = handle_vmx_instruction,

    [EXIT_REASON_VMWRITE]              = handle_vmx_instruction,

    [EXIT_REASON_VMOFF]              = handle_vmx_instruction,

    [EXIT_REASON_VMON]              = handle_vmx_instruction,

    [EXIT_REASON_TPR_BELOW_THRESHOLD]     = handle_tpr_below_threshold,

    [EXIT_REASON_APIC_ACCESS]             = handle_apic_access,

    [EXIT_REASON_APIC_WRITE]              = handle_apic_write,

    [EXIT_REASON_EOI_INDUCED]             = handle_apic_eoi_induced,

    [EXIT_REASON_WBINVD]                  = kvm_emulate_wbinvd,

    [EXIT_REASON_XSETBV]                  = kvm_emulate_xsetbv,

    [EXIT_REASON_TASK_SWITCH]             = handle_task_switch,

    [EXIT_REASON_MCE_DURING_VMENTRY]      = handle_machine_check,

    [EXIT_REASON_GDTR_IDTR]              = handle_desc,

    [EXIT_REASON_LDTR_TR]              = handle_desc,

    [EXIT_REASON_EPT_VIOLATION]          = handle_ept_violation,

    [EXIT_REASON_EPT_MISCONFIG]           = handle_ept_misconfig,

    [EXIT_REASON_PAUSE_INSTRUCTION]       = handle_pause,

    [EXIT_REASON_MWAIT_INSTRUCTION]          = kvm_emulate_mwait,

    [EXIT_REASON_MONITOR_TRAP_FLAG]       = handle_monitor_trap,

    [EXIT_REASON_MONITOR_INSTRUCTION]     = kvm_emulate_monitor,

    [EXIT_REASON_INVEPT]                  = handle_vmx_instruction,

    [EXIT_REASON_INVVPID]                 = handle_vmx_instruction,

    [EXIT_REASON_RDRAND]                  = kvm_handle_invalid_op,

    [EXIT_REASON_RDSEED]                  = kvm_handle_invalid_op,

    [EXIT_REASON_PML_FULL]              = handle_pml_full,

    [EXIT_REASON_INVPCID]                 = handle_invpcid,

    [EXIT_REASON_VMFUNC]              = handle_vmx_instruction,

    [EXIT_REASON_PREEMPTION_TIMER]          = handle_preemption_timer,

    [EXIT_REASON_ENCLS]              = handle_encls,

    [EXIT_REASON_BUS_LOCK]                = handle_bus_lock_vmexit,

};

1. 进入guest世界的准备工作。
2. 正式进入guest执行。
3. 根据guest退出原因进行处理:KVM先自行处理,若KVM不能完全处理,则返回到用户层由QEMU处理。
4. QEMU处理后再次通过KVM_RUN进入到内核KVM流程。

1. 《Intel® 64 and IA-32 Architectures Software Developer's Manual, Volume 3: System Programming Guide》
2. 《系统虚拟化:原理与实现》
3. 《处理器虚拟化技术》
4. https://cdn.kernel.org/pub/linux/kernel/v5.x/linux-5.15.39.tar.xz
5. https://download.qemu.org/qemu-6.2.0.tar.xz


文章来源: https://bbs.pediy.com/thread-274308.htm
如有侵权请联系:admin#unsafe.sh