I started studying live migration to tackle two problems: migration being slow and migration failing often. When a physical host needs a kernel upgrade, every VM on it has to be migrated away one by one, which is slow and time-consuming, and sometimes the migration simply reports failure.
Note that a live-migrated VM is still briefly paused; the pause is just short enough that the impact is small and the service appears uninterrupted. Keep this in mind and don't be misled by marketing claims.
live migration usage
On the destination (dst), start qemu-kvm with the extra option -incoming tcp:0:6666.
On the source (src), enter the qemu monitor and run migrate tcp:$ip:6666; progress can be checked with info migrate.
For post copy, first run migrate_set_capability postcopy-ram on (on both src and dst monitors), then run migrate_start_postcopy once the migration is underway.
In an OpenStack environment nova-compute drives qemu through libvirt; monitor commands can be issued with virsh qemu-monitor-command domain --hmp command.
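Putting it together, a typical session looks roughly like this (the IP address, port and domain name below are placeholders, not from any real setup):
# dst: start qemu-kvm listening for the incoming migration stream
qemu-kvm ... -incoming tcp:0:6666
# src: drive the migration from the HMP monitor
(qemu) migrate_set_capability postcopy-ram on    # set on both src and dst monitors if post-copy may be used
(qemu) migrate -d tcp:192.168.0.2:6666
(qemu) info migrate
(qemu) migrate_start_postcopy                    # optional: switch the running migration to post-copy
# the same monitor commands issued through libvirt on an OpenStack compute node
virsh qemu-monitor-command instance-00000001 --hmp "info migrate"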
live migration internals
QEMU has two concepts, save_vm and load_vm, used by migration, snapshot and so on. save_vm writes the qemu CPU state, memory and device state to an fd, and that fd can be a local file, a socket, etc. load_vm does the opposite: it restores the saved information into a virtual machine. Live migration simply starts a VM on the dst and restores every piece of state sent over from the src VM into its corresponding place. Because memory is large and takes a long time to send, the schemes are split by when memory is sent: pre_copy and post_copy. pre_copy sends memory first; once a threshold is reached it stops the src VM, sends the CPU state and device state, and the dst starts running after receiving them. post_copy sends the CPU state and device state first and stops the src VM; the dst marks all pages as missing and starts running, qemu catches each page fault and requests the page from the src, the src puts the page into its send queue, and once the page arrives the dst tells the kernel the page fault has been handled and continues executing.
In post-copy mode the src is still quietly streaming pages to the dst the whole time; when the dst cannot wait for a particular page, that page simply jumps the queue and is sent first. Overall, live migration has many states, many threads (data handoff with locking and synchronization), and a lot of interaction with KVM (dirty page logging and userfault), so it is error-prone and leaves plenty of room for optimization.
Basics
Migration has to handle CPU state, RAM and device state. CPU state is just a pile of registers and the stack, i.e. the state the VMCS defines. RAM is the big one, covering ROM, PCI memory, DIMMs and so on, and it has to be accessed page by page. Devices are the most varied: registers, queues and so on differ wildly from one device to another, so each device has to implement its own save and load functions and register them with the migration flow.
typedef struct SaveStateEntry {
QTAILQ_ENTRY(SaveStateEntry) entry;
char idstr[256];
int instance_id;
int alias_id;
int version_id;
/* version id read from the stream */
int load_version_id;
int section_id;
/* section id read from the stream */
int load_section_id;
SaveVMHandlers *ops;
const VMStateDescription *vmsd;
void *opaque;
CompatEntry *compat;
int is_ram;
} SaveStateEntry;
typedef struct SaveState {
QTAILQ_HEAD(, SaveStateEntry) handlers;
int global_section_id;
uint32_t len;
const char *name;
uint32_t target_page_bits;
} SaveState;
static SaveState savevm_state = {
.handlers = QTAILQ_HEAD_INITIALIZER(savevm_state.handlers),
.global_section_id = 0,
};
typedef struct SaveVMHandlers {
/* This runs inside the iothread lock. */
SaveStateHandler *save_state;
void (*save_cleanup)(void *opaque);
int (*save_live_complete_postcopy)(QEMUFile *f, void *opaque);
int (*save_live_complete_precopy)(QEMUFile *f, void *opaque);
/* This runs both outside and inside the iothread lock. */
bool (*is_active)(void *opaque);
bool (*has_postcopy)(void *opaque);
/* is_active_iterate
* If it is not NULL then qemu_savevm_state_iterate will skip iteration if
* it returns false. For example, it is needed for only-postcopy-states,
* which needs to be handled by qemu_savevm_state_setup and
* qemu_savevm_state_pending, but do not need iterations until not in
* postcopy stage.
*/
bool (*is_active_iterate)(void *opaque);
/* This runs outside the iothread lock in the migration case, and
* within the lock in the savevm case. The callback had better only
* use data that is local to the migration thread or protected
* by other locks.
*/
int (*save_live_iterate)(QEMUFile *f, void *opaque);
/* This runs outside the iothread lock! */
int (*save_setup)(QEMUFile *f, void *opaque);
void (*save_live_pending)(QEMUFile *f, void *opaque,
uint64_t threshold_size,
uint64_t *res_precopy_only,
uint64_t *res_compatible,
uint64_t *res_postcopy_only);
/* Note for save_live_pending:
* - res_precopy_only is for data which must be migrated in precopy phase
* or in stopped state, in other words - before target vm start
* - res_compatible is for data which may be migrated in any phase
* - res_postcopy_only is for data which must be migrated in postcopy phase
* or in stopped state, in other words - after source vm stop
*
 * Sum of res_precopy_only, res_compatible and res_postcopy_only is the
* whole amount of pending data.
*/
LoadStateHandler *load_state;
int (*load_setup)(QEMUFile *f, void *opaque);
int (*load_cleanup)(void *opaque);
} SaveVMHandlers;
static SaveVMHandlers savevm_ram_handlers = {
.save_setup = ram_save_setup,
.save_live_iterate = ram_save_iterate,
.save_live_complete_postcopy = ram_save_complete,
.save_live_complete_precopy = ram_save_complete,
.has_postcopy = ram_has_postcopy,
.save_live_pending = ram_save_pending,
.load_state = ram_load,
.save_cleanup = ram_save_cleanup,
.load_setup = ram_load_setup,
.load_cleanup = ram_load_cleanup,
};
void ram_mig_init(void)
{
qemu_mutex_init(&XBZRLE.lock);
register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);
}
struct VMStateField {
const char *name;
const char *err_hint;
size_t offset;
size_t size;
size_t start;
int num;
size_t num_offset;
size_t size_offset;
const VMStateInfo *info;
enum VMStateFlags flags;
const VMStateDescription *vmsd;
int version_id;
bool (*field_exists)(void *opaque, int version_id);
};
struct VMStateDescription {
const char *name;
int unmigratable;
int version_id;
int minimum_version_id;
int minimum_version_id_old;
MigrationPriority priority;
LoadStateHandler *load_state_old;
int (*pre_load)(void *opaque);
int (*post_load)(void *opaque, int version_id);
int (*pre_save)(void *opaque);
bool (*needed)(void *opaque);
VMStateField *fields;
const VMStateDescription **subsections;
};
static const VMStateDescription vmstate_e1000;
static void e1000_class_init(ObjectClass *klass, void *data)
{
DeviceClass *dc = DEVICE_CLASS(klass); /* abbreviated: only the vmsd hookup is shown */
dc->vmsd = &vmstate_e1000;
}
int vmstate_register_with_alias_id(DeviceState *dev, int instance_id,
const VMStateDescription *vmsd,
void *base, int alias_id,
int required_for_version,
Error **errp);
/* Returns: 0 on success, -1 on failure */
static inline int vmstate_register(DeviceState *dev, int instance_id,
const VMStateDescription *vmsd,
void *opaque)
{
return vmstate_register_with_alias_id(dev, instance_id, vmsd,
opaque, -1, 0, NULL);
}
The global variable savevm_state is a linked list; RAM and every device put their save and load implementations on nodes of this list,
and at migration time the list is walked and each entry is executed. RAM and devices differ: RAM uses SaveVMHandlers,
while devices use VMStateDescription; a VMStateDescription can be nested and provides the save and load operations for basic data types.
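As a rough illustration of the device side (MyDevState, its fields and mydev_register_migration are made up for this sketch; VMSTATE_UINT32, VMSTATE_END_OF_LIST and vmstate_register are the real QEMU helpers quoted above), describing and registering a simple device's state looks something like this:
/* Hypothetical device state, for illustration only. */
typedef struct MyDevState {
    uint32_t ctrl;
    uint32_t status;
} MyDevState;
/* Declare the fields; VMState generates the save/load for basic types. */
static const VMStateDescription vmstate_mydev = {
    .name = "mydev",
    .version_id = 1,
    .minimum_version_id = 1,
    .fields = (VMStateField[]) {
        VMSTATE_UINT32(ctrl, MyDevState),
        VMSTATE_UINT32(status, MyDevState),
        VMSTATE_END_OF_LIST()
    },
};
/* Hook it into the migration flow: either set dc->vmsd in class_init
 * (as e1000 does above) or call vmstate_register() directly. */
static void mydev_register_migration(DeviceState *dev, MyDevState *s)
{
    vmstate_register(dev, 0 /* instance_id */, &vmstate_mydev, s);
}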
pre_copy
pre_copy handles RAM first: it starts by marking all RAM as dirty pages and then sends RAM in a loop while the vCPUs keep writing to it; on every iteration it fetches from KVM the pages the CPUs have written since, until a certain condition is met, at which point it stops the CPUs, sends the remaining RAM, and finally sends the CPU and device state.
migrate_fd_connect
{ //create a cleanup bottom half; it is scheduled and run when the migration finishes
s->cleanup_bh = qemu_bh_new(migrate_fd_cleanup, s);
//create the migration worker thread
qemu_thread_create(migration_thread)
}
migration_thread
{
qemu_savevm_state_setup
{
ram_save_setup
{
ram_init_all->ram_init_bitmaps
{
ram_list_init_bitmaps
memory_global_dirty_log_start
migration_bitmap_sync->kvm_physical_sync_dirty_bitmap
}
compress_threads_save_setup //create the compression threads
}
}
while(true)
{
qemu_savevm_state_pending->ram_save_pending->migration_bitmap_sync
migration_iteration_run
{
if (!threshold_reached)
qemu_savevm_state_iterate
ram_save_iterate->ram_find_and_save_block
else
qemu_savevm_state_complete_precopy->ram_save_complete
//other device state
vmstate_save_state
}
}
migration_iteration_finish->qemu_bh_schedule(s->cleanup_bh);
}
migrate_fd_cleanup
{
qemu_savevm_state_cleanup->ram_save_cleanup
stop the migration_thread
}
process_incoming_migration_co
{
qemu_loadvm_state
{
qemu_loadvm_state_setup->ram_load_setup->compress_threads_load_setup
//create the do_data_decompress threads, the receive-side counterpart of compress_threads_save_setup
vmstate_load_state
qemu_loadvm_state_main
{
case: qemu_loadvm_section_start_full->vmstate_load_state
case: qemu_loadvm_section_part_end->vmstate_load_state
}
qemu_loadvm_state_cleanup
}
process_incoming_migration_bh
}
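The heart of the sender loop above is the convergence check in migration_iteration_run. As a simplified sketch, assuming threshold is the allowed downtime multiplied by the measured bandwidth (names and exact signatures here are illustrative, not the literal QEMU code):
/* Simplified sketch of one pre-copy iteration decision; illustrative only. */
static void precopy_iteration_sketch(QEMUFile *f, uint64_t threshold)
{
    uint64_t pend_pre = 0, pend_compat = 0, pend_post = 0;
    /* ask every registered handler how much data it still has pending */
    qemu_savevm_state_pending(f, threshold, &pend_pre, &pend_compat, &pend_post);
    if (pend_pre + pend_compat + pend_post > threshold) {
        /* not converged: send another round of dirty pages while the guest runs */
        qemu_savevm_state_iterate(f, false);
    } else {
        /* converged: stop the guest, flush the remaining RAM and the device state */
        vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
        qemu_savevm_state_complete_precopy(f, false, false);
    }
}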
post_copy
Why is post-copy needed? Because pre-copy may never converge: the VM runs fast and keeps producing dirty pages, the fd is comparatively slow and can never finish sending them, so the predefined condition is never met. Post-copy sends the CPU and device state first, stops execution on the src, and then sends RAM at its own pace; whenever the dst finds a page missing, it requests that page from the src.
migrate_fd_connect
{ //create a cleanup bottom half; it is scheduled and run when the migration finishes
s->cleanup_bh = qemu_bh_new(migrate_fd_cleanup, s);
/****************************************/
//for postcopy, create the thread that handles page requests received from dst
open_return_path_on_source->qemu_thread_create(source_return_path_thread)
/***************************************/
//create the migration worker thread
qemu_thread_create(migration_thread)
}
migration_thread
{
/******************************/
qemu_savevm_send_open_return_path
qemu_savevm_send_ping
qemu_savevm_send_postcopy_advise
/******************************/
qemu_savevm_state_setup
{
ram_save_setup
{
ram_init_all->ram_init_bitmaps
{
ram_list_init_bitmaps
memory_global_dirty_log_start
migration_bitmap_sync->kvm_physical_sync_dirty_bitmap
}
compress_threads_save_setup //create the compression threads
}
}
while(true)
{
qemu_savevm_state_pending->ram_save_pending->migration_bitmap_sync
migration_iteration_run
{
if (!threshold_reached && !in_postcopy)
/******************************/
if (postcopy_start() && first)
return;
/*****************************/
qemu_savevm_state_iterate
ram_save_iterate
else
migration_completion
{ if pre_copy
qemu_savevm_state_complete_precopy->ram_save_complete
//other device state
vmstate_save_state
/******************************/
else if post_copy
qemu_savevm_state_complete_postcopy->ram_save_complete
/******************************/
}
}
}
migration_iteration_finish->qemu_bh_schedule(s->cleanup_bh);
}
migrate_fd_cleanup
{
qemu_savevm_state_cleanup->ram_save_cleanup
stop the migration_thread
}
process_incoming_migration_co
{
qemu_loadvm_state
{
qemu_loadvm_state_setup->ram_load_setup->compress_threads_load_setup
//create the do_data_decompress threads, the receive-side counterpart of compress_threads_save_setup
vmstate_load_state
qemu_loadvm_state_main
{
case: qemu_loadvm_section_start_full->vmstate_load_state
case: qemu_loadvm_section_part_end->vmstate_load_state
/******************************/
//this case is only taken in post_copy
case: loadvm_process_command
{
case: loadvm_handle_cmd_packaged
{
//recursive call; only the first two cases are taken here
qemu_loadvm_state_main
}
case: loadvm_postcopy_handle_advise
//sets up receiving of the pages returned for page requests
case: loadvm_postcopy_handle_listen
{
//receive page faults from the kernel, then send page requests to src
qemu_thread_create(postcopy_ram_fault_thread)
//receive the pages sent by src
qemu_thread_create(postcopy_ram_listen_thread)
}
case: loadvm_postcopy_handle_run->loadvm_postcopy_handle_run_bh
//src keeps its page-sending thread running; src has stopped executing and the dst CPUs now run
case: loadvm_postcopy_ram_handle_discard
}
/******************************/
}
qemu_loadvm_state_cleanup
}
process_incoming_migration_bh
}
postcopy_ram_listen_thread
{
//only the first two cases are taken
qemu_loadvm_state_main
qemu_loadvm_state_cleanup
}
Compared with pre_copy, post_copy adds the following steps:
src:
source_return_path_thread
qemu_savevm_send_open_return_path
qemu_savevm_send_ping
qemu_savevm_send_postcopy_advise
postcopy_start
qemu_savevm_state_complete_postcopy
dst:
loadvm_process_command stays in sync with src; src tells dst which step to enter.
postcopy_ram_fault_thread (uses the USERFAULT mechanism so qemu learns about page faults)->migrate_send_rp_req_pages
postcopy_ram_listen_thread
//restore ram and notify the kernel through userfaultfd that the page fault has been handled; there are further fds for notifying other processes that share the memory, etc.
ram_load->ram_load_postcopy->postcopy_place_page->qemu_ufd_copy_ioctl
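The userfaultfd usage behind postcopy_ram_fault_thread and qemu_ufd_copy_ioctl can be sketched with plain Linux API calls (this is generic userfaultfd usage, not the literal QEMU code; request_page_from_src is a hypothetical placeholder for the return-path request):
#include <fcntl.h>
#include <linux/userfaultfd.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>
void *request_page_from_src(unsigned long addr);   /* hypothetical: ask src for the page and wait */
/* Generic userfaultfd fault loop, roughly what postcopy relies on. */
static void postcopy_fault_loop(void *guest_ram, size_t ram_len, size_t page_size)
{
    int uffd = syscall(__NR_userfaultfd, O_CLOEXEC);
    struct uffdio_api api = { .api = UFFD_API };
    ioctl(uffd, UFFDIO_API, &api);
    /* report missing-page faults on the whole guest RAM range */
    struct uffdio_register reg = {
        .range = { .start = (unsigned long)guest_ram, .len = ram_len },
        .mode  = UFFDIO_REGISTER_MODE_MISSING,
    };
    ioctl(uffd, UFFDIO_REGISTER, &reg);
    for (;;) {
        struct uffd_msg msg;
        if (read(uffd, &msg, sizeof(msg)) <= 0 || msg.event != UFFD_EVENT_PAGEFAULT) {
            continue;
        }
        unsigned long addr = msg.arg.pagefault.address & ~(page_size - 1);
        void *page_buf = request_page_from_src(addr);
        /* place the page and wake the faulting vCPU in one ioctl */
        struct uffdio_copy copy = {
            .dst = addr,
            .src = (unsigned long)page_buf,
            .len = page_size,
            .mode = 0,
        };
        ioctl(uffd, UFFDIO_COPY, &copy);
    }
}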
pre_copy vs post_copy
pre_copy may fail to converge and then simply fails after a certain time; the non-convergence comes from the CPUs dirtying pages faster than they can be sent, which is why people came up with slowing the CPUs down (in QEMU this is what the auto-converge capability does: throttle the guest vCPUs). post_copy's problem is that some failures cannot be recovered from: if a pre_copy migration fails, the VM can simply resume on the src, but post_copy has already modified state on the dst that can no longer be synchronized back to the src, so once it fails there is no way to recover.
Migration failure cases
qemu version mismatch
pre_copy failures on src:
sending on the fd fails
failure to converge, leading to a timeout
the save function registered by any device fails
vm stop fails, e.g. data that still needs to be written cannot be flushed to disk
the vm is paused during the migration
pre_copy failures on dst:
receiving data from the fd fails
the pre_load or post_load of any device fails (does not return 0); I have seen virtio-net error out here because the backend did not support a feature
post_copy failures on src:
all of the pre_copy src cases
the length of data sent exceeds the maximum length
post_copy failures on dst:
a received command is wrong, or the data length is wrong
opening the socket fails
post_copy is not supported, i.e. the kernel cannot deliver page faults to user space
page size mismatch
post_copy has a fixed sequence of steps, and the steps src synchronizes to dst arrive in the wrong order
notify fails
Migration also has many properties and capabilities, and just turning one of them on or off can make the migration fail.
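They can be inspected and adjusted from the monitor before starting the migration; for example (parameter names and values vary with the QEMU version, so treat these as illustrative):
(qemu) info migrate_capabilities
(qemu) info migrate_parameters
(qemu) migrate_set_capability xbzrle off
(qemu) migrate_set_parameter downtime-limit 300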
A real-world case
A VM migration failed; the log was as follows:
qemuMonitorIORead:610 : Unable to read from monitor: Connection reset by peer
qemuProcessReportLogError:1912 : internal error: qemu unexpectedly closed the monitor: 2019-12-04 06:50:14.598 0000: Domain id=17 is tainted: host-cpu
qemu-kvm: get_pci_config_device: Bad config data: i=0x34 read: 98 device: 40 cmask: ff wmask: 0 w1cmask:0
qemu-kvm: error while loading state for instance 0x0 of device '0000:00:03.0/virtio-net'
qemu-kvm: load of migration failed: Invalid argument
Let's look at the code that prints this log
static const VMStateDescription vmstate_virtio_net = {
.name = "virtio-net",
.minimum_version_id = VIRTIO_NET_VM_VERSION,
.version_id = VIRTIO_NET_VM_VERSION,
.fields = (VMStateField[]) {
VMSTATE_VIRTIO_DEVICE,
VMSTATE_END_OF_LIST()
},
.pre_save = virtio_net_pre_save,
};
#define VMSTATE_VIRTIO_DEVICE \
{ \
    .name = "virtio", \
    .info = &virtio_vmstate_info, \
    .flags = VMS_SINGLE, \
}
const VMStateInfo virtio_vmstate_info = {
.name = "virtio",
.get = virtio_device_get,
.put = virtio_device_put,
};
vmstate_load_state->(field->info->get)/virtio_device_get->virtio_load->load_config/virtio_pci_load_config->pci_device_load->vmstate_load_state->get_pci_config_device
static int get_pci_config_device(QEMUFile *f, void *pv, size_t size,
VMStateField *field)
{
PCIDevice *s = container_of(pv, PCIDevice, config);
PCIDeviceClass *pc = PCI_DEVICE_GET_CLASS(s);
uint8_t *config;
int i;
assert(size == pci_config_size(s));
config = g_malloc(size);
qemu_get_buffer(f, config, size);
for (i = 0; i < size; i++) {
if ((config[i] ^ s->config[i]) &
s->cmask[i] & ~s->wmask[i] & ~s->w1cmask[i]) {
error_report("%s: Bad config data: i=0x%x read: %x device: %x "
"cmask: %x wmask: %x w1cmask:%x", __func__,
i, config[i], s->config[i],
s->cmask[i], s->wmask[i], s->w1cmask[i]);
g_free(config);
return -EINVAL;
}
}
return 0;
}
So it is fairly safe to conclude that the virtio-net-pci config space differs between src and dst (offset 0x34 is the PCI capabilities pointer, so the two sides ended up with different capability layouts). The qemu versions are identical and the command lines are identical, so it can only be a backend problem: the virtio-net backend here is the kernel vhost, virtio features and the like are negotiated between qemu and kernel vhost, and the two physical machines were running different kernel versions, which caused the mismatch.
Summary
There is a lot of page-handling code and I don't yet understand all the details; I'll write about them once I do.
Open-source projects come with little documentation, and getting someone to really understand the code feels exhausting. I'm not good at writing docs either, so this is just a quick note to reinforce my own grasp of the code; it only scratches the surface, so read it for what it is. To really understand the code you have to grind at it persistently, like courting someone; there is no shortcut.