qemu live migration code analysis


I started studying live migration to fix two problems: migration being slow, and migration failing frequently. When a physical machine needs a kernel upgrade, the VMs on it have to be migrated away one by one, which is slow and time-consuming, and sometimes it just reports a migration failure.

A live-migrated VM does still pause briefly; the pause is just short enough that the impact is small and the service feels uninterrupted. Keep that in mind and don't be misled by the marketing.

live migration usage

Start qemu-kvm on dst with the extra argument -incoming tcp:0:6666, so the destination waits for the incoming migration.

On src, enter the qemu monitor and run migrate tcp:$ip:6666; use info migrate to check the status.

For post copy, first run migrate_set_capability postcopy-ram on (on both sides), then start the migration and run migrate_start_postcopy.

In an OpenStack environment, nova-compute drives qemu through libvirt; the same monitor commands can be issued with virsh qemu-monitor-command domain --hmp command.
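Putting the commands together, a minimal manual run looks roughly like this (the IP, the port and the -d detach flag are illustrative):

# dst: start qemu-kvm waiting for the incoming migration
qemu-kvm ... -incoming tcp:0:6666

# src: in the qemu monitor
(qemu) migrate -d tcp:$dst_ip:6666
(qemu) info migrate

# post copy: enable the capability on both sides before migrating,
# then switch over once the migration is under way
(qemu) migrate_set_capability postcopy-ram on
(qemu) migrate_start_postcopy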

live migration principles

qemu has two core notions, save_vm and load_vm, used by migration, snapshot and others. save_vm writes the qemu cpu state, mem and device state to an fd; the fd can be a local file, a socket and so on. load_vm is the inverse: it restores the saved state into a virtual machine. Live migration therefore means starting a VM on dst and restoring every piece of state the src VM sends over into the right place. Because mem is large and takes long to send, two modes exist depending on when it is sent: pre_copy and post_copy. pre_copy sends mem first; once a threshold is reached, it stops the src VM, sends the cpu state and device state, and dst starts running after receiving them. post_copy sends the cpu state and device state first and stops the src VM; dst marks all pages invalid and starts running, qemu catches each page fault and requests the page from src, src puts that page on the send queue, and dst, once the page arrives, tells the kernel the page fault is handled so execution can continue.

In postcopy mode src still keeps quietly streaming pages to dst in the background; when dst can't wait for some page, that page just jumps the queue and gets sent first. Overall, live migration has a lot of states, a lot of threads (data handoff needs locking and synchronization), and a lot of interaction with kvm (dirty page logging and userfault), so it is error-prone and leaves plenty of room for optimization.

Basics

Migration has to handle cpu state, ram and device state. The cpu state is a pile of registers, stack and so on, i.e. the state the VMCS defines. ram is the bulk, covering ROM, PCI memory, DIMMs and more, all accessed page by page. Devices are the varied part: registers, queues, all wildly different, so every device has to implement its own save and load functions and register them with the migration machinery. The relevant data structures:

typedef struct SaveStateEntry {
    QTAILQ_ENTRY(SaveStateEntry) entry;
    char idstr[256];
    int instance_id;
    int alias_id;
    int version_id;
    /* version id read from the stream */
    int load_version_id;
    int section_id;
    /* section id read from the stream */
    int load_section_id;
    SaveVMHandlers *ops;
    const VMStateDescription *vmsd;
    void *opaque;
    CompatEntry *compat;
    int is_ram;
} SaveStateEntry;

typedef struct SaveState {
    QTAILQ_HEAD(, SaveStateEntry) handlers;
    int global_section_id;
    uint32_t len;
    const char *name;
    uint32_t target_page_bits;
} SaveState;

static SaveState savevm_state = {
    .handlers = QTAILQ_HEAD_INITIALIZER(savevm_state.handlers),
    .global_section_id = 0,
};
typedef struct SaveVMHandlers {
    /* This runs inside the iothread lock.  */
    SaveStateHandler *save_state;

    void (*save_cleanup)(void *opaque);
    int (*save_live_complete_postcopy)(QEMUFile *f, void *opaque);
    int (*save_live_complete_precopy)(QEMUFile *f, void *opaque);

    /* This runs both outside and inside the iothread lock.  */
    bool (*is_active)(void *opaque);
    bool (*has_postcopy)(void *opaque);

    /* is_active_iterate
     * If it is not NULL then qemu_savevm_state_iterate will skip iteration if
     * it returns false. For example, it is needed for only-postcopy-states,
     * which needs to be handled by qemu_savevm_state_setup and
     * qemu_savevm_state_pending, but do not need iterations until not in
     * postcopy stage.
     */
    bool (*is_active_iterate)(void *opaque);

    /* This runs outside the iothread lock in the migration case, and
     * within the lock in the savevm case.  The callback had better only
     * use data that is local to the migration thread or protected
     * by other locks.
     */
    int (*save_live_iterate)(QEMUFile *f, void *opaque);

    /* This runs outside the iothread lock!  */
    int (*save_setup)(QEMUFile *f, void *opaque);
    void (*save_live_pending)(QEMUFile *f, void *opaque,
                              uint64_t threshold_size,
                              uint64_t *res_precopy_only,
                              uint64_t *res_compatible,
                              uint64_t *res_postcopy_only);
    /* Note for save_live_pending:
     * - res_precopy_only is for data which must be migrated in precopy phase
     *     or in stopped state, in other words - before target vm start
     * - res_compatible is for data which may be migrated in any phase
     * - res_postcopy_only is for data which must be migrated in postcopy phase
     *     or in stopped state, in other words - after source vm stop
     *
     * Sum of res_precopy_only, res_compatible and res_postcopy_only is the
     * whole amount of pending data.
     */


    LoadStateHandler *load_state;
    int (*load_setup)(QEMUFile *f, void *opaque);
    int (*load_cleanup)(void *opaque);
} SaveVMHandlers;

static SaveVMHandlers savevm_ram_handlers = {
    .save_setup = ram_save_setup,
    .save_live_iterate = ram_save_iterate,
    .save_live_complete_postcopy = ram_save_complete,
    .save_live_complete_precopy = ram_save_complete,
    .has_postcopy = ram_has_postcopy,
    .save_live_pending = ram_save_pending,
    .load_state = ram_load,
    .save_cleanup = ram_save_cleanup,
    .load_setup = ram_load_setup,
    .load_cleanup = ram_load_cleanup,
};

void ram_mig_init(void)
{
    qemu_mutex_init(&XBZRLE.lock);
    register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);
}
struct VMStateField {
    const char *name;
    const char *err_hint;
    size_t offset;
    size_t size;
    size_t start;
    int num;
    size_t num_offset;
    size_t size_offset;
    const VMStateInfo *info;
    enum VMStateFlags flags;
    const VMStateDescription *vmsd;
    int version_id;
    bool (*field_exists)(void *opaque, int version_id);
};

struct VMStateDescription {
    const char *name;
    int unmigratable;
    int version_id;
    int minimum_version_id;
    int minimum_version_id_old;
    MigrationPriority priority;
    LoadStateHandler *load_state_old;
    int (*pre_load)(void *opaque);
    int (*post_load)(void *opaque, int version_id);
    int (*pre_save)(void *opaque);
    bool (*needed)(void *opaque);
    VMStateField *fields;
    const VMStateDescription **subsections;
};

static const VMStateDescription vmstate_e1000;
static void e1000_class_init(ObjectClass *klass, void *data)
{
    DeviceClass *dc = DEVICE_CLASS(klass);
    dc->vmsd = &vmstate_e1000;
}
int vmstate_register_with_alias_id(DeviceState *dev, int instance_id,
                                   const VMStateDescription *vmsd,
                                   void *base, int alias_id,
                                   int required_for_version,
                                   Error **errp);

/* Returns: 0 on success, -1 on failure */
static inline int vmstate_register(DeviceState *dev, int instance_id,
                                   const VMStateDescription *vmsd,
                                   void *opaque)
{
    return vmstate_register_with_alias_id(dev, instance_id, vmsd,
                                          opaque, -1, 0, NULL);
}

The global variable savevm_state holds a linked list; ram and each device hang their save and load implementations off a node of that list, and migration just walks the list and invokes them. ram and devices differ: ram uses SaveVMHandlers while devices use VMStateDescription, and a VMStateDescription can be nested, providing save and load down to the basic data types.
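To make this concrete, here is a hedged sketch of a VMStateDescription for a made-up device (the device and its fields are invented for illustration; VMSTATE_UINT32, VMSTATE_UINT8_ARRAY and VMSTATE_END_OF_LIST are the real field macros):

typedef struct DemoDeviceState {
    uint32_t ctrl;
    uint8_t regs[16];
} DemoDeviceState;

static const VMStateDescription vmstate_demo_device = {
    .name = "demo-device",
    .version_id = 1,
    .minimum_version_id = 1,
    .fields = (VMStateField[]) {
        /* each macro records name/offset/size/type so that the generic
         * vmstate_save_state/vmstate_load_state can walk the struct */
        VMSTATE_UINT32(ctrl, DemoDeviceState),
        VMSTATE_UINT8_ARRAY(regs, DemoDeviceState, 16),
        VMSTATE_END_OF_LIST()
    },
};

It would be hooked up through dc->vmsd in class_init, as e1000 does above, or registered with vmstate_register().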

pre_copy

pre_copy handles ram first. It begins by marking all of ram dirty, then sends ram in a loop while the vCPUs keep writing to it; every iteration fetches from kvm the set of pages the CPU has written since the last pass, until a stop condition is met. At that point the vCPUs are stopped, the remaining ram is sent, and finally the cpu and device state.
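The step "fetch from kvm the pages the CPU has written" bottoms out in the KVM_GET_DIRTY_LOG ioctl. Below is a minimal sketch of what kvm_physical_sync_dirty_bitmap does for one memory slot; error handling and the MemoryListener plumbing are omitted, so treat it as a simplification rather than QEMU's actual function:

#include <linux/kvm.h>
#include <string.h>
#include <sys/ioctl.h>

/* fetch and clear the dirty bitmap of one kvm memory slot:
 * one bit per guest page written since the previous call;
 * the caller allocates the bitmap to match the slot size */
static int sync_dirty_bitmap(int vm_fd, unsigned int slot, void *bitmap)
{
    struct kvm_dirty_log log;

    memset(&log, 0, sizeof(log));
    log.slot = slot;
    log.dirty_bitmap = bitmap;   /* filled in by the kernel */
    return ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log);
}

QEMU's actual call flow on the src and dst sides: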

migrate_fd_connect
{   // create the cleanup bh; it gets scheduled and runs when migration finishes
    s->cleanup_bh = qemu_bh_new(migrate_fd_cleanup, s);
    // create the migration worker thread
    qemu_thread_create(migration_thread)
}

migration_thread
{
    qemu_savevm_state_setup
    {
        ram_save_setup
        {
            ram_init_all->ram_init_bitmaps
                          {
                              ram_list_init_bitmaps
                              memory_global_dirty_log_start
                              migration_bitmap_sync->kvm_physical_sync_dirty_bitmap
                          }              
            compress_threads_save_setup    // create the compression threads
         }
    }
    while(true) 
    {
        qemu_savevm_state_pending->ram_save_pending->migration_bitmap_sync
        migration_iteration_run
        {
             if (pending_size >= threshold_size)
                 qemu_savevm_state_iterate
                     ram_save_iterate->ram_find_and_save_block
              else
                  qemu_savevm_state_complete_precopy->ram_save_complete
                  // then the other device states
                  vmstate_save_state
        }
    }

    migration_iteration_finish->qemu_bh_schedule(s->cleanup_bh);
}

migrate_fd_cleanup
{
    qemu_savevm_state_cleanup->ram_save_cleanup
    stop the migration_thread
}


process_incoming_migration_co
{
    qemu_loadvm_state
     {
          qemu_loadvm_state_setup->ram_load_setup->compress_threads_load_setup
          // create the do_data_decompress threads, the dst counterpart of the src compression threads
          vmstate_load_state
          qemu_loadvm_state_main
          {
               case:    qemu_loadvm_section_start_full->vmstate_load_state
               case:    qemu_loadvm_section_part_end->vmstate_load_state
          }
          qemu_loadvm_state_cleanup
      }
    process_incoming_migration_bh
}
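The stop condition checked in migration_iteration_run above boils down to: can the remaining dirty data be pushed out within the allowed downtime at the measured bandwidth? A hedged sketch of the arithmetic (the real code lives in migration_update_counters and also tracks expected downtime):

#include <stdbool.h>
#include <stdint.h>

/* true  -> stop the vCPUs and finish precopy in one last burst;
 * false -> keep iterating, resending pages the guest re-dirtied */
static bool precopy_can_complete(uint64_t pending_bytes,
                                 uint64_t bandwidth_bytes_per_ms,
                                 uint64_t downtime_limit_ms)
{
    uint64_t threshold_size = bandwidth_bytes_per_ms * downtime_limit_ms;
    return pending_bytes <= threshold_size;
}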

post_copy

Why is postcopy needed? Because pre_copy may never converge: the VM runs at full speed and keeps producing dirty pages, while the fd is comparatively slow and can never finish sending, so the completion condition is never met. postcopy instead sends the cpu and device state first, stops the src VM, and then sends ram at its own pace; whenever dst finds a page missing, it requests that page from src. The flow is below, with the post_copy-specific parts fenced between the /***/ comment markers:

migrate_fd_connect
{   // create the cleanup bh; it gets scheduled and runs when migration finishes
    s->cleanup_bh = qemu_bh_new(migrate_fd_cleanup, s);
    /****************************************/
    // for postcopy: create the thread that handles page requests from dst
    open_return_path_on_source->qemu_thread_create(source_return_path_thread)
    /***************************************/
    // create the migration worker thread
    qemu_thread_create(migration_thread)
}
migration_thread
{
    /******************************/
    qemu_savevm_send_open_return_path
    qemu_savevm_send_ping
    qemu_savevm_send_postcopy_advise
    /******************************/
    qemu_savevm_state_setup
    {
        ram_save_setup
        {
            ram_init_all->ram_init_bitmaps
                          {
                              ram_list_init_bitmaps
                              memory_global_dirty_log_start
                              migration_bitmap_sync->kvm_physical_sync_dirty_bitmap
                          }              
            compress_threads_save_setup    // create the compression threads
         }
    }
    while(true) 
    {
        qemu_savevm_state_pending->ram_save_pending->migration_bitmap_sync
        
        migration_iteration_run
        {
             if (pending_size >= threshold_size)
                 /******************************/
                 // switch over once migrate_start_postcopy has been requested
                 if (!in_postcopy && start_postcopy)
                     postcopy_start(); return;
                 /*****************************/
                 qemu_savevm_state_iterate
                     ram_save_iterate
              else
                  migration_completion
                  {   if pre_copy
                          qemu_savevm_state_complete_precopy->ram_save_complete
                          // then the other device states
                          vmstate_save_state
                      /******************************/
                      else if post_copy
                          qemu_savevm_state_complete_postcopy->ram_save_complete
                      /******************************/
                  }
        }
    }
    migration_iteration_finish->qemu_bh_schedule(s->cleanup_bh);
}

migrate_fd_cleanup
{
    qemu_savevm_state_cleanup->ram_save_cleanup
    stop the migration_thread
}


process_incoming_migration_co
{
    qemu_loadvm_state
     {
          qemu_loadvm_state_setup->ram_load_setup->compress_threads_load_setup
          // create the do_data_decompress threads, the dst counterpart of the src compression threads
          vmstate_load_state
          qemu_loadvm_state_main
          {
               case:    qemu_loadvm_section_start_full->vmstate_load_state
               case:    qemu_loadvm_section_part_end->vmstate_load_state
               /******************************/
               // this case is only reached with post_copy
               case:    loadvm_process_command
               {
                    case:    loadvm_handle_cmd_packaged
                    {
                        // recursion: only the first two cases run here
                        qemu_loadvm_state_main
                    }
                    case:    loadvm_postcopy_handle_advise
                    // sets up receiving the requested pages that come back
                    case:    loadvm_postcopy_handle_listen
                    {
                        // receive page faults from the kernel, then send page requests to src
                        qemu_thread_create(postcopy_ram_fault_thread)
                        // receive the pages sent by src
                        qemu_thread_create(postcopy_ram_listen_thread)
                    }
                    case:    loadvm_postcopy_handle_run->loadvm_postcopy_handle_run_bh
                    // handoff point: src has started its page-sending thread and stopped; dst's vCPUs start executing
                    case:    loadvm_postcopy_ram_handle_discard
               }
               /******************************/
          }
          qemu_loadvm_state_cleanup
      }
    process_incoming_migration_bh
}
postcopy_ram_listen_thread
{ 
    // only the first two cases run here
    qemu_loadvm_state_main
    qemu_loadvm_state_cleanup
}

Compared with pre_copy, post_copy adds the following steps.

src:

source_return_path_thread
qemu_savevm_send_open_return_path
qemu_savevm_send_ping
qemu_savevm_send_postcopy_advise
postcopy_start
qemu_savevm_state_complete_postcopy

dst:

loadvm_process_command    // stays in lockstep with src; src tells dst which step to enter
postcopy_ram_fault_thread (uses the USERFAULT mechanism so qemu learns about page faults)->migrate_send_rp_req_pages
postcopy_ram_listen_thread
// restore ram; notify the kernel via userfaultfd that the page fault is handled
// (plus other fds to notify other processes sharing the memory, and so on)
ram_load->ram_load_postcopy->postcopy_place_page->qemu_ufd_copy_ioctl
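The USERFAULT mechanism above is the kernel's userfaultfd: dst registers the guest RAM in missing-page mode, the fault thread reads fault events from the fd, and once the requested page arrives it is installed with UFFDIO_COPY, which also wakes the faulting vCPU; that ioctl is the heart of qemu_ufd_copy_ioctl. A minimal single-threaded sketch of the three steps (no error handling; the helper names are mine, not QEMU's; addr must already be mmap'ed):

#include <fcntl.h>
#include <linux/userfaultfd.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>

/* 1. create the userfaultfd and register a RAM range for
 *    missing-page tracking (roughly what dst sets up at listen time) */
static int uffd_register(void *addr, size_t len)
{
    int uffd = syscall(__NR_userfaultfd, O_CLOEXEC);
    struct uffdio_api api = { .api = UFFD_API };
    struct uffdio_register reg = {
        .range = { .start = (unsigned long)addr, .len = len },
        .mode  = UFFDIO_REGISTER_MODE_MISSING,
    };

    ioctl(uffd, UFFDIO_API, &api);
    ioctl(uffd, UFFDIO_REGISTER, &reg);
    return uffd;
}

/* 2. the fault thread blocks in read(); each message carries the
 *    faulting address, which dst forwards to src as a page request */
static unsigned long uffd_next_fault(int uffd)
{
    struct uffd_msg msg;

    read(uffd, &msg, sizeof(msg));   /* blocks until a vCPU faults */
    return (unsigned long)msg.arg.pagefault.address;
}

/* 3. when the page arrives from src, copy it in and wake the vCPU
 *    (the core of qemu_ufd_copy_ioctl) */
static void uffd_place_page(int uffd, void *host_addr, void *page,
                            size_t pagesize)
{
    struct uffdio_copy copy = {
        .dst = (unsigned long)host_addr,
        .src = (unsigned long)page,
        .len = pagesize,
    };

    ioctl(uffd, UFFDIO_COPY, &copy);
}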

pre_copy vs post_copy

pre_copy may fail to converge and simply times out after a while; non-convergence means the CPU dirties pages faster than they can be sent, which is why people came up with making the CPU run slower (see the sketch below). post_copy's problem is that some failures are unrecoverable: if pre_copy fails, the VM just keeps running on src, but post_copy has already modified state on dst that can no longer be synchronized back to src, so a failure there cannot be recovered from.
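"Making the CPU run slower" is the auto-converge capability: when a dirty-sync round shows no progress, qemu throttles the vCPUs in growing steps until the dirtying rate drops below what the link can carry. From the monitor (the throttle values are examples, and parameter names vary across qemu versions):

(qemu) migrate_set_capability auto-converge on
(qemu) migrate_set_parameter cpu-throttle-initial 20
(qemu) migrate_set_parameter cpu-throttle-increment 10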

Migration failure cases

qemu version mismatch

pre_copy failures on src:

the fd send fails
failure to converge leads to a timeout
the save function registered by any device fails
vm stop fails, e.g. data that must be written cannot reach the disk
the vm gets paused during the migration

pre_copy failures on dst:

receiving data from the fd fails
the pre_load or post_load of any device fails (returns nonzero somewhere); I have seen virtio-net report an error here because the backend did not support a feature

post_copy failures on src:

everything that can fail for pre_copy on src
the length of the data sent exceeds the maximum length

post_copy failures on dst:

a received command is wrong, or its data length is wrong
opening the socket fails
post_copy is not supported: the kernel cannot deliver page faults to userspace
pagesize mismatch
post_copy has a fixed sequence of steps, and src sent the step transitions to dst out of order
notify fails

migration also has many properties and capabilities; merely turning one on or off can make a migration fail.
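They can be inspected and tuned from the monitor before migrating; for instance (HMP syntax, values are examples; downtime-limit is in milliseconds, and older qemu uses migrate_set_downtime/migrate_set_speed instead):

(qemu) info migrate_capabilities
(qemu) info migrate_parameters
(qemu) migrate_set_parameter downtime-limit 500
(qemu) migrate_set_parameter max-bandwidth 100m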

A real-world case

A VM migration failed, with the following log:

qemuMonitorIORead:610 : Unable to read from monitor: Connection reset by peer
qemuProcessReportLogError:1912 : internal error: qemu unexpectedly closed the monitor: 2019-12-04 06:50:14.598 0000: Domain id=17 is tainted: host-cpu
qemu-kvm: get_pci_config_device: Bad config data: i=0x34 read: 98 device: 40 cmask: ff wmask: 0 w1cmask:0
qemu-kvm: error while loading state for instance 0x0 of device '0000:00:03.0/virtio-net'
qemu-kvm: load of migration failed: Invalid argument

Let's analyze the code that prints this log:

static const VMStateDescription vmstate_virtio_net = {
    .name = "virtio-net",
    .minimum_version_id = VIRTIO_NET_VM_VERSION,
    .version_id = VIRTIO_NET_VM_VERSION,
    .fields = (VMStateField[]) {
        VMSTATE_VIRTIO_DEVICE,
        VMSTATE_END_OF_LIST()
    },
    .pre_save = virtio_net_pre_save,
};
#define VMSTATE_VIRTIO_DEVICE                 \
    {                                         \
        .name = "virtio",                     \
        .info = &virtio_vmstate_info,         \
        .flags = VMS_SINGLE,                  \
    }

const VMStateInfo  virtio_vmstate_info = {
    .name = "virtio",
    .get = virtio_device_get,
    .put = virtio_device_put,
};

vmstate_load_state->(field->info->get)/virtio_device_get->virtio_load->load_config/virtio_pci_load_config->pci_device_load->vmstate_load_state->get_pci_config_device

static int get_pci_config_device(QEMUFile *f, void *pv, size_t size,
                                 VMStateField *field)
{
    PCIDevice *s = container_of(pv, PCIDevice, config);
    PCIDeviceClass *pc = PCI_DEVICE_GET_CLASS(s);
    uint8_t *config;
    int i;

    assert(size == pci_config_size(s));
    config = g_malloc(size);

    qemu_get_buffer(f, config, size);
    for (i = 0; i < size; ++i) {
        /* only bits that must agree can trip this check: covered by cmask,
         * not guest-writable (~wmask) and not write-1-to-clear (~w1cmask) */
        if ((config[i] ^ s->config[i]) &
            s->cmask[i] & ~s->wmask[i] & ~s->w1cmask[i]) {
            error_report("%s: Bad config data: i=0x%x read: %x device: %x "
                         "cmask: %x wmask: %x w1cmask:%x", __func__,
                         i, config[i], s->config[i],
                         s->cmask[i], s->wmask[i], s->w1cmask[i]);
            g_free(config);
            return -EINVAL;
        }
    }
    return 0;
}

So it is practically certain that the virtio-net-pci config space differs between src and dst. The qemu versions and command lines were identical, so the backend was the only suspect: the virtio-net backend here is the in-kernel vhost, virtio features are negotiated between qemu and kernel vhost, and the two physical machines were running different kernel versions.
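A quick way to confirm this kind of mismatch is to compare the two hosts directly (ordinary shell commands, not taken from the original incident):

uname -r             # vhost lives in the kernel, so compare kernel versions
qemu-kvm --version   # rule out a qemu mismatch (binary name varies by distro)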

Summary

There is a lot of page-handling code and I don't fully understand the details yet; I'll write it up once I do.

There is little material on open-source projects, and walking someone through a codebase is genuinely hard. I am not much of a documentation writer, so this is a rough note to reinforce my own grasp of the code; it only scratches the surface. To really understand the code you have to grind at it relentlessly; there is no shortcut.
