migration/rdma.c

   1 /*
   2  * RDMA protocol and interfaces
   3  *
   4  * Copyright IBM, Corp. 2010-2013
   5  * Copyright Red Hat, Inc. 2015-2016
   6  *
   7  * Authors:
   8  *  Michael R. Hines <mrhines@us.ibm.com>
   9  *  Jiuxing Liu <jl@us.ibm.com>
  10  *  Daniel P. Berrange <berrange@redhat.com>
  11  *
  12  * This work is licensed under the terms of the GNU GPL, version 2 or
  13  * later.  See the COPYING file in the top-level directory.
  14  *
  15  */
  16
  17 #include "qemu/osdep.h"
  18 #include "qapi/error.h"
  19 #include "qemu/cutils.h"
  20 #include "exec/target_page.h"
  21 #include "rdma.h"
  22 #include "migration.h"
  23 #include "migration-stats.h"
  24 #include "qemu-file.h"
  25 #include "ram.h"
  26 #include "qemu/error-report.h"
  27 #include "qemu/main-loop.h"
  28 #include "qemu/module.h"
  29 #include "qemu/rcu.h"
  30 #include "qemu/sockets.h"
  31 #include "qemu/bitmap.h"
  32 #include "qemu/coroutine.h"
  33 #include "exec/memory.h"
  34 #include <sys/socket.h>
  35 #include <netdb.h>
  36 #include <arpa/inet.h>
  37 #include <rdma/rdma_cma.h>
  38 #include "trace.h"
  39 #include "qom/object.h"
  40 #include "options.h"
  41 #include <poll.h>
  42
  43 #define ERROR(errp, fmt, ...) \
  44     do { \
  45         if (errp && (*(errp) == NULL)) { \
  46             error_setg(errp, "RDMA ERROR: " fmt, ## __VA_ARGS__); \
  47         } \
  48     } while (0)
  49
  50 #define RDMA_RESOLVE_TIMEOUT_MS 10000
  51
  52 /* Do not merge data if larger than this. */
  53 #define RDMA_MERGE_MAX (2 * 1024 * 1024)
  54 #define RDMA_SIGNALED_SEND_MAX (RDMA_MERGE_MAX / 4096)
  55
  56 #define RDMA_REG_CHUNK_SHIFT 20 /* 1 MB */
  57
  58 /*
  59  * This is only for non-live state being migrated.
  60  * Instead of RDMA_WRITE messages, we use RDMA_SEND
  61  * messages for that state, which requires a different
  62  * delivery design than main memory.
  63  */
  64 #define RDMA_SEND_INCREMENT 32768
  65
  66 /*
  67  * Maximum size infiniband SEND message
  68  */
  69 #define RDMA_CONTROL_MAX_BUFFER (512 * 1024)
  70 #define RDMA_CONTROL_MAX_COMMANDS_PER_MESSAGE 4096
  71
  72 #define RDMA_CONTROL_VERSION_CURRENT 1
  73 /*
  74  * Capabilities for negotiation.
  75  */
  76 #define RDMA_CAPABILITY_PIN_ALL 0x01
  77
  78 /*
  79  * Add the other flags above to this list of known capabilities
  80  * as they are introduced.
  81  */
  82 static uint32_t known_capabilities = RDMA_CAPABILITY_PIN_ALL;
  83
  84 /*
  85  * A work request ID is 64-bits and we split up these bits
  86  * into 3 parts:
  87  *
  88  * bits 0-15 : type of control message, 2^16
  89  * bits 16-29: ram block index, 2^14
  90  * bits 30-63: ram block chunk number, 2^34
  91  *
  92  * The last two bit ranges are only used for RDMA writes,
  93  * in order to track their completion and potentially
  94  * also track unregistration status of the message.
  95  */
  96 #define RDMA_WRID_TYPE_SHIFT  0UL
  97 #define RDMA_WRID_BLOCK_SHIFT 16UL
  98 #define RDMA_WRID_CHUNK_SHIFT 30UL
  99
 100 #define RDMA_WRID_TYPE_MASK \
 101     ((1UL << RDMA_WRID_BLOCK_SHIFT) - 1UL)
 102
 103 #define RDMA_WRID_BLOCK_MASK \
 104     (~RDMA_WRID_TYPE_MASK & ((1UL << RDMA_WRID_CHUNK_SHIFT) - 1UL))
 105
 106 #define RDMA_WRID_CHUNK_MASK (~RDMA_WRID_BLOCK_MASK & ~RDMA_WRID_TYPE_MASK)
 107
 108 /*
 109  * RDMA migration protocol:
 110  * 1. RDMA Writes (data messages, i.e. RAM)
 111  * 2. IB Send/Recv (control channel messages)
 112  */
 113 enum {
 114     RDMA_WRID_NONE = 0,
 115     RDMA_WRID_RDMA_WRITE = 1,
 116     RDMA_WRID_SEND_CONTROL = 2000,
 117     RDMA_WRID_RECV_CONTROL = 4000,
 118 };
 119
 120 /*
 121  * Work request IDs for IB SEND messages only (not RDMA writes).
 122  * This is used by the migration protocol to transmit
 123  * control messages (such as device state and registration commands)
 124  *
 125  * We could use more WRs, but we have enough for now.
 126  */
 127 enum {
 128     RDMA_WRID_READY = 0,
 129     RDMA_WRID_DATA,
 130     RDMA_WRID_CONTROL,
 131     RDMA_WRID_MAX,
 132 };
 133
 134 /*
 135  * SEND/RECV IB Control Messages.
 136  */
 137 enum {
 138     RDMA_CONTROL_NONE = 0,
 139     RDMA_CONTROL_ERROR,
 140     RDMA_CONTROL_READY,               /* ready to receive */
 141     RDMA_CONTROL_QEMU_FILE,           /* QEMUFile-transmitted bytes */
 142     RDMA_CONTROL_RAM_BLOCKS_REQUEST,  /* RAMBlock synchronization */
 143     RDMA_CONTROL_RAM_BLOCKS_RESULT,   /* RAMBlock synchronization */
 144     RDMA_CONTROL_COMPRESS,            /* page contains repeat values */
 145     RDMA_CONTROL_REGISTER_REQUEST,    /* dynamic page registration */
 146     RDMA_CONTROL_REGISTER_RESULT,     /* key to use after registration */
 147     RDMA_CONTROL_REGISTER_FINISHED,   /* current iteration finished */
 148     RDMA_CONTROL_UNREGISTER_REQUEST,  /* dynamic UN-registration */
 149     RDMA_CONTROL_UNREGISTER_FINISHED, /* unpinning finished */
 150 };
 151
 152
 153 /*
 154  * Memory and MR structures used to represent an IB Send/Recv work request.
 155  * This is *not* used for RDMA writes, only IB Send/Recv.
 156  */
 157 typedef struct {
 158     uint8_t  control[RDMA_CONTROL_MAX_BUFFER]; /* actual buffer to register */
 159     struct   ibv_mr *control_mr;               /* registration metadata */
 160     size_t   control_len;                      /* length of the message */
 161     uint8_t *control_curr;                     /* start of unconsumed bytes */
 162 } RDMAWorkRequestData;
 163
 164 /*
 165  * Negotiate RDMA capabilities during connection-setup time.
 166  */
 167 typedef struct {
 168     uint32_t version;
 169     uint32_t flags;
 170 } RDMACapabilities;
 171
 172 static void caps_to_network(RDMACapabilities *cap)
 173 {
 174     cap->version = htonl(cap->version);
 175     cap->flags = htonl(cap->flags);
 176 }
 177
 178 static void network_to_caps(RDMACapabilities *cap)
 179 {
 180     cap->version = ntohl(cap->version);
 181     cap->flags = ntohl(cap->flags);
 182 }
 183
 184 /*
 185  * Representation of a RAMBlock from an RDMA perspective.
 186  * This is not transmitted, only local.
 187  * This and subsequent structures cannot be linked lists
 188  * because we're using a single IB message to transmit
 189  * the information. It's small anyway, so a list is overkill.
 190  */
 191 typedef struct RDMALocalBlock {
 192     char          *block_name;
 193     uint8_t       *local_host_addr; /* local virtual address */
 194     uint64_t       remote_host_addr; /* remote virtual address */
 195     uint64_t       offset;
 196     uint64_t       length;
 197     struct         ibv_mr **pmr;    /* MRs for chunk-level registration */
 198     struct         ibv_mr *mr;      /* MR for non-chunk-level registration */
 199     uint32_t      *remote_keys;     /* rkeys for chunk-level registration */
 200     uint32_t       remote_rkey;     /* rkeys for non-chunk-level registration */
 201     int            index;           /* which block are we */
 202     unsigned int   src_index;       /* (Only used on dest) */
 203     bool           is_ram_block;
 204     int            nb_chunks;
 205     unsigned long *transit_bitmap;
 206     unsigned long *unregister_bitmap;
 207 } RDMALocalBlock;
 208
 209 /*
 210  * Also represents a RAMblock, but only on the dest.
 211  * This gets transmitted by the dest during connection-time
 212  * to the source VM and then is used to populate the
 213  * corresponding RDMALocalBlock with
 214  * the information needed to perform the actual RDMA.
 215  */
 216 typedef struct QEMU_PACKED RDMADestBlock {
 217     uint64_t remote_host_addr;
 218     uint64_t offset;
 219     uint64_t length;
 220     uint32_t remote_rkey;
 221     uint32_t padding;
 222 } RDMADestBlock;
 223
 224 static const char *control_desc(unsigned int rdma_control)
 225 {
 226     static const char *strs[] = {
 227         [RDMA_CONTROL_NONE] = "NONE",
 228         [RDMA_CONTROL_ERROR] = "ERROR",
 229         [RDMA_CONTROL_READY] = "READY",
 230         [RDMA_CONTROL_QEMU_FILE] = "QEMU FILE",
 231         [RDMA_CONTROL_RAM_BLOCKS_REQUEST] = "RAM BLOCKS REQUEST",
 232         [RDMA_CONTROL_RAM_BLOCKS_RESULT] = "RAM BLOCKS RESULT",
 233         [RDMA_CONTROL_COMPRESS] = "COMPRESS",
 234         [RDMA_CONTROL_REGISTER_REQUEST] = "REGISTER REQUEST",
 235         [RDMA_CONTROL_REGISTER_RESULT] = "REGISTER RESULT",
 236         [RDMA_CONTROL_REGISTER_FINISHED] = "REGISTER FINISHED",
 237         [RDMA_CONTROL_UNREGISTER_REQUEST] = "UNREGISTER REQUEST",
 238         [RDMA_CONTROL_UNREGISTER_FINISHED] = "UNREGISTER FINISHED",
 239     };
 240
 241     if (rdma_control > RDMA_CONTROL_UNREGISTER_FINISHED) {
 242         return "??BAD CONTROL VALUE??";
 243     }
 244
 245     return strs[rdma_control];
 246 }
 247
 248 static uint64_t htonll(uint64_t v)
 249 {
 250     union { uint32_t lv[2]; uint64_t llv; } u;
 251     u.lv[0] = htonl(v >> 32);
 252     u.lv[1] = htonl(v & 0xFFFFFFFFULL);
 253     return u.llv;
 254 }
 255
 256 static uint64_t ntohll(uint64_t v)
 257 {
 258     union { uint32_t lv[2]; uint64_t llv; } u;
 259     u.llv = v;
 260     return ((uint64_t)ntohl(u.lv[0]) << 32) | (uint64_t) ntohl(u.lv[1]);
 261 }
 262
 263 static void dest_block_to_network(RDMADestBlock *db)
 264 {
 265     db->remote_host_addr = htonll(db->remote_host_addr);
 266     db->offset = htonll(db->offset);
 267     db->length = htonll(db->length);
 268     db->remote_rkey = htonl(db->remote_rkey);
 269 }
 270
 271 static void network_to_dest_block(RDMADestBlock *db)
 272 {
 273     db->remote_host_addr = ntohll(db->remote_host_addr);
 274     db->offset = ntohll(db->offset);
 275     db->length = ntohll(db->length);
 276     db->remote_rkey = ntohl(db->remote_rkey);
 277 }
 278
 279 /*
 280  * Virtual address of the above structures used for transmitting
 281  * the RAMBlock descriptions at connection-time.
 282  * This structure is *not* transmitted.
 283  */
 284 typedef struct RDMALocalBlocks {
 285     int nb_blocks;
 286     bool     init;             /* main memory init complete */
 287     RDMALocalBlock *block;
 288 } RDMALocalBlocks;
 289
 290 /*
 291  * Main data structure for RDMA state.
 292  * While there is only one copy of this structure being allocated right now,
 293  * this is the place where one would start if you wanted to consider
 294  * having more than one RDMA connection open at the same time.
 295  */
 296 typedef struct RDMAContext {
 297     char *host;
 298     int port;
 299     char *host_port;
 300
 301     RDMAWorkRequestData wr_data[RDMA_WRID_MAX];
 302
 303     /*
 304      * This is used by *_exchange_send() to figure out whether or not
 305      * the initial "READY" message has already been received or not.
 306      * This is because other functions may potentially poll() and detect
 307      * the READY message before send() does, in which case we need to
 308      * know if it completed.
 309      */
 310     int control_ready_expected;
 311
 312     /* number of outstanding writes */
 313     int nb_sent;
 314
 315     /* store info about current buffer so that we can
 316        merge it with future sends */
 317     uint64_t current_addr;
 318     uint64_t current_length;
 319     /* index of ram block the current buffer belongs to */
 320     int current_index;
 321     /* index of the chunk in the current ram block */
 322     int current_chunk;
 323
 324     bool pin_all;
 325
 326     /*
 327      * infiniband-specific variables for opening the device
 328      * and maintaining connection state and so forth.
 329      *
 330      * cm_id also has ibv_context, rdma_event_channel, and ibv_qp in
 331      * cm_id->verbs, cm_id->channel, and cm_id->qp.
 332      */
 333     struct rdma_cm_id *cm_id;               /* connection manager ID */
 334     struct rdma_cm_id *listen_id;
 335     bool connected;
 336
 337     struct ibv_context          *verbs;
 338     struct rdma_event_channel   *channel;
 339     struct ibv_qp *qp;                      /* queue pair */
 340     struct ibv_comp_channel *recv_comp_channel;  /* recv completion channel */
 341     struct ibv_comp_channel *send_comp_channel;  /* send completion channel */
 342     struct ibv_pd *pd;                      /* protection domain */
 343     struct ibv_cq *recv_cq;                 /* recvieve completion queue */
 344     struct ibv_cq *send_cq;                 /* send completion queue */
 345
 346     /*
 347      * If a previous write failed (perhaps because of a failed
 348      * memory registration, then do not attempt any future work
 349      * and remember the error state.
 350      */
 351     bool errored;
 352     bool error_reported;
 353     bool received_error;
 354
 355     /*
 356      * Description of ram blocks used throughout the code.
 357      */
 358     RDMALocalBlocks local_ram_blocks;
 359     RDMADestBlock  *dest_blocks;
 360
 361     /* Index of the next RAMBlock received during block registration */
 362     unsigned int    next_src_index;
 363
 364     /*
 365      * Migration on *destination* started.
 366      * Then use coroutine yield function.
 367      * Source runs in a thread, so we don't care.
 368      */
 369     int migration_started_on_destination;
 370
 371     int total_registrations;
 372     int total_writes;
 373
 374     int unregister_current, unregister_next;
 375     uint64_t unregistrations[RDMA_SIGNALED_SEND_MAX];
 376
 377     GHashTable *blockmap;
 378
 379     /* the RDMAContext for return path */
 380     struct RDMAContext *return_path;
 381     bool is_return_path;
 382 } RDMAContext;
 383
 384 #define TYPE_QIO_CHANNEL_RDMA "qio-channel-rdma"
 385 OBJECT_DECLARE_SIMPLE_TYPE(QIOChannelRDMA, QIO_CHANNEL_RDMA)
 386
 387
 388
 389 struct QIOChannelRDMA {
 390     QIOChannel parent;
 391     RDMAContext *rdmain;
 392     RDMAContext *rdmaout;
 393     QEMUFile *file;
 394     bool blocking; /* XXX we don't actually honour this yet */
 395 };
 396
 397 /*
 398  * Main structure for IB Send/Recv control messages.
 399  * This gets prepended at the beginning of every Send/Recv.
 400  */
 401 typedef struct QEMU_PACKED {
 402     uint32_t len;     /* Total length of data portion */
 403     uint32_t type;    /* which control command to perform */
 404     uint32_t repeat;  /* number of commands in data portion of same type */
 405     uint32_t padding;
 406 } RDMAControlHeader;
 407
 408 static void control_to_network(RDMAControlHeader *control)
 409 {
 410     control->type = htonl(control->type);
 411     control->len = htonl(control->len);
 412     control->repeat = htonl(control->repeat);
 413 }
 414
 415 static void network_to_control(RDMAControlHeader *control)
 416 {
 417     control->type = ntohl(control->type);
 418     control->len = ntohl(control->len);
 419     control->repeat = ntohl(control->repeat);
 420 }
 421
 422 /*
 423  * Register a single Chunk.
 424  * Information sent by the source VM to inform the dest
 425  * to register an single chunk of memory before we can perform
 426  * the actual RDMA operation.
 427  */
 428 typedef struct QEMU_PACKED {
 429     union QEMU_PACKED {
 430         uint64_t current_addr;  /* offset into the ram_addr_t space */
 431         uint64_t chunk;         /* chunk to lookup if unregistering */
 432     } key;
 433     uint32_t current_index; /* which ramblock the chunk belongs to */
 434     uint32_t padding;
 435     uint64_t chunks;            /* how many sequential chunks to register */
 436 } RDMARegister;
 437
 438 static bool rdma_errored(RDMAContext *rdma)
 439 {
 440     if (rdma->errored && !rdma->error_reported) {
 441         error_report("RDMA is in an error state waiting migration"
 442                      " to abort!");
 443         rdma->error_reported = true;
 444     }
 445     return rdma->errored;
 446 }
 447
 448 static void register_to_network(RDMAContext *rdma, RDMARegister *reg)
 449 {
 450     RDMALocalBlock *local_block;
 451     local_block  = &rdma->local_ram_blocks.block[reg->current_index];
 452
 453     if (local_block->is_ram_block) {
 454         /*
 455          * current_addr as passed in is an address in the local ram_addr_t
 456          * space, we need to translate this for the destination
 457          */
 458         reg->key.current_addr -= local_block->offset;
 459         reg->key.current_addr += rdma->dest_blocks[reg->current_index].offset;
 460     }
 461     reg->key.current_addr = htonll(reg->key.current_addr);
 462     reg->current_index = htonl(reg->current_index);
 463     reg->chunks = htonll(reg->chunks);
 464 }
 465
 466 static void network_to_register(RDMARegister *reg)
 467 {
 468     reg->key.current_addr = ntohll(reg->key.current_addr);
 469     reg->current_index = ntohl(reg->current_index);
 470     reg->chunks = ntohll(reg->chunks);
 471 }
 472
 473 typedef struct QEMU_PACKED {
 474     uint32_t value;     /* if zero, we will madvise() */
 475     uint32_t block_idx; /* which ram block index */
 476     uint64_t offset;    /* Address in remote ram_addr_t space */
 477     uint64_t length;    /* length of the chunk */
 478 } RDMACompress;
 479
 480 static void compress_to_network(RDMAContext *rdma, RDMACompress *comp)
 481 {
 482     comp->value = htonl(comp->value);
 483     /*
 484      * comp->offset as passed in is an address in the local ram_addr_t
 485      * space, we need to translate this for the destination
 486      */
 487     comp->offset -= rdma->local_ram_blocks.block[comp->block_idx].offset;
 488     comp->offset += rdma->dest_blocks[comp->block_idx].offset;
 489     comp->block_idx = htonl(comp->block_idx);
 490     comp->offset = htonll(comp->offset);
 491     comp->length = htonll(comp->length);
 492 }
 493
 494 static void network_to_compress(RDMACompress *comp)
 495 {
 496     comp->value = ntohl(comp->value);
 497     comp->block_idx = ntohl(comp->block_idx);
 498     comp->offset = ntohll(comp->offset);
 499     comp->length = ntohll(comp->length);
 500 }
 501
 502 /*
 503  * The result of the dest's memory registration produces an "rkey"
 504  * which the source VM must reference in order to perform
 505  * the RDMA operation.
 506  */
 507 typedef struct QEMU_PACKED {
 508     uint32_t rkey;
 509     uint32_t padding;
 510     uint64_t host_addr;
 511 } RDMARegisterResult;
 512
 513 static void result_to_network(RDMARegisterResult *result)
 514 {
 515     result->rkey = htonl(result->rkey);
 516     result->host_addr = htonll(result->host_addr);
 517 };
 518
 519 static void network_to_result(RDMARegisterResult *result)
 520 {
 521     result->rkey = ntohl(result->rkey);
 522     result->host_addr = ntohll(result->host_addr);
 523 };
 524
 525 static int qemu_rdma_exchange_send(RDMAContext *rdma, RDMAControlHeader *head,
 526                                    uint8_t *data, RDMAControlHeader *resp,
 527                                    int *resp_idx,
 528                                    int (*callback)(RDMAContext *rdma));
 529
 530 static inline uint64_t ram_chunk_index(const uint8_t *start,
 531                                        const uint8_t *host)
 532 {
 533     return ((uintptr_t) host - (uintptr_t) start) >> RDMA_REG_CHUNK_SHIFT;
 534 }
 535
 536 static inline uint8_t *ram_chunk_start(const RDMALocalBlock *rdma_ram_block,
 537                                        uint64_t i)
 538 {
 539     return (uint8_t *)(uintptr_t)(rdma_ram_block->local_host_addr +
 540                                   (i << RDMA_REG_CHUNK_SHIFT));
 541 }
 542
 543 static inline uint8_t *ram_chunk_end(const RDMALocalBlock *rdma_ram_block,
 544                                      uint64_t i)
 545 {
 546     uint8_t *result = ram_chunk_start(rdma_ram_block, i) +
 547                                          (1UL << RDMA_REG_CHUNK_SHIFT);
 548
 549     if (result > (rdma_ram_block->local_host_addr + rdma_ram_block->length)) {
 550         result = rdma_ram_block->local_host_addr + rdma_ram_block->length;
 551     }
 552
 553     return result;
 554 }
 555
 556 static void rdma_add_block(RDMAContext *rdma, const char *block_name,
 557                            void *host_addr,
 558                            ram_addr_t block_offset, uint64_t length)
 559 {
 560     RDMALocalBlocks *local = &rdma->local_ram_blocks;
 561     RDMALocalBlock *block;
 562     RDMALocalBlock *old = local->block;
 563
 564     local->block = g_new0(RDMALocalBlock, local->nb_blocks + 1);
 565
 566     if (local->nb_blocks) {
 567         int x;
 568
 569         if (rdma->blockmap) {
 570             for (x = 0; x < local->nb_blocks; x++) {
 571                 g_hash_table_remove(rdma->blockmap,
 572                                     (void *)(uintptr_t)old[x].offset);
 573                 g_hash_table_insert(rdma->blockmap,
 574                                     (void *)(uintptr_t)old[x].offset,
 575                                     &local->block[x]);
 576             }
 577         }
 578         memcpy(local->block, old, sizeof(RDMALocalBlock) * local->nb_blocks);
 579         g_free(old);
 580     }
 581
 582     block = &local->block[local->nb_blocks];
 583
 584     block->block_name = g_strdup(block_name);
 585     block->local_host_addr = host_addr;
 586     block->offset = block_offset;
 587     block->length = length;
 588     block->index = local->nb_blocks;
 589     block->src_index = ~0U; /* Filled in by the receipt of the block list */
 590     block->nb_chunks = ram_chunk_index(host_addr, host_addr + length) + 1UL;
 591     block->transit_bitmap = bitmap_new(block->nb_chunks);
 592     bitmap_clear(block->transit_bitmap, 0, block->nb_chunks);
 593     block->unregister_bitmap = bitmap_new(block->nb_chunks);
 594     bitmap_clear(block->unregister_bitmap, 0, block->nb_chunks);
 595     block->remote_keys = g_new0(uint32_t, block->nb_chunks);
 596
 597     block->is_ram_block = local->init ? false : true;
 598
 599     if (rdma->blockmap) {
 600         g_hash_table_insert(rdma->blockmap, (void *)(uintptr_t)block_offset, block);
 601     }
 602
 603     trace_rdma_add_block(block_name, local->nb_blocks,
 604                          (uintptr_t) block->local_host_addr,
 605                          block->offset, block->length,
 606                          (uintptr_t) (block->local_host_addr + block->length),
 607                          BITS_TO_LONGS(block->nb_chunks) *
 608                              sizeof(unsigned long) * 8,
 609                          block->nb_chunks);
 610
 611     local->nb_blocks++;
 612 }
 613
 614 /*
 615  * Memory regions need to be registered with the device and queue pairs setup
 616  * in advanced before the migration starts. This tells us where the RAM blocks
 617  * are so that we can register them individually.
 618  */
 619 static int qemu_rdma_init_one_block(RAMBlock *rb, void *opaque)
 620 {
 621     const char *block_name = qemu_ram_get_idstr(rb);
 622     void *host_addr = qemu_ram_get_host_addr(rb);
 623     ram_addr_t block_offset = qemu_ram_get_offset(rb);
 624     ram_addr_t length = qemu_ram_get_used_length(rb);
 625     rdma_add_block(opaque, block_name, host_addr, block_offset, length);
 626     return 0;
 627 }
 628
 629 /*
 630  * Identify the RAMBlocks and their quantity. They will be references to
 631  * identify chunk boundaries inside each RAMBlock and also be referenced
 632  * during dynamic page registration.
 633  */
 634 static void qemu_rdma_init_ram_blocks(RDMAContext *rdma)
 635 {
 636     RDMALocalBlocks *local = &rdma->local_ram_blocks;
 637     int ret;
 638
 639     assert(rdma->blockmap == NULL);
 640     memset(local, 0, sizeof *local);
 641     ret = foreach_not_ignored_block(qemu_rdma_init_one_block, rdma);
 642     assert(!ret);
 643     trace_qemu_rdma_init_ram_blocks(local->nb_blocks);
 644     rdma->dest_blocks = g_new0(RDMADestBlock,
 645                                rdma->local_ram_blocks.nb_blocks);
 646     local->init = true;
 647 }
 648
 649 /*
 650  * Note: If used outside of cleanup, the caller must ensure that the destination
 651  * block structures are also updated
 652  */
 653 static void rdma_delete_block(RDMAContext *rdma, RDMALocalBlock *block)
 654 {
 655     RDMALocalBlocks *local = &rdma->local_ram_blocks;
 656     RDMALocalBlock *old = local->block;
 657     int x;
 658
 659     if (rdma->blockmap) {
 660         g_hash_table_remove(rdma->blockmap, (void *)(uintptr_t)block->offset);
 661     }
 662     if (block->pmr) {
 663         int j;
 664
 665         for (j = 0; j < block->nb_chunks; j++) {
 666             if (!block->pmr[j]) {
 667                 continue;
 668             }
 669             ibv_dereg_mr(block->pmr[j]);
 670             rdma->total_registrations--;
 671         }
 672         g_free(block->pmr);
 673         block->pmr = NULL;
 674     }
 675
 676     if (block->mr) {
 677         ibv_dereg_mr(block->mr);
 678         rdma->total_registrations--;
 679         block->mr = NULL;
 680     }
 681
 682     g_free(block->transit_bitmap);
 683     block->transit_bitmap = NULL;
 684
 685     g_free(block->unregister_bitmap);
 686     block->unregister_bitmap = NULL;
 687
 688     g_free(block->remote_keys);
 689     block->remote_keys = NULL;
 690
 691     g_free(block->block_name);
 692     block->block_name = NULL;
 693
 694     if (rdma->blockmap) {
 695         for (x = 0; x < local->nb_blocks; x++) {
 696             g_hash_table_remove(rdma->blockmap,
 697                                 (void *)(uintptr_t)old[x].offset);
 698         }
 699     }
 700
 701     if (local->nb_blocks > 1) {
 702
 703         local->block = g_new0(RDMALocalBlock, local->nb_blocks - 1);
 704
 705         if (block->index) {
 706             memcpy(local->block, old, sizeof(RDMALocalBlock) * block->index);
 707         }
 708
 709         if (block->index < (local->nb_blocks - 1)) {
 710             memcpy(local->block + block->index, old + (block->index + 1),
 711                 sizeof(RDMALocalBlock) *
 712                     (local->nb_blocks - (block->index + 1)));
 713             for (x = block->index; x < local->nb_blocks - 1; x++) {
 714                 local->block[x].index--;
 715             }
 716         }
 717     } else {
 718         assert(block == local->block);
 719         local->block = NULL;
 720     }
 721
 722     trace_rdma_delete_block(block, (uintptr_t)block->local_host_addr,
 723                            block->offset, block->length,
 724                             (uintptr_t)(block->local_host_addr + block->length),
 725                            BITS_TO_LONGS(block->nb_chunks) *
 726                                sizeof(unsigned long) * 8, block->nb_chunks);
 727
 728     g_free(old);
 729
 730     local->nb_blocks--;
 731
 732     if (local->nb_blocks && rdma->blockmap) {
 733         for (x = 0; x < local->nb_blocks; x++) {
 734             g_hash_table_insert(rdma->blockmap,
 735                                 (void *)(uintptr_t)local->block[x].offset,
 736                                 &local->block[x]);
 737         }
 738     }
 739 }
 740
 741 /*
 742  * Put in the log file which RDMA device was opened and the details
 743  * associated with that device.
 744  */
 745 static void qemu_rdma_dump_id(const char *who, struct ibv_context *verbs)
 746 {
 747     struct ibv_port_attr port;
 748
 749     if (ibv_query_port(verbs, 1, &port)) {
 750         error_report("Failed to query port information");
 751         return;
 752     }
 753
 754     printf("%s RDMA Device opened: kernel name %s "
 755            "uverbs device name %s, "
 756            "infiniband_verbs class device path %s, "
 757            "infiniband class device path %s, "
 758            "transport: (%d) %s\n",
 759                 who,
 760                 verbs->device->name,
 761                 verbs->device->dev_name,
 762                 verbs->device->dev_path,
 763                 verbs->device->ibdev_path,
 764                 port.link_layer,
 765                 (port.link_layer == IBV_LINK_LAYER_INFINIBAND) ? "Infiniband" :
 766                  ((port.link_layer == IBV_LINK_LAYER_ETHERNET)
 767                     ? "Ethernet" : "Unknown"));
 768 }
 769
 770 /*
 771  * Put in the log file the RDMA gid addressing information,
 772  * useful for folks who have trouble understanding the
 773  * RDMA device hierarchy in the kernel.
 774  */
 775 static void qemu_rdma_dump_gid(const char *who, struct rdma_cm_id *id)
 776 {
 777     char sgid[33];
 778     char dgid[33];
 779     inet_ntop(AF_INET6, &id->route.addr.addr.ibaddr.sgid, sgid, sizeof sgid);
 780     inet_ntop(AF_INET6, &id->route.addr.addr.ibaddr.dgid, dgid, sizeof dgid);
 781     trace_qemu_rdma_dump_gid(who, sgid, dgid);
 782 }
 783
 784 /*
 785  * As of now, IPv6 over RoCE / iWARP is not supported by linux.
 786  * We will try the next addrinfo struct, and fail if there are
 787  * no other valid addresses to bind against.
 788  *
 789  * If user is listening on '[::]', then we will not have a opened a device
 790  * yet and have no way of verifying if the device is RoCE or not.
 791  *
 792  * In this case, the source VM will throw an error for ALL types of
 793  * connections (both IPv4 and IPv6) if the destination machine does not have
 794  * a regular infiniband network available for use.
 795  *
 796  * The only way to guarantee that an error is thrown for broken kernels is
 797  * for the management software to choose a *specific* interface at bind time
 798  * and validate what time of hardware it is.
 799  *
 800  * Unfortunately, this puts the user in a fix:
 801  *
 802  *  If the source VM connects with an IPv4 address without knowing that the
 803  *  destination has bound to '[::]' the migration will unconditionally fail
 804  *  unless the management software is explicitly listening on the IPv4
 805  *  address while using a RoCE-based device.
 806  *
 807  *  If the source VM connects with an IPv6 address, then we're OK because we can
 808  *  throw an error on the source (and similarly on the destination).
 809  *
 810  *  But in mixed environments, this will be broken for a while until it is fixed
 811  *  inside linux.
 812  *
 813  * We do provide a *tiny* bit of help in this function: We can list all of the
 814  * devices in the system and check to see if all the devices are RoCE or
 815  * Infiniband.
 816  *
 817  * If we detect that we have a *pure* RoCE environment, then we can safely
 818  * thrown an error even if the management software has specified '[::]' as the
 819  * bind address.
 820  *
 821  * However, if there is are multiple hetergeneous devices, then we cannot make
 822  * this assumption and the user just has to be sure they know what they are
 823  * doing.
 824  *
 825  * Patches are being reviewed on linux-rdma.
 826  */
 827 static int qemu_rdma_broken_ipv6_kernel(struct ibv_context *verbs, Error **errp)
 828 {
 829     /* This bug only exists in linux, to our knowledge. */
 830 #ifdef CONFIG_LINUX
 831     struct ibv_port_attr port_attr;
 832
 833     /*
 834      * Verbs are only NULL if management has bound to '[::]'.
 835      *
 836      * Let's iterate through all the devices and see if there any pure IB
 837      * devices (non-ethernet).
 838      *
 839      * If not, then we can safely proceed with the migration.
 840      * Otherwise, there are no guarantees until the bug is fixed in linux.
 841      */
 842     if (!verbs) {
 843         int num_devices, x;
 844         struct ibv_device **dev_list = ibv_get_device_list(&num_devices);
 845         bool roce_found = false;
 846         bool ib_found = false;
 847
 848         for (x = 0; x < num_devices; x++) {
 849             verbs = ibv_open_device(dev_list[x]);
 850             /*
 851              * ibv_open_device() is not documented to set errno.  If
 852              * it does, it's somebody else's doc bug.  If it doesn't,
 853              * the use of errno below is wrong.
 854              * TODO Find out whether ibv_open_device() sets errno.
 855              */
 856             if (!verbs) {
 857                 if (errno == EPERM) {
 858                     continue;
 859                 } else {
 860                     error_setg_errno(errp, errno,
 861                                      "could not open RDMA device context");
 862                     return -1;
 863                 }
 864             }
 865
 866             if (ibv_query_port(verbs, 1, &port_attr)) {
 867                 ibv_close_device(verbs);
 868                 ERROR(errp, "Could not query initial IB port");
 869                 return -1;
 870             }
 871
 872             if (port_attr.link_layer == IBV_LINK_LAYER_INFINIBAND) {
 873                 ib_found = true;
 874             } else if (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) {
 875                 roce_found = true;
 876             }
 877
 878             ibv_close_device(verbs);
 879
 880         }
 881
 882         if (roce_found) {
 883             if (ib_found) {
 884                 fprintf(stderr, "WARN: migrations may fail:"
 885                                 " IPv6 over RoCE / iWARP in linux"
 886                                 " is broken. But since you appear to have a"
 887                                 " mixed RoCE / IB environment, be sure to only"
 888                                 " migrate over the IB fabric until the kernel "
 889                                 " fixes the bug.\n");
 890             } else {
 891                 ERROR(errp, "You only have RoCE / iWARP devices in your systems"
 892                             " and your management software has specified '[::]'"
 893                             ", but IPv6 over RoCE / iWARP is not supported in Linux.");
 894                 return -1;
 895             }
 896         }
 897
 898         return 0;
 899     }
 900
 901     /*
 902      * If we have a verbs context, that means that some other than '[::]' was
 903      * used by the management software for binding. In which case we can
 904      * actually warn the user about a potentially broken kernel.
 905      */
 906
 907     /* IB ports start with 1, not 0 */
 908     if (ibv_query_port(verbs, 1, &port_attr)) {
 909         ERROR(errp, "Could not query initial IB port");
 910         return -1;
 911     }
 912
 913     if (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) {
 914         ERROR(errp, "Linux kernel's RoCE / iWARP does not support IPv6 "
 915                     "(but patches on linux-rdma in progress)");
 916         return -1;
 917     }
 918
 919 #endif
 920
 921     return 0;
 922 }
 923
 924 /*
 925  * Figure out which RDMA device corresponds to the requested IP hostname
 926  * Also create the initial connection manager identifiers for opening
 927  * the connection.
 928  */
 929 static int qemu_rdma_resolve_host(RDMAContext *rdma, Error **errp)
 930 {
 931     int ret;
 932     struct rdma_addrinfo *res;
 933     char port_str[16];
 934     struct rdma_cm_event *cm_event;
 935     char ip[40] = "unknown";
 936     struct rdma_addrinfo *e;
 937
 938     if (rdma->host == NULL || !strcmp(rdma->host, "")) {
 939         ERROR(errp, "RDMA hostname has not been set");
 940         return -1;
 941     }
 942
 943     /* create CM channel */
 944     rdma->channel = rdma_create_event_channel();
 945     if (!rdma->channel) {
 946         ERROR(errp, "could not create CM channel");
 947         return -1;
 948     }
 949
 950     /* create CM id */
 951     ret = rdma_create_id(rdma->channel, &rdma->cm_id, NULL, RDMA_PS_TCP);
 952     if (ret < 0) {
 953         ERROR(errp, "could not create channel id");
 954         goto err_resolve_create_id;
 955     }
 956
 957     snprintf(port_str, 16, "%d", rdma->port);
 958     port_str[15] = '\0';
 959
 960     ret = rdma_getaddrinfo(rdma->host, port_str, NULL, &res);
 961     if (ret) {
 962         ERROR(errp, "could not rdma_getaddrinfo address %s", rdma->host);
 963         goto err_resolve_get_addr;
 964     }
 965
 966     for (e = res; e != NULL; e = e->ai_next) {
 967         inet_ntop(e->ai_family,
 968             &((struct sockaddr_in *) e->ai_dst_addr)->sin_addr, ip, sizeof ip);
 969         trace_qemu_rdma_resolve_host_trying(rdma->host, ip);
 970
 971         ret = rdma_resolve_addr(rdma->cm_id, NULL, e->ai_dst_addr,
 972                 RDMA_RESOLVE_TIMEOUT_MS);
 973         if (ret >= 0) {
 974             if (e->ai_family == AF_INET6) {
 975                 ret = qemu_rdma_broken_ipv6_kernel(rdma->cm_id->verbs, errp);
 976                 if (ret < 0) {
 977                     continue;
 978                 }
 979             }
 980             goto route;
 981         }
 982     }
 983
 984     rdma_freeaddrinfo(res);
 985     ERROR(errp, "could not resolve address %s", rdma->host);
 986     goto err_resolve_get_addr;
 987
 988 route:
 989     rdma_freeaddrinfo(res);
 990     qemu_rdma_dump_gid("source_resolve_addr", rdma->cm_id);
 991
 992     ret = rdma_get_cm_event(rdma->channel, &cm_event);
 993     if (ret < 0) {
 994         ERROR(errp, "could not perform event_addr_resolved");
 995         goto err_resolve_get_addr;
 996     }
 997
 998     if (cm_event->event != RDMA_CM_EVENT_ADDR_RESOLVED) {
 999         ERROR(errp, "result not equal to event_addr_resolved %s",
1000                 rdma_event_str(cm_event->event));
1001         error_report("rdma_resolve_addr");
1002         rdma_ack_cm_event(cm_event);
1003         goto err_resolve_get_addr;
1004     }
1005     rdma_ack_cm_event(cm_event);
1006
1007     /* resolve route */
1008     ret = rdma_resolve_route(rdma->cm_id, RDMA_RESOLVE_TIMEOUT_MS);
1009     if (ret < 0) {
1010         ERROR(errp, "could not resolve rdma route");
1011         goto err_resolve_get_addr;
1012     }
1013
1014     ret = rdma_get_cm_event(rdma->channel, &cm_event);
1015     if (ret < 0) {
1016         ERROR(errp, "could not perform event_route_resolved");
1017         goto err_resolve_get_addr;
1018     }
1019     if (cm_event->event != RDMA_CM_EVENT_ROUTE_RESOLVED) {
1020         ERROR(errp, "result not equal to event_route_resolved: %s",
1021                         rdma_event_str(cm_event->event));
1022         rdma_ack_cm_event(cm_event);
1023         goto err_resolve_get_addr;
1024     }
1025     rdma_ack_cm_event(cm_event);
1026     rdma->verbs = rdma->cm_id->verbs;
1027     qemu_rdma_dump_id("source_resolve_host", rdma->cm_id->verbs);
1028     qemu_rdma_dump_gid("source_resolve_host", rdma->cm_id);
1029     return 0;
1030
1031 err_resolve_get_addr:
1032     rdma_destroy_id(rdma->cm_id);
1033     rdma->cm_id = NULL;
1034 err_resolve_create_id:
1035     rdma_destroy_event_channel(rdma->channel);
1036     rdma->channel = NULL;
1037     return -1;
1038 }
1039
1040 /*
1041  * Create protection domain and completion queues
1042  */
1043 static int qemu_rdma_alloc_pd_cq(RDMAContext *rdma)
1044 {
1045     /* allocate pd */
1046     rdma->pd = ibv_alloc_pd(rdma->verbs);
1047     if (!rdma->pd) {
1048         error_report("failed to allocate protection domain");
1049         return -1;
1050     }
1051
1052     /* create receive completion channel */
1053     rdma->recv_comp_channel = ibv_create_comp_channel(rdma->verbs);
1054     if (!rdma->recv_comp_channel) {
1055         error_report("failed to allocate receive completion channel");
1056         goto err_alloc_pd_cq;
1057     }
1058
1059     /*
1060      * Completion queue can be filled by read work requests.
1061      */
1062     rdma->recv_cq = ibv_create_cq(rdma->verbs, (RDMA_SIGNALED_SEND_MAX * 3),
1063                                   NULL, rdma->recv_comp_channel, 0);
1064     if (!rdma->recv_cq) {
1065         error_report("failed to allocate receive completion queue");
1066         goto err_alloc_pd_cq;
1067     }
1068
1069     /* create send completion channel */
1070     rdma->send_comp_channel = ibv_create_comp_channel(rdma->verbs);
1071     if (!rdma->send_comp_channel) {
1072         error_report("failed to allocate send completion channel");
1073         goto err_alloc_pd_cq;
1074     }
1075
1076     rdma->send_cq = ibv_create_cq(rdma->verbs, (RDMA_SIGNALED_SEND_MAX * 3),
1077                                   NULL, rdma->send_comp_channel, 0);
1078     if (!rdma->send_cq) {
1079         error_report("failed to allocate send completion queue");
1080         goto err_alloc_pd_cq;
1081     }
1082
1083     return 0;
1084
1085 err_alloc_pd_cq:
1086     if (rdma->pd) {
1087         ibv_dealloc_pd(rdma->pd);
1088     }
1089     if (rdma->recv_comp_channel) {
1090         ibv_destroy_comp_channel(rdma->recv_comp_channel);
1091     }
1092     if (rdma->send_comp_channel) {
1093         ibv_destroy_comp_channel(rdma->send_comp_channel);
1094     }
1095     if (rdma->recv_cq) {
1096         ibv_destroy_cq(rdma->recv_cq);
1097         rdma->recv_cq = NULL;
1098     }
1099     rdma->pd = NULL;
1100     rdma->recv_comp_channel = NULL;
1101     rdma->send_comp_channel = NULL;
1102     return -1;
1103
1104 }
1105
1106 /*
1107  * Create queue pairs.
1108  */
1109 static int qemu_rdma_alloc_qp(RDMAContext *rdma)
1110 {
1111     struct ibv_qp_init_attr attr = { 0 };
1112     int ret;
1113
1114     attr.cap.max_send_wr = RDMA_SIGNALED_SEND_MAX;
1115     attr.cap.max_recv_wr = 3;
1116     attr.cap.max_send_sge = 1;
1117     attr.cap.max_recv_sge = 1;
1118     attr.send_cq = rdma->send_cq;
1119     attr.recv_cq = rdma->recv_cq;
1120     attr.qp_type = IBV_QPT_RC;
1121
1122     ret = rdma_create_qp(rdma->cm_id, rdma->pd, &attr);
1123     if (ret < 0) {
1124         return -1;
1125     }
1126
1127     rdma->qp = rdma->cm_id->qp;
1128     return 0;
1129 }
1130
1131 /* Check whether On-Demand Paging is supported by RDAM device */
1132 static bool rdma_support_odp(struct ibv_context *dev)
1133 {
1134     struct ibv_device_attr_ex attr = {0};
1135     int ret = ibv_query_device_ex(dev, NULL, &attr);
1136     if (ret) {
1137         return false;
1138     }
1139
1140     if (attr.odp_caps.general_caps & IBV_ODP_SUPPORT) {
1141         return true;
1142     }
1143
1144     return false;
1145 }
1146
1147 /*
1148  * ibv_advise_mr to avoid RNR NAK error as far as possible.
1149  * The responder mr registering with ODP will sent RNR NAK back to
1150  * the requester in the face of the page fault.
1151  */
1152 static void qemu_rdma_advise_prefetch_mr(struct ibv_pd *pd, uint64_t addr,
1153                                          uint32_t len,  uint32_t lkey,
1154                                          const char *name, bool wr)
1155 {
1156 #ifdef HAVE_IBV_ADVISE_MR
1157     int ret;
1158     int advice = wr ? IBV_ADVISE_MR_ADVICE_PREFETCH_WRITE :
1159                  IBV_ADVISE_MR_ADVICE_PREFETCH;
1160     struct ibv_sge sg_list = {.lkey = lkey, .addr = addr, .length = len};
1161
1162     ret = ibv_advise_mr(pd, advice,
1163                         IBV_ADVISE_MR_FLAG_FLUSH, &sg_list, 1);
1164     /* ignore the error */
1165     trace_qemu_rdma_advise_mr(name, len, addr, strerror(ret));
1166 #endif
1167 }
1168
1169 static int qemu_rdma_reg_whole_ram_blocks(RDMAContext *rdma)
1170 {
1171     int i;
1172     RDMALocalBlocks *local = &rdma->local_ram_blocks;
1173
1174     for (i = 0; i < local->nb_blocks; i++) {
1175         int access = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE;
1176
1177         local->block[i].mr =
1178             ibv_reg_mr(rdma->pd,
1179                     local->block[i].local_host_addr,
1180                     local->block[i].length, access
1181                     );
1182         /*
1183          * ibv_reg_mr() is not documented to set errno.  If it does,
1184          * it's somebody else's doc bug.  If it doesn't, the use of
1185          * errno below is wrong.
1186          * TODO Find out whether ibv_reg_mr() sets errno.
1187          */
1188         if (!local->block[i].mr &&
1189             errno == ENOTSUP && rdma_support_odp(rdma->verbs)) {
1190                 access |= IBV_ACCESS_ON_DEMAND;
1191                 /* register ODP mr */
1192                 local->block[i].mr =
1193                     ibv_reg_mr(rdma->pd,
1194                                local->block[i].local_host_addr,
1195                                local->block[i].length, access);
1196                 trace_qemu_rdma_register_odp_mr(local->block[i].block_name);
1197
1198                 if (local->block[i].mr) {
1199                     qemu_rdma_advise_prefetch_mr(rdma->pd,
1200                                     (uintptr_t)local->block[i].local_host_addr,
1201                                     local->block[i].length,
1202                                     local->block[i].mr->lkey,
1203                                     local->block[i].block_name,
1204                                     true);
1205                 }
1206         }
1207
1208         if (!local->block[i].mr) {
1209             perror("Failed to register local dest ram block!");
1210             break;
1211         }
1212         rdma->total_registrations++;
1213     }
1214
1215     if (i >= local->nb_blocks) {
1216         return 0;
1217     }
1218
1219     for (i--; i >= 0; i--) {
1220         ibv_dereg_mr(local->block[i].mr);
1221         local->block[i].mr = NULL;
1222         rdma->total_registrations--;
1223     }
1224
1225     return -1;
1226
1227 }
1228
1229 /*
1230  * Find the ram block that corresponds to the page requested to be
1231  * transmitted by QEMU.
1232  *
1233  * Once the block is found, also identify which 'chunk' within that
1234  * block that the page belongs to.
1235  */
1236 static void qemu_rdma_search_ram_block(RDMAContext *rdma,
1237                                        uintptr_t block_offset,
1238                                        uint64_t offset,
1239                                        uint64_t length,
1240                                        uint64_t *block_index,
1241                                        uint64_t *chunk_index)
1242 {
1243     uint64_t current_addr = block_offset + offset;
1244     RDMALocalBlock *block = g_hash_table_lookup(rdma->blockmap,
1245                                                 (void *) block_offset);
1246     assert(block);
1247     assert(current_addr >= block->offset);
1248     assert((current_addr + length) <= (block->offset + block->length));
1249
1250     *block_index = block->index;
1251     *chunk_index = ram_chunk_index(block->local_host_addr,
1252                 block->local_host_addr + (current_addr - block->offset));
1253 }
1254
1255 /*
1256  * Register a chunk with IB. If the chunk was already registered
1257  * previously, then skip.
1258  *
1259  * Also return the keys associated with the registration needed
1260  * to perform the actual RDMA operation.
1261  */
1262 static int qemu_rdma_register_and_get_keys(RDMAContext *rdma,
1263         RDMALocalBlock *block, uintptr_t host_addr,
1264         uint32_t *lkey, uint32_t *rkey, int chunk,
1265         uint8_t *chunk_start, uint8_t *chunk_end)
1266 {
1267     if (block->mr) {
1268         if (lkey) {
1269             *lkey = block->mr->lkey;
1270         }
1271         if (rkey) {
1272             *rkey = block->mr->rkey;
1273         }
1274         return 0;
1275     }
1276
1277     /* allocate memory to store chunk MRs */
1278     if (!block->pmr) {
1279         block->pmr = g_new0(struct ibv_mr *, block->nb_chunks);
1280     }
1281
1282     /*
1283      * If 'rkey', then we're the destination, so grant access to the source.
1284      *
1285      * If 'lkey', then we're the source VM, so grant access only to ourselves.
1286      */
1287     if (!block->pmr[chunk]) {
1288         uint64_t len = chunk_end - chunk_start;
1289         int access = rkey ? IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE :
1290                      0;
1291
1292         trace_qemu_rdma_register_and_get_keys(len, chunk_start);
1293
1294         block->pmr[chunk] = ibv_reg_mr(rdma->pd, chunk_start, len, access);
1295         /*
1296          * ibv_reg_mr() is not documented to set errno.  If it does,
1297          * it's somebody else's doc bug.  If it doesn't, the use of
1298          * errno below is wrong.
1299          * TODO Find out whether ibv_reg_mr() sets errno.
1300          */
1301         if (!block->pmr[chunk] &&
1302             errno == ENOTSUP && rdma_support_odp(rdma->verbs)) {
1303             access |= IBV_ACCESS_ON_DEMAND;
1304             /* register ODP mr */
1305             block->pmr[chunk] = ibv_reg_mr(rdma->pd, chunk_start, len, access);
1306             trace_qemu_rdma_register_odp_mr(block->block_name);
1307
1308             if (block->pmr[chunk]) {
1309                 qemu_rdma_advise_prefetch_mr(rdma->pd, (uintptr_t)chunk_start,
1310                                             len, block->pmr[chunk]->lkey,
1311                                             block->block_name, rkey);
1312
1313             }
1314         }
1315     }
1316     if (!block->pmr[chunk]) {
1317         perror("Failed to register chunk!");
1318         fprintf(stderr, "Chunk details: block: %d chunk index %d"
1319                         " start %" PRIuPTR " end %" PRIuPTR
1320                         " host %" PRIuPTR
1321                         " local %" PRIuPTR " registrations: %d\n",
1322                         block->index, chunk, (uintptr_t)chunk_start,
1323                         (uintptr_t)chunk_end, host_addr,
1324                         (uintptr_t)block->local_host_addr,
1325                         rdma->total_registrations);
1326         return -1;
1327     }
1328     rdma->total_registrations++;
1329
1330     if (lkey) {
1331         *lkey = block->pmr[chunk]->lkey;
1332     }
1333     if (rkey) {
1334         *rkey = block->pmr[chunk]->rkey;
1335     }
1336     return 0;
1337 }
1338
1339 /*
1340  * Register (at connection time) the memory used for control
1341  * channel messages.
1342  */
1343 static int qemu_rdma_reg_control(RDMAContext *rdma, int idx)
1344 {
1345     rdma->wr_data[idx].control_mr = ibv_reg_mr(rdma->pd,
1346             rdma->wr_data[idx].control, RDMA_CONTROL_MAX_BUFFER,
1347             IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE);
1348     if (rdma->wr_data[idx].control_mr) {
1349         rdma->total_registrations++;
1350         return 0;
1351     }
1352     error_report("qemu_rdma_reg_control failed");
1353     return -1;
1354 }
1355
1356 /*
1357  * Perform a non-optimized memory unregistration after every transfer
1358  * for demonstration purposes, only if pin-all is not requested.
1359  *
1360  * Potential optimizations:
1361  * 1. Start a new thread to run this function continuously
1362         - for bit clearing
1363         - and for receipt of unregister messages
1364  * 2. Use an LRU.
1365  * 3. Use workload hints.
1366  */
1367 static int qemu_rdma_unregister_waiting(RDMAContext *rdma)
1368 {
1369     while (rdma->unregistrations[rdma->unregister_current]) {
1370         int ret;
1371         uint64_t wr_id = rdma->unregistrations[rdma->unregister_current];
1372         uint64_t chunk =
1373             (wr_id & RDMA_WRID_CHUNK_MASK) >> RDMA_WRID_CHUNK_SHIFT;
1374         uint64_t index =
1375             (wr_id & RDMA_WRID_BLOCK_MASK) >> RDMA_WRID_BLOCK_SHIFT;
1376         RDMALocalBlock *block =
1377             &(rdma->local_ram_blocks.block[index]);
1378         RDMARegister reg = { .current_index = index };
1379         RDMAControlHeader resp = { .type = RDMA_CONTROL_UNREGISTER_FINISHED,
1380                                  };
1381         RDMAControlHeader head = { .len = sizeof(RDMARegister),
1382                                    .type = RDMA_CONTROL_UNREGISTER_REQUEST,
1383                                    .repeat = 1,
1384                                  };
1385
1386         trace_qemu_rdma_unregister_waiting_proc(chunk,
1387                                                 rdma->unregister_current);
1388
1389         rdma->unregistrations[rdma->unregister_current] = 0;
1390         rdma->unregister_current++;
1391
1392         if (rdma->unregister_current == RDMA_SIGNALED_SEND_MAX) {
1393             rdma->unregister_current = 0;
1394         }
1395
1396
1397         /*
1398          * Unregistration is speculative (because migration is single-threaded
1399          * and we cannot break the protocol's inifinband message ordering).
1400          * Thus, if the memory is currently being used for transmission,
1401          * then abort the attempt to unregister and try again
1402          * later the next time a completion is received for this memory.
1403          */
1404         clear_bit(chunk, block->unregister_bitmap);
1405
1406         if (test_bit(chunk, block->transit_bitmap)) {
1407             trace_qemu_rdma_unregister_waiting_inflight(chunk);
1408             continue;
1409         }
1410
1411         trace_qemu_rdma_unregister_waiting_send(chunk);
1412
1413         ret = ibv_dereg_mr(block->pmr[chunk]);
1414         block->pmr[chunk] = NULL;
1415         block->remote_keys[chunk] = 0;
1416
1417         if (ret != 0) {
1418             /*
1419              * FIXME perror() is problematic, bcause ibv_dereg_mr() is
1420              * not documented to set errno.  Will go away later in
1421              * this series.
1422              */
1423             perror("unregistration chunk failed");
1424             return -1;
1425         }
1426         rdma->total_registrations--;
1427
1428         reg.key.chunk = chunk;
1429         register_to_network(rdma, &reg);
1430         ret = qemu_rdma_exchange_send(rdma, &head, (uint8_t *) &reg,
1431                                 &resp, NULL, NULL);
1432         if (ret < 0) {
1433             return -1;
1434         }
1435
1436         trace_qemu_rdma_unregister_waiting_complete(chunk);
1437     }
1438
1439     return 0;
1440 }
1441
1442 static uint64_t qemu_rdma_make_wrid(uint64_t wr_id, uint64_t index,
1443                                          uint64_t chunk)
1444 {
1445     uint64_t result = wr_id & RDMA_WRID_TYPE_MASK;
1446
1447     result |= (index << RDMA_WRID_BLOCK_SHIFT);
1448     result |= (chunk << RDMA_WRID_CHUNK_SHIFT);
1449
1450     return result;
1451 }
1452
1453 /*
1454  * Consult the connection manager to see a work request
1455  * (of any kind) has completed.
1456  * Return the work request ID that completed.
1457  */
1458 static int qemu_rdma_poll(RDMAContext *rdma, struct ibv_cq *cq,
1459                           uint64_t *wr_id_out, uint32_t *byte_len)
1460 {
1461     int ret;
1462     struct ibv_wc wc;
1463     uint64_t wr_id;
1464
1465     ret = ibv_poll_cq(cq, 1, &wc);
1466
1467     if (!ret) {
1468         *wr_id_out = RDMA_WRID_NONE;
1469         return 0;
1470     }
1471
1472     if (ret < 0) {
1473         error_report("ibv_poll_cq failed");
1474         return -1;
1475     }
1476
1477     wr_id = wc.wr_id & RDMA_WRID_TYPE_MASK;
1478
1479     if (wc.status != IBV_WC_SUCCESS) {
1480         fprintf(stderr, "ibv_poll_cq wc.status=%d %s!\n",
1481                         wc.status, ibv_wc_status_str(wc.status));
1482         fprintf(stderr, "ibv_poll_cq wrid=%" PRIu64 "!\n", wr_id);
1483
1484         return -1;
1485     }
1486
1487     if (rdma->control_ready_expected &&
1488         (wr_id >= RDMA_WRID_RECV_CONTROL)) {
1489         trace_qemu_rdma_poll_recv(wr_id - RDMA_WRID_RECV_CONTROL, wr_id,
1490                                   rdma->nb_sent);
1491         rdma->control_ready_expected = 0;
1492     }
1493
1494     if (wr_id == RDMA_WRID_RDMA_WRITE) {
1495         uint64_t chunk =
1496             (wc.wr_id & RDMA_WRID_CHUNK_MASK) >> RDMA_WRID_CHUNK_SHIFT;
1497         uint64_t index =
1498             (wc.wr_id & RDMA_WRID_BLOCK_MASK) >> RDMA_WRID_BLOCK_SHIFT;
1499         RDMALocalBlock *block = &(rdma->local_ram_blocks.block[index]);
1500
1501         trace_qemu_rdma_poll_write(wr_id, rdma->nb_sent,
1502                                    index, chunk, block->local_host_addr,
1503                                    (void *)(uintptr_t)block->remote_host_addr);
1504
1505         clear_bit(chunk, block->transit_bitmap);
1506
1507         if (rdma->nb_sent > 0) {
1508             rdma->nb_sent--;
1509         }
1510     } else {
1511         trace_qemu_rdma_poll_other(wr_id, rdma->nb_sent);
1512     }
1513
1514     *wr_id_out = wc.wr_id;
1515     if (byte_len) {
1516         *byte_len = wc.byte_len;
1517     }
1518
1519     return  0;
1520 }
1521
1522 /* Wait for activity on the completion channel.
1523  * Returns 0 on success, none-0 on error.
1524  */
1525 static int qemu_rdma_wait_comp_channel(RDMAContext *rdma,
1526                                        struct ibv_comp_channel *comp_channel)
1527 {
1528     struct rdma_cm_event *cm_event;
1529     int ret;
1530
1531     /*
1532      * Coroutine doesn't start until migration_fd_process_incoming()
1533      * so don't yield unless we know we're running inside of a coroutine.
1534      */
1535     if (rdma->migration_started_on_destination &&
1536         migration_incoming_get_current()->state == MIGRATION_STATUS_ACTIVE) {
1537         yield_until_fd_readable(comp_channel->fd);
1538     } else {
1539         /* This is the source side, we're in a separate thread
1540          * or destination prior to migration_fd_process_incoming()
1541          * after postcopy, the destination also in a separate thread.
1542          * we can't yield; so we have to poll the fd.
1543          * But we need to be able to handle 'cancel' or an error
1544          * without hanging forever.
1545          */
1546         while (!rdma->errored && !rdma->received_error) {
1547             GPollFD pfds[2];
1548             pfds[0].fd = comp_channel->fd;
1549             pfds[0].events = G_IO_IN | G_IO_HUP | G_IO_ERR;
1550             pfds[0].revents = 0;
1551
1552             pfds[1].fd = rdma->channel->fd;
1553             pfds[1].events = G_IO_IN | G_IO_HUP | G_IO_ERR;
1554             pfds[1].revents = 0;
1555
1556             /* 0.1s timeout, should be fine for a 'cancel' */
1557             switch (qemu_poll_ns(pfds, 2, 100 * 1000 * 1000)) {
1558             case 2:
1559             case 1: /* fd active */
1560                 if (pfds[0].revents) {
1561                     return 0;
1562                 }
1563
1564                 if (pfds[1].revents) {
1565                     ret = rdma_get_cm_event(rdma->channel, &cm_event);
1566                     if (ret < 0) {
1567                         error_report("failed to get cm event while wait "
1568                                      "completion channel");
1569                         return -1;
1570                     }
1571
1572                     error_report("receive cm event while wait comp channel,"
1573                                  "cm event is %d", cm_event->event);
1574                     if (cm_event->event == RDMA_CM_EVENT_DISCONNECTED ||
1575                         cm_event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) {
1576                         rdma_ack_cm_event(cm_event);
1577                         return -1;
1578                     }
1579                     rdma_ack_cm_event(cm_event);
1580                 }
1581                 break;
1582
1583             case 0: /* Timeout, go around again */
1584                 break;
1585
1586             default: /* Error of some type -
1587                       * I don't trust errno from qemu_poll_ns
1588                      */
1589                 error_report("%s: poll failed", __func__);
1590                 return -1;
1591             }
1592
1593             if (migrate_get_current()->state == MIGRATION_STATUS_CANCELLING) {
1594                 /* Bail out and let the cancellation happen */
1595                 return -1;
1596             }
1597         }
1598     }
1599
1600     if (rdma->received_error) {
1601         return -1;
1602     }
1603     return -rdma->errored;
1604 }
1605
1606 static struct ibv_comp_channel *to_channel(RDMAContext *rdma, uint64_t wrid)
1607 {
1608     return wrid < RDMA_WRID_RECV_CONTROL ? rdma->send_comp_channel :
1609            rdma->recv_comp_channel;
1610 }
1611
1612 static struct ibv_cq *to_cq(RDMAContext *rdma, uint64_t wrid)
1613 {
1614     return wrid < RDMA_WRID_RECV_CONTROL ? rdma->send_cq : rdma->recv_cq;
1615 }
1616
1617 /*
1618  * Block until the next work request has completed.
1619  *
1620  * First poll to see if a work request has already completed,
1621  * otherwise block.
1622  *
1623  * If we encounter completed work requests for IDs other than
1624  * the one we're interested in, then that's generally an error.
1625  *
1626  * The only exception is actual RDMA Write completions. These
1627  * completions only need to be recorded, but do not actually
1628  * need further processing.
1629  */
1630 static int qemu_rdma_block_for_wrid(RDMAContext *rdma,
1631                                     uint64_t wrid_requested,
1632                                     uint32_t *byte_len)
1633 {
1634     int num_cq_events = 0, ret;
1635     struct ibv_cq *cq;
1636     void *cq_ctx;
1637     uint64_t wr_id = RDMA_WRID_NONE, wr_id_in;
1638     struct ibv_comp_channel *ch = to_channel(rdma, wrid_requested);
1639     struct ibv_cq *poll_cq = to_cq(rdma, wrid_requested);
1640
1641     if (ibv_req_notify_cq(poll_cq, 0)) {
1642         return -1;
1643     }
1644     /* poll cq first */
1645     while (wr_id != wrid_requested) {
1646         ret = qemu_rdma_poll(rdma, poll_cq, &wr_id_in, byte_len);
1647         if (ret < 0) {
1648             return -1;
1649         }
1650
1651         wr_id = wr_id_in & RDMA_WRID_TYPE_MASK;
1652
1653         if (wr_id == RDMA_WRID_NONE) {
1654             break;
1655         }
1656         if (wr_id != wrid_requested) {
1657             trace_qemu_rdma_block_for_wrid_miss(wrid_requested, wr_id);
1658         }
1659     }
1660
1661     if (wr_id == wrid_requested) {
1662         return 0;
1663     }
1664
1665     while (1) {
1666         ret = qemu_rdma_wait_comp_channel(rdma, ch);
1667         if (ret < 0) {
1668             goto err_block_for_wrid;
1669         }
1670
1671         ret = ibv_get_cq_event(ch, &cq, &cq_ctx);
1672         if (ret < 0) {
1673             /*
1674              * FIXME perror() is problematic, because ibv_reg_mr() is
1675              * not documented to set errno.  Will go away later in
1676              * this series.
1677              */
1678             perror("ibv_get_cq_event");
1679             goto err_block_for_wrid;
1680         }
1681
1682         num_cq_events++;
1683
1684         if (ibv_req_notify_cq(cq, 0)) {
1685             goto err_block_for_wrid;
1686         }
1687
1688         while (wr_id != wrid_requested) {
1689             ret = qemu_rdma_poll(rdma, poll_cq, &wr_id_in, byte_len);
1690             if (ret < 0) {
1691                 goto err_block_for_wrid;
1692             }
1693
1694             wr_id = wr_id_in & RDMA_WRID_TYPE_MASK;
1695
1696             if (wr_id == RDMA_WRID_NONE) {
1697                 break;
1698             }
1699             if (wr_id != wrid_requested) {
1700                 trace_qemu_rdma_block_for_wrid_miss(wrid_requested, wr_id);
1701             }
1702         }
1703
1704         if (wr_id == wrid_requested) {
1705             goto success_block_for_wrid;
1706         }
1707     }
1708
1709 success_block_for_wrid:
1710     if (num_cq_events) {
1711         ibv_ack_cq_events(cq, num_cq_events);
1712     }
1713     return 0;
1714
1715 err_block_for_wrid:
1716     if (num_cq_events) {
1717         ibv_ack_cq_events(cq, num_cq_events);
1718     }
1719
1720     rdma->errored = true;
1721     return -1;
1722 }
1723
1724 /*
1725  * Post a SEND message work request for the control channel
1726  * containing some data and block until the post completes.
1727  */
1728 static int qemu_rdma_post_send_control(RDMAContext *rdma, uint8_t *buf,
1729                                        RDMAControlHeader *head)
1730 {
1731     int ret;
1732     RDMAWorkRequestData *wr = &rdma->wr_data[RDMA_WRID_CONTROL];
1733     struct ibv_send_wr *bad_wr;
1734     struct ibv_sge sge = {
1735                            .addr = (uintptr_t)(wr->control),
1736                            .length = head->len + sizeof(RDMAControlHeader),
1737                            .lkey = wr->control_mr->lkey,
1738                          };
1739     struct ibv_send_wr send_wr = {
1740                                    .wr_id = RDMA_WRID_SEND_CONTROL,
1741                                    .opcode = IBV_WR_SEND,
1742                                    .send_flags = IBV_SEND_SIGNALED,
1743                                    .sg_list = &sge,
1744                                    .num_sge = 1,
1745                                 };
1746
1747     trace_qemu_rdma_post_send_control(control_desc(head->type));
1748
1749     /*
1750      * We don't actually need to do a memcpy() in here if we used
1751      * the "sge" properly, but since we're only sending control messages
1752      * (not RAM in a performance-critical path), then its OK for now.
1753      *
1754      * The copy makes the RDMAControlHeader simpler to manipulate
1755      * for the time being.
1756      */
1757     assert(head->len <= RDMA_CONTROL_MAX_BUFFER - sizeof(*head));
1758     memcpy(wr->control, head, sizeof(RDMAControlHeader));
1759     control_to_network((void *) wr->control);
1760
1761     if (buf) {
1762         memcpy(wr->control + sizeof(RDMAControlHeader), buf, head->len);
1763     }
1764
1765
1766     ret = ibv_post_send(rdma->qp, &send_wr, &bad_wr);
1767
1768     if (ret > 0) {
1769         error_report("Failed to use post IB SEND for control");
1770         return -1;
1771     }
1772
1773     ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_SEND_CONTROL, NULL);
1774     if (ret < 0) {
1775         error_report("rdma migration: send polling control error");
1776         return -1;
1777     }
1778
1779     return 0;
1780 }
1781
1782 /*
1783  * Post a RECV work request in anticipation of some future receipt
1784  * of data on the control channel.
1785  */
1786 static int qemu_rdma_post_recv_control(RDMAContext *rdma, int idx)
1787 {
1788     struct ibv_recv_wr *bad_wr;
1789     struct ibv_sge sge = {
1790                             .addr = (uintptr_t)(rdma->wr_data[idx].control),
1791                             .length = RDMA_CONTROL_MAX_BUFFER,
1792                             .lkey = rdma->wr_data[idx].control_mr->lkey,
1793                          };
1794
1795     struct ibv_recv_wr recv_wr = {
1796                                     .wr_id = RDMA_WRID_RECV_CONTROL + idx,
1797                                     .sg_list = &sge,
1798                                     .num_sge = 1,
1799                                  };
1800
1801
1802     if (ibv_post_recv(rdma->qp, &recv_wr, &bad_wr)) {
1803         return -1;
1804     }
1805
1806     return 0;
1807 }
1808
1809 /*
1810  * Block and wait for a RECV control channel message to arrive.
1811  */
1812 static int qemu_rdma_exchange_get_response(RDMAContext *rdma,
1813                 RDMAControlHeader *head, uint32_t expecting, int idx)
1814 {
1815     uint32_t byte_len;
1816     int ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RECV_CONTROL + idx,
1817                                        &byte_len);
1818
1819     if (ret < 0) {
1820         error_report("rdma migration: recv polling control error!");
1821         return -1;
1822     }
1823
1824     network_to_control((void *) rdma->wr_data[idx].control);
1825     memcpy(head, rdma->wr_data[idx].control, sizeof(RDMAControlHeader));
1826
1827     trace_qemu_rdma_exchange_get_response_start(control_desc(expecting));
1828
1829     if (expecting == RDMA_CONTROL_NONE) {
1830         trace_qemu_rdma_exchange_get_response_none(control_desc(head->type),
1831                                              head->type);
1832     } else if (head->type != expecting || head->type == RDMA_CONTROL_ERROR) {
1833         error_report("Was expecting a %s (%d) control message"
1834                 ", but got: %s (%d), length: %d",
1835                 control_desc(expecting), expecting,
1836                 control_desc(head->type), head->type, head->len);
1837         if (head->type == RDMA_CONTROL_ERROR) {
1838             rdma->received_error = true;
1839         }
1840         return -1;
1841     }
1842     if (head->len > RDMA_CONTROL_MAX_BUFFER - sizeof(*head)) {
1843         error_report("too long length: %d", head->len);
1844         return -1;
1845     }
1846     if (sizeof(*head) + head->len != byte_len) {
1847         error_report("Malformed length: %d byte_len %d", head->len, byte_len);
1848         return -1;
1849     }
1850
1851     return 0;
1852 }
1853
1854 /*
1855  * When a RECV work request has completed, the work request's
1856  * buffer is pointed at the header.
1857  *
1858  * This will advance the pointer to the data portion
1859  * of the control message of the work request's buffer that
1860  * was populated after the work request finished.
1861  */
1862 static void qemu_rdma_move_header(RDMAContext *rdma, int idx,
1863                                   RDMAControlHeader *head)
1864 {
1865     rdma->wr_data[idx].control_len = head->len;
1866     rdma->wr_data[idx].control_curr =
1867         rdma->wr_data[idx].control + sizeof(RDMAControlHeader);
1868 }
1869
1870 /*
1871  * This is an 'atomic' high-level operation to deliver a single, unified
1872  * control-channel message.
1873  *
1874  * Additionally, if the user is expecting some kind of reply to this message,
1875  * they can request a 'resp' response message be filled in by posting an
1876  * additional work request on behalf of the user and waiting for an additional
1877  * completion.
1878  *
1879  * The extra (optional) response is used during registration to us from having
1880  * to perform an *additional* exchange of message just to provide a response by
1881  * instead piggy-backing on the acknowledgement.
1882  */
1883 static int qemu_rdma_exchange_send(RDMAContext *rdma, RDMAControlHeader *head,
1884                                    uint8_t *data, RDMAControlHeader *resp,
1885                                    int *resp_idx,
1886                                    int (*callback)(RDMAContext *rdma))
1887 {
1888     int ret;
1889
1890     /*
1891      * Wait until the dest is ready before attempting to deliver the message
1892      * by waiting for a READY message.
1893      */
1894     if (rdma->control_ready_expected) {
1895         RDMAControlHeader resp_ignored;
1896
1897         ret = qemu_rdma_exchange_get_response(rdma, &resp_ignored,
1898                                               RDMA_CONTROL_READY,
1899                                               RDMA_WRID_READY);
1900         if (ret < 0) {
1901             return -1;
1902         }
1903     }
1904
1905     /*
1906      * If the user is expecting a response, post a WR in anticipation of it.
1907      */
1908     if (resp) {
1909         ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_DATA);
1910         if (ret < 0) {
1911             error_report("rdma migration: error posting"
1912                     " extra control recv for anticipated result!");
1913             return -1;
1914         }
1915     }
1916
1917     /*
1918      * Post a WR to replace the one we just consumed for the READY message.
1919      */
1920     ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY);
1921     if (ret < 0) {
1922         error_report("rdma migration: error posting first control recv!");
1923         return -1;
1924     }
1925
1926     /*
1927      * Deliver the control message that was requested.
1928      */
1929     ret = qemu_rdma_post_send_control(rdma, data, head);
1930
1931     if (ret < 0) {
1932         error_report("Failed to send control buffer!");
1933         return -1;
1934     }
1935
1936     /*
1937      * If we're expecting a response, block and wait for it.
1938      */
1939     if (resp) {
1940         if (callback) {
1941             trace_qemu_rdma_exchange_send_issue_callback();
1942             ret = callback(rdma);
1943             if (ret < 0) {
1944                 return -1;
1945             }
1946         }
1947
1948         trace_qemu_rdma_exchange_send_waiting(control_desc(resp->type));
1949         ret = qemu_rdma_exchange_get_response(rdma, resp,
1950                                               resp->type, RDMA_WRID_DATA);
1951
1952         if (ret < 0) {
1953             return -1;
1954         }
1955
1956         qemu_rdma_move_header(rdma, RDMA_WRID_DATA, resp);
1957         if (resp_idx) {
1958             *resp_idx = RDMA_WRID_DATA;
1959         }
1960         trace_qemu_rdma_exchange_send_received(control_desc(resp->type));
1961     }
1962
1963     rdma->control_ready_expected = 1;
1964
1965     return 0;
1966 }
1967
1968 /*
1969  * This is an 'atomic' high-level operation to receive a single, unified
1970  * control-channel message.
1971  */
1972 static int qemu_rdma_exchange_recv(RDMAContext *rdma, RDMAControlHeader *head,
1973                                    uint32_t expecting)
1974 {
1975     RDMAControlHeader ready = {
1976                                 .len = 0,
1977                                 .type = RDMA_CONTROL_READY,
1978                                 .repeat = 1,
1979                               };
1980     int ret;
1981
1982     /*
1983      * Inform the source that we're ready to receive a message.
1984      */
1985     ret = qemu_rdma_post_send_control(rdma, NULL, &ready);
1986
1987     if (ret < 0) {
1988         error_report("Failed to send control buffer!");
1989         return -1;
1990     }
1991
1992     /*
1993      * Block and wait for the message.
1994      */
1995     ret = qemu_rdma_exchange_get_response(rdma, head,
1996                                           expecting, RDMA_WRID_READY);
1997
1998     if (ret < 0) {
1999         return -1;
2000     }
2001
2002     qemu_rdma_move_header(rdma, RDMA_WRID_READY, head);
2003
2004     /*
2005      * Post a new RECV work request to replace the one we just consumed.
2006      */
2007     ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY);
2008     if (ret < 0) {
2009         error_report("rdma migration: error posting second control recv!");
2010         return -1;
2011     }
2012
2013     return 0;
2014 }
2015
2016 /*
2017  * Write an actual chunk of memory using RDMA.
2018  *
2019  * If we're using dynamic registration on the dest-side, we have to
2020  * send a registration command first.
2021  */
2022 static int qemu_rdma_write_one(RDMAContext *rdma,
2023                                int current_index, uint64_t current_addr,
2024                                uint64_t length)
2025 {
2026     struct ibv_sge sge;
2027     struct ibv_send_wr send_wr = { 0 };
2028     struct ibv_send_wr *bad_wr;
2029     int reg_result_idx, ret, count = 0;
2030     uint64_t chunk, chunks;
2031     uint8_t *chunk_start, *chunk_end;
2032     RDMALocalBlock *block = &(rdma->local_ram_blocks.block[current_index]);
2033     RDMARegister reg;
2034     RDMARegisterResult *reg_result;
2035     RDMAControlHeader resp = { .type = RDMA_CONTROL_REGISTER_RESULT };
2036     RDMAControlHeader head = { .len = sizeof(RDMARegister),
2037                                .type = RDMA_CONTROL_REGISTER_REQUEST,
2038                                .repeat = 1,
2039                              };
2040
2041 retry:
2042     sge.addr = (uintptr_t)(block->local_host_addr +
2043                             (current_addr - block->offset));
2044     sge.length = length;
2045
2046     chunk = ram_chunk_index(block->local_host_addr,
2047                             (uint8_t *)(uintptr_t)sge.addr);
2048     chunk_start = ram_chunk_start(block, chunk);
2049
2050     if (block->is_ram_block) {
2051         chunks = length / (1UL << RDMA_REG_CHUNK_SHIFT);
2052
2053         if (chunks && ((length % (1UL << RDMA_REG_CHUNK_SHIFT)) == 0)) {
2054             chunks--;
2055         }
2056     } else {
2057         chunks = block->length / (1UL << RDMA_REG_CHUNK_SHIFT);
2058
2059         if (chunks && ((block->length % (1UL << RDMA_REG_CHUNK_SHIFT)) == 0)) {
2060             chunks--;
2061         }
2062     }
2063
2064     trace_qemu_rdma_write_one_top(chunks + 1,
2065                                   (chunks + 1) *
2066                                   (1UL << RDMA_REG_CHUNK_SHIFT) / 1024 / 1024);
2067
2068     chunk_end = ram_chunk_end(block, chunk + chunks);
2069
2070
2071     while (test_bit(chunk, block->transit_bitmap)) {
2072         (void)count;
2073         trace_qemu_rdma_write_one_block(count++, current_index, chunk,
2074                 sge.addr, length, rdma->nb_sent, block->nb_chunks);
2075
2076         ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RDMA_WRITE, NULL);
2077
2078         if (ret < 0) {
2079             error_report("Failed to Wait for previous write to complete "
2080                     "block %d chunk %" PRIu64
2081                     " current %" PRIu64 " len %" PRIu64 " %d",
2082                     current_index, chunk, sge.addr, length, rdma->nb_sent);
2083             return -1;
2084         }
2085     }
2086
2087     if (!rdma->pin_all || !block->is_ram_block) {
2088         if (!block->remote_keys[chunk]) {
2089             /*
2090              * This chunk has not yet been registered, so first check to see
2091              * if the entire chunk is zero. If so, tell the other size to
2092              * memset() + madvise() the entire chunk without RDMA.
2093              */
2094
2095             if (buffer_is_zero((void *)(uintptr_t)sge.addr, length)) {
2096                 RDMACompress comp = {
2097                                         .offset = current_addr,
2098                                         .value = 0,
2099                                         .block_idx = current_index,
2100                                         .length = length,
2101                                     };
2102
2103                 head.len = sizeof(comp);
2104                 head.type = RDMA_CONTROL_COMPRESS;
2105
2106                 trace_qemu_rdma_write_one_zero(chunk, sge.length,
2107                                                current_index, current_addr);
2108
2109                 compress_to_network(rdma, &comp);
2110                 ret = qemu_rdma_exchange_send(rdma, &head,
2111                                 (uint8_t *) &comp, NULL, NULL, NULL);
2112
2113                 if (ret < 0) {
2114                     return -1;
2115                 }
2116
2117                 /*
2118                  * TODO: Here we are sending something, but we are not
2119                  * accounting for anything transferred.  The following is wrong:
2120                  *
2121                  * stat64_add(&mig_stats.rdma_bytes, sge.length);
2122                  *
2123                  * because we are using some kind of compression.  I
2124                  * would think that head.len would be the more similar
2125                  * thing to a correct value.
2126                  */
2127                 stat64_add(&mig_stats.zero_pages,
2128                            sge.length / qemu_target_page_size());
2129                 return 1;
2130             }
2131
2132             /*
2133              * Otherwise, tell other side to register.
2134              */
2135             reg.current_index = current_index;
2136             if (block->is_ram_block) {
2137                 reg.key.current_addr = current_addr;
2138             } else {
2139                 reg.key.chunk = chunk;
2140             }
2141             reg.chunks = chunks;
2142
2143             trace_qemu_rdma_write_one_sendreg(chunk, sge.length, current_index,
2144                                               current_addr);
2145
2146             register_to_network(rdma, &reg);
2147             ret = qemu_rdma_exchange_send(rdma, &head, (uint8_t *) &reg,
2148                                     &resp, &reg_result_idx, NULL);
2149             if (ret < 0) {
2150                 return -1;
2151             }
2152
2153             /* try to overlap this single registration with the one we sent. */
2154             if (qemu_rdma_register_and_get_keys(rdma, block, sge.addr,
2155                                                 &sge.lkey, NULL, chunk,
2156                                                 chunk_start, chunk_end)) {
2157                 error_report("cannot get lkey");
2158                 return -1;
2159             }
2160
2161             reg_result = (RDMARegisterResult *)
2162                     rdma->wr_data[reg_result_idx].control_curr;
2163
2164             network_to_result(reg_result);
2165
2166             trace_qemu_rdma_write_one_recvregres(block->remote_keys[chunk],
2167                                                  reg_result->rkey, chunk);
2168
2169             block->remote_keys[chunk] = reg_result->rkey;
2170             block->remote_host_addr = reg_result->host_addr;
2171         } else {
2172             /* already registered before */
2173             if (qemu_rdma_register_and_get_keys(rdma, block, sge.addr,
2174                                                 &sge.lkey, NULL, chunk,
2175                                                 chunk_start, chunk_end)) {
2176                 error_report("cannot get lkey!");
2177                 return -1;
2178             }
2179         }
2180
2181         send_wr.wr.rdma.rkey = block->remote_keys[chunk];
2182     } else {
2183         send_wr.wr.rdma.rkey = block->remote_rkey;
2184
2185         if (qemu_rdma_register_and_get_keys(rdma, block, sge.addr,
2186                                                      &sge.lkey, NULL, chunk,
2187                                                      chunk_start, chunk_end)) {
2188             error_report("cannot get lkey!");
2189             return -1;
2190         }
2191     }
2192
2193     /*
2194      * Encode the ram block index and chunk within this wrid.
2195      * We will use this information at the time of completion
2196      * to figure out which bitmap to check against and then which
2197      * chunk in the bitmap to look for.
2198      */
2199     send_wr.wr_id = qemu_rdma_make_wrid(RDMA_WRID_RDMA_WRITE,
2200                                         current_index, chunk);
2201
2202     send_wr.opcode = IBV_WR_RDMA_WRITE;
2203     send_wr.send_flags = IBV_SEND_SIGNALED;
2204     send_wr.sg_list = &sge;
2205     send_wr.num_sge = 1;
2206     send_wr.wr.rdma.remote_addr = block->remote_host_addr +
2207                                 (current_addr - block->offset);
2208
2209     trace_qemu_rdma_write_one_post(chunk, sge.addr, send_wr.wr.rdma.remote_addr,
2210                                    sge.length);
2211
2212     /*
2213      * ibv_post_send() does not return negative error numbers,
2214      * per the specification they are positive - no idea why.
2215      */
2216     ret = ibv_post_send(rdma->qp, &send_wr, &bad_wr);
2217
2218     if (ret == ENOMEM) {
2219         trace_qemu_rdma_write_one_queue_full();
2220         ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RDMA_WRITE, NULL);
2221         if (ret < 0) {
2222             error_report("rdma migration: failed to make "
2223                          "room in full send queue!");
2224             return -1;
2225         }
2226
2227         goto retry;
2228
2229     } else if (ret > 0) {
2230         /*
2231          * FIXME perror() is problematic, because whether
2232          * ibv_post_send() sets errno is unclear.  Will go away later
2233          * in this series.
2234          */
2235         perror("rdma migration: post rdma write failed");
2236         return -1;
2237     }
2238
2239     set_bit(chunk, block->transit_bitmap);
2240     stat64_add(&mig_stats.normal_pages, sge.length / qemu_target_page_size());
2241     /*
2242      * We are adding to transferred the amount of data written, but no
2243      * overhead at all.  I will asume that RDMA is magicaly and don't
2244      * need to transfer (at least) the addresses where it wants to
2245      * write the pages.  Here it looks like it should be something
2246      * like:
2247      *     sizeof(send_wr) + sge.length
2248      * but this being RDMA, who knows.
2249      */
2250     stat64_add(&mig_stats.rdma_bytes, sge.length);
2251     ram_transferred_add(sge.length);
2252     rdma->total_writes++;
2253
2254     return 0;
2255 }
2256
2257 /*
2258  * Push out any unwritten RDMA operations.
2259  *
2260  * We support sending out multiple chunks at the same time.
2261  * Not all of them need to get signaled in the completion queue.
2262  */
2263 static int qemu_rdma_write_flush(RDMAContext *rdma)
2264 {
2265     int ret;
2266
2267     if (!rdma->current_length) {
2268         return 0;
2269     }
2270
2271     ret = qemu_rdma_write_one(rdma,
2272             rdma->current_index, rdma->current_addr, rdma->current_length);
2273
2274     if (ret < 0) {
2275         return -1;
2276     }
2277
2278     if (ret == 0) {
2279         rdma->nb_sent++;
2280         trace_qemu_rdma_write_flush(rdma->nb_sent);
2281     }
2282
2283     rdma->current_length = 0;
2284     rdma->current_addr = 0;
2285
2286     return 0;
2287 }
2288
2289 static inline bool qemu_rdma_buffer_mergeable(RDMAContext *rdma,
2290                     uint64_t offset, uint64_t len)
2291 {
2292     RDMALocalBlock *block;
2293     uint8_t *host_addr;
2294     uint8_t *chunk_end;
2295
2296     if (rdma->current_index < 0) {
2297         return false;
2298     }
2299
2300     if (rdma->current_chunk < 0) {
2301         return false;
2302     }
2303
2304     block = &(rdma->local_ram_blocks.block[rdma->current_index]);
2305     host_addr = block->local_host_addr + (offset - block->offset);
2306     chunk_end = ram_chunk_end(block, rdma->current_chunk);
2307
2308     if (rdma->current_length == 0) {
2309         return false;
2310     }
2311
2312     /*
2313      * Only merge into chunk sequentially.
2314      */
2315     if (offset != (rdma->current_addr + rdma->current_length)) {
2316         return false;
2317     }
2318
2319     if (offset < block->offset) {
2320         return false;
2321     }
2322
2323     if ((offset + len) > (block->offset + block->length)) {
2324         return false;
2325     }
2326
2327     if ((host_addr + len) > chunk_end) {
2328         return false;
2329     }
2330
2331     return true;
2332 }
2333
2334 /*
2335  * We're not actually writing here, but doing three things:
2336  *
2337  * 1. Identify the chunk the buffer belongs to.
2338  * 2. If the chunk is full or the buffer doesn't belong to the current
2339  *    chunk, then start a new chunk and flush() the old chunk.
2340  * 3. To keep the hardware busy, we also group chunks into batches
2341  *    and only require that a batch gets acknowledged in the completion
2342  *    queue instead of each individual chunk.
2343  */
2344 static int qemu_rdma_write(RDMAContext *rdma,
2345                            uint64_t block_offset, uint64_t offset,
2346                            uint64_t len)
2347 {
2348     uint64_t current_addr = block_offset + offset;
2349     uint64_t index = rdma->current_index;
2350     uint64_t chunk = rdma->current_chunk;
2351     int ret;
2352
2353     /* If we cannot merge it, we flush the current buffer first. */
2354     if (!qemu_rdma_buffer_mergeable(rdma, current_addr, len)) {
2355         ret = qemu_rdma_write_flush(rdma);
2356         if (ret < 0) {
2357             return -1;
2358         }
2359         rdma->current_length = 0;
2360         rdma->current_addr = current_addr;
2361
2362         qemu_rdma_search_ram_block(rdma, block_offset,
2363                                    offset, len, &index, &chunk);
2364         rdma->current_index = index;
2365         rdma->current_chunk = chunk;
2366     }
2367
2368     /* merge it */
2369     rdma->current_length += len;
2370
2371     /* flush it if buffer is too large */
2372     if (rdma->current_length >= RDMA_MERGE_MAX) {
2373         return qemu_rdma_write_flush(rdma);
2374     }
2375
2376     return 0;
2377 }
2378
2379 static void qemu_rdma_cleanup(RDMAContext *rdma)
2380 {
2381     int idx;
2382
2383     if (rdma->cm_id && rdma->connected) {
2384         if ((rdma->errored ||
2385              migrate_get_current()->state == MIGRATION_STATUS_CANCELLING) &&
2386             !rdma->received_error) {
2387             RDMAControlHeader head = { .len = 0,
2388                                        .type = RDMA_CONTROL_ERROR,
2389                                        .repeat = 1,
2390                                      };
2391             error_report("Early error. Sending error.");
2392             qemu_rdma_post_send_control(rdma, NULL, &head);
2393         }
2394
2395         rdma_disconnect(rdma->cm_id);
2396         trace_qemu_rdma_cleanup_disconnect();
2397         rdma->connected = false;
2398     }
2399
2400     if (rdma->channel) {
2401         qemu_set_fd_handler(rdma->channel->fd, NULL, NULL, NULL);
2402     }
2403     g_free(rdma->dest_blocks);
2404     rdma->dest_blocks = NULL;
2405
2406     for (idx = 0; idx < RDMA_WRID_MAX; idx++) {
2407         if (rdma->wr_data[idx].control_mr) {
2408             rdma->total_registrations--;
2409             ibv_dereg_mr(rdma->wr_data[idx].control_mr);
2410         }
2411         rdma->wr_data[idx].control_mr = NULL;
2412     }
2413
2414     if (rdma->local_ram_blocks.block) {
2415         while (rdma->local_ram_blocks.nb_blocks) {
2416             rdma_delete_block(rdma, &rdma->local_ram_blocks.block[0]);
2417         }
2418     }
2419
2420     if (rdma->qp) {
2421         rdma_destroy_qp(rdma->cm_id);
2422         rdma->qp = NULL;
2423     }
2424     if (rdma->recv_cq) {
2425         ibv_destroy_cq(rdma->recv_cq);
2426         rdma->recv_cq = NULL;
2427     }
2428     if (rdma->send_cq) {
2429         ibv_destroy_cq(rdma->send_cq);
2430         rdma->send_cq = NULL;
2431     }
2432     if (rdma->recv_comp_channel) {
2433         ibv_destroy_comp_channel(rdma->recv_comp_channel);
2434         rdma->recv_comp_channel = NULL;
2435     }
2436     if (rdma->send_comp_channel) {
2437         ibv_destroy_comp_channel(rdma->send_comp_channel);
2438         rdma->send_comp_channel = NULL;
2439     }
2440     if (rdma->pd) {
2441         ibv_dealloc_pd(rdma->pd);
2442         rdma->pd = NULL;
2443     }
2444     if (rdma->cm_id) {
2445         rdma_destroy_id(rdma->cm_id);
2446         rdma->cm_id = NULL;
2447     }
2448
2449     /* the destination side, listen_id and channel is shared */
2450     if (rdma->listen_id) {
2451         if (!rdma->is_return_path) {
2452             rdma_destroy_id(rdma->listen_id);
2453         }
2454         rdma->listen_id = NULL;
2455
2456         if (rdma->channel) {
2457             if (!rdma->is_return_path) {
2458                 rdma_destroy_event_channel(rdma->channel);
2459             }
2460             rdma->channel = NULL;
2461         }
2462     }
2463
2464     if (rdma->channel) {
2465         rdma_destroy_event_channel(rdma->channel);
2466         rdma->channel = NULL;
2467     }
2468     g_free(rdma->host);
2469     g_free(rdma->host_port);
2470     rdma->host = NULL;
2471     rdma->host_port = NULL;
2472 }
2473
2474
2475 static int qemu_rdma_source_init(RDMAContext *rdma, bool pin_all, Error **errp)
2476 {
2477     int ret, idx;
2478
2479     /*
2480      * Will be validated against destination's actual capabilities
2481      * after the connect() completes.
2482      */
2483     rdma->pin_all = pin_all;
2484
2485     ret = qemu_rdma_resolve_host(rdma, errp);
2486     if (ret < 0) {
2487         goto err_rdma_source_init;
2488     }
2489
2490     ret = qemu_rdma_alloc_pd_cq(rdma);
2491     if (ret < 0) {
2492         ERROR(errp, "rdma migration: error allocating pd and cq! Your mlock()"
2493                     " limits may be too low. Please check $ ulimit -a # and "
2494                     "search for 'ulimit -l' in the output");
2495         goto err_rdma_source_init;
2496     }
2497
2498     ret = qemu_rdma_alloc_qp(rdma);
2499     if (ret < 0) {
2500         ERROR(errp, "rdma migration: error allocating qp!");
2501         goto err_rdma_source_init;
2502     }
2503
2504     qemu_rdma_init_ram_blocks(rdma);
2505
2506     /* Build the hash that maps from offset to RAMBlock */
2507     rdma->blockmap = g_hash_table_new(g_direct_hash, g_direct_equal);
2508     for (idx = 0; idx < rdma->local_ram_blocks.nb_blocks; idx++) {
2509         g_hash_table_insert(rdma->blockmap,
2510                 (void *)(uintptr_t)rdma->local_ram_blocks.block[idx].offset,
2511                 &rdma->local_ram_blocks.block[idx]);
2512     }
2513
2514     for (idx = 0; idx < RDMA_WRID_MAX; idx++) {
2515         ret = qemu_rdma_reg_control(rdma, idx);
2516         if (ret < 0) {
2517             ERROR(errp, "rdma migration: error registering %d control!",
2518                                                             idx);
2519             goto err_rdma_source_init;
2520         }
2521     }
2522
2523     return 0;
2524
2525 err_rdma_source_init:
2526     qemu_rdma_cleanup(rdma);
2527     return -1;
2528 }
2529
2530 static int qemu_get_cm_event_timeout(RDMAContext *rdma,
2531                                      struct rdma_cm_event **cm_event,
2532                                      long msec, Error **errp)
2533 {
2534     int ret;
2535     struct pollfd poll_fd = {
2536                                 .fd = rdma->channel->fd,
2537                                 .events = POLLIN,
2538                                 .revents = 0
2539                             };
2540
2541     do {
2542         ret = poll(&poll_fd, 1, msec);
2543     } while (ret < 0 && errno == EINTR);
2544
2545     if (ret == 0) {
2546         ERROR(errp, "poll cm event timeout");
2547         return -1;
2548     } else if (ret < 0) {
2549         ERROR(errp, "failed to poll cm event, errno=%i", errno);
2550         return -1;
2551     } else if (poll_fd.revents & POLLIN) {
2552         if (rdma_get_cm_event(rdma->channel, cm_event) < 0) {
2553             ERROR(errp, "failed to get cm event");
2554             return -1;
2555         }
2556         return 0;
2557     } else {
2558         ERROR(errp, "no POLLIN event, revent=%x", poll_fd.revents);
2559         return -1;
2560     }
2561 }
2562
2563 static int qemu_rdma_connect(RDMAContext *rdma, bool return_path,
2564                              Error **errp)
2565 {
2566     RDMACapabilities cap = {
2567                                 .version = RDMA_CONTROL_VERSION_CURRENT,
2568                                 .flags = 0,
2569                            };
2570     struct rdma_conn_param conn_param = { .initiator_depth = 2,
2571                                           .retry_count = 5,
2572                                           .private_data = &cap,
2573                                           .private_data_len = sizeof(cap),
2574                                         };
2575     struct rdma_cm_event *cm_event;
2576     int ret;
2577
2578     /*
2579      * Only negotiate the capability with destination if the user
2580      * on the source first requested the capability.
2581      */
2582     if (rdma->pin_all) {
2583         trace_qemu_rdma_connect_pin_all_requested();
2584         cap.flags |= RDMA_CAPABILITY_PIN_ALL;
2585     }
2586
2587     caps_to_network(&cap);
2588
2589     ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY);
2590     if (ret < 0) {
2591         ERROR(errp, "posting second control recv");
2592         goto err_rdma_source_connect;
2593     }
2594
2595     ret = rdma_connect(rdma->cm_id, &conn_param);
2596     if (ret < 0) {
2597         perror("rdma_connect");
2598         ERROR(errp, "connecting to destination!");
2599         goto err_rdma_source_connect;
2600     }
2601
2602     if (return_path) {
2603         ret = qemu_get_cm_event_timeout(rdma, &cm_event, 5000, errp);
2604     } else {
2605         ret = rdma_get_cm_event(rdma->channel, &cm_event);
2606         if (ret < 0) {
2607             ERROR(errp, "failed to get cm event");
2608         }
2609     }
2610     if (ret < 0) {
2611         /*
2612          * FIXME perror() is wrong, because
2613          * qemu_get_cm_event_timeout() can fail without setting errno.
2614          * Will go away later in this series.
2615          */
2616         perror("rdma_get_cm_event after rdma_connect");
2617         goto err_rdma_source_connect;
2618     }
2619
2620     if (cm_event->event != RDMA_CM_EVENT_ESTABLISHED) {
2621         error_report("rdma_get_cm_event != EVENT_ESTABLISHED after rdma_connect");
2622         ERROR(errp, "connecting to destination!");
2623         rdma_ack_cm_event(cm_event);
2624         goto err_rdma_source_connect;
2625     }
2626     rdma->connected = true;
2627
2628     memcpy(&cap, cm_event->param.conn.private_data, sizeof(cap));
2629     network_to_caps(&cap);
2630
2631     /*
2632      * Verify that the *requested* capabilities are supported by the destination
2633      * and disable them otherwise.
2634      */
2635     if (rdma->pin_all && !(cap.flags & RDMA_CAPABILITY_PIN_ALL)) {
2636         warn_report("RDMA: Server cannot support pinning all memory. "
2637                     "Will register memory dynamically.");
2638         rdma->pin_all = false;
2639     }
2640
2641     trace_qemu_rdma_connect_pin_all_outcome(rdma->pin_all);
2642
2643     rdma_ack_cm_event(cm_event);
2644
2645     rdma->control_ready_expected = 1;
2646     rdma->nb_sent = 0;
2647     return 0;
2648
2649 err_rdma_source_connect:
2650     qemu_rdma_cleanup(rdma);
2651     return -1;
2652 }
2653
2654 static int qemu_rdma_dest_init(RDMAContext *rdma, Error **errp)
2655 {
2656     int ret, idx;
2657     struct rdma_cm_id *listen_id;
2658     char ip[40] = "unknown";
2659     struct rdma_addrinfo *res, *e;
2660     char port_str[16];
2661     int reuse = 1;
2662
2663     for (idx = 0; idx < RDMA_WRID_MAX; idx++) {
2664         rdma->wr_data[idx].control_len = 0;
2665         rdma->wr_data[idx].control_curr = NULL;
2666     }
2667
2668     if (!rdma->host || !rdma->host[0]) {
2669         ERROR(errp, "RDMA host is not set!");
2670         rdma->errored = true;
2671         return -1;
2672     }
2673     /* create CM channel */
2674     rdma->channel = rdma_create_event_channel();
2675     if (!rdma->channel) {
2676         ERROR(errp, "could not create rdma event channel");
2677         rdma->errored = true;
2678         return -1;
2679     }
2680
2681     /* create CM id */
2682     ret = rdma_create_id(rdma->channel, &listen_id, NULL, RDMA_PS_TCP);
2683     if (ret < 0) {
2684         ERROR(errp, "could not create cm_id!");
2685         goto err_dest_init_create_listen_id;
2686     }
2687
2688     snprintf(port_str, 16, "%d", rdma->port);
2689     port_str[15] = '\0';
2690
2691     ret = rdma_getaddrinfo(rdma->host, port_str, NULL, &res);
2692     if (ret) {
2693         ERROR(errp, "could not rdma_getaddrinfo address %s", rdma->host);
2694         goto err_dest_init_bind_addr;
2695     }
2696
2697     ret = rdma_set_option(listen_id, RDMA_OPTION_ID, RDMA_OPTION_ID_REUSEADDR,
2698                           &reuse, sizeof reuse);
2699     if (ret < 0) {
2700         ERROR(errp, "Error: could not set REUSEADDR option");
2701         goto err_dest_init_bind_addr;
2702     }
2703     for (e = res; e != NULL; e = e->ai_next) {
2704         inet_ntop(e->ai_family,
2705             &((struct sockaddr_in *) e->ai_dst_addr)->sin_addr, ip, sizeof ip);
2706         trace_qemu_rdma_dest_init_trying(rdma->host, ip);
2707         ret = rdma_bind_addr(listen_id, e->ai_dst_addr);
2708         if (ret < 0) {
2709             continue;
2710         }
2711         if (e->ai_family == AF_INET6) {
2712             ret = qemu_rdma_broken_ipv6_kernel(listen_id->verbs, errp);
2713             if (ret < 0) {
2714                 continue;
2715             }
2716         }
2717         break;
2718     }
2719
2720     rdma_freeaddrinfo(res);
2721     if (!e) {
2722         ERROR(errp, "Error: could not rdma_bind_addr!");
2723         goto err_dest_init_bind_addr;
2724     }
2725
2726     rdma->listen_id = listen_id;
2727     qemu_rdma_dump_gid("dest_init", listen_id);
2728     return 0;
2729
2730 err_dest_init_bind_addr:
2731     rdma_destroy_id(listen_id);
2732 err_dest_init_create_listen_id:
2733     rdma_destroy_event_channel(rdma->channel);
2734     rdma->channel = NULL;
2735     rdma->errored = true;
2736     return -1;
2737
2738 }
2739
2740 static void qemu_rdma_return_path_dest_init(RDMAContext *rdma_return_path,
2741                                             RDMAContext *rdma)
2742 {
2743     int idx;
2744
2745     for (idx = 0; idx < RDMA_WRID_MAX; idx++) {
2746         rdma_return_path->wr_data[idx].control_len = 0;
2747         rdma_return_path->wr_data[idx].control_curr = NULL;
2748     }
2749
2750     /*the CM channel and CM id is shared*/
2751     rdma_return_path->channel = rdma->channel;
2752     rdma_return_path->listen_id = rdma->listen_id;
2753
2754     rdma->return_path = rdma_return_path;
2755     rdma_return_path->return_path = rdma;
2756     rdma_return_path->is_return_path = true;
2757 }
2758
2759 static RDMAContext *qemu_rdma_data_init(const char *host_port, Error **errp)
2760 {
2761     RDMAContext *rdma = NULL;
2762     InetSocketAddress *addr;
2763
2764     rdma = g_new0(RDMAContext, 1);
2765     rdma->current_index = -1;
2766     rdma->current_chunk = -1;
2767
2768     addr = g_new(InetSocketAddress, 1);
2769     if (!inet_parse(addr, host_port, NULL)) {
2770         rdma->port = atoi(addr->port);
2771         rdma->host = g_strdup(addr->host);
2772         rdma->host_port = g_strdup(host_port);
2773     } else {
2774         ERROR(errp, "bad RDMA migration address '%s'", host_port);
2775         g_free(rdma);
2776         rdma = NULL;
2777     }
2778
2779     qapi_free_InetSocketAddress(addr);
2780     return rdma;
2781 }
2782
2783 /*
2784  * QEMUFile interface to the control channel.
2785  * SEND messages for control only.
2786  * VM's ram is handled with regular RDMA messages.
2787  */
2788 static ssize_t qio_channel_rdma_writev(QIOChannel *ioc,
2789                                        const struct iovec *iov,
2790                                        size_t niov,
2791                                        int *fds,
2792                                        size_t nfds,
2793                                        int flags,
2794                                        Error **errp)
2795 {
2796     QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
2797     RDMAContext *rdma;
2798     int ret;
2799     ssize_t done = 0;
2800     size_t i, len;
2801
2802     RCU_READ_LOCK_GUARD();
2803     rdma = qatomic_rcu_read(&rioc->rdmaout);
2804
2805     if (!rdma) {
2806         error_setg(errp, "RDMA control channel output is not set");
2807         return -1;
2808     }
2809
2810     if (rdma->errored) {
2811         error_setg(errp,
2812                    "RDMA is in an error state waiting migration to abort!");
2813         return -1;
2814     }
2815
2816     /*
2817      * Push out any writes that
2818      * we're queued up for VM's ram.
2819      */
2820     ret = qemu_rdma_write_flush(rdma);
2821     if (ret < 0) {
2822         rdma->errored = true;
2823         error_setg(errp, "qemu_rdma_write_flush failed");
2824         return -1;
2825     }
2826
2827     for (i = 0; i < niov; i++) {
2828         size_t remaining = iov[i].iov_len;
2829         uint8_t * data = (void *)iov[i].iov_base;
2830         while (remaining) {
2831             RDMAControlHeader head = {};
2832
2833             len = MIN(remaining, RDMA_SEND_INCREMENT);
2834             remaining -= len;
2835
2836             head.len = len;
2837             head.type = RDMA_CONTROL_QEMU_FILE;
2838
2839             ret = qemu_rdma_exchange_send(rdma, &head, data, NULL, NULL, NULL);
2840
2841             if (ret < 0) {
2842                 rdma->errored = true;
2843                 error_setg(errp, "qemu_rdma_exchange_send failed");
2844                 return -1;
2845             }
2846
2847             data += len;
2848             done += len;
2849         }
2850     }
2851
2852     return done;
2853 }
2854
2855 static size_t qemu_rdma_fill(RDMAContext *rdma, uint8_t *buf,
2856                              size_t size, int idx)
2857 {
2858     size_t len = 0;
2859
2860     if (rdma->wr_data[idx].control_len) {
2861         trace_qemu_rdma_fill(rdma->wr_data[idx].control_len, size);
2862
2863         len = MIN(size, rdma->wr_data[idx].control_len);
2864         memcpy(buf, rdma->wr_data[idx].control_curr, len);
2865         rdma->wr_data[idx].control_curr += len;
2866         rdma->wr_data[idx].control_len -= len;
2867     }
2868
2869     return len;
2870 }
2871
2872 /*
2873  * QEMUFile interface to the control channel.
2874  * RDMA links don't use bytestreams, so we have to
2875  * return bytes to QEMUFile opportunistically.
2876  */
2877 static ssize_t qio_channel_rdma_readv(QIOChannel *ioc,
2878                                       const struct iovec *iov,
2879                                       size_t niov,
2880                                       int **fds,
2881                                       size_t *nfds,
2882                                       int flags,
2883                                       Error **errp)
2884 {
2885     QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
2886     RDMAContext *rdma;
2887     RDMAControlHeader head;
2888     int ret;
2889     ssize_t done = 0;
2890     size_t i, len;
2891
2892     RCU_READ_LOCK_GUARD();
2893     rdma = qatomic_rcu_read(&rioc->rdmain);
2894
2895     if (!rdma) {
2896         error_setg(errp, "RDMA control channel input is not set");
2897         return -1;
2898     }
2899
2900     if (rdma->errored) {
2901         error_setg(errp,
2902                    "RDMA is in an error state waiting migration to abort!");
2903         return -1;
2904     }
2905
2906     for (i = 0; i < niov; i++) {
2907         size_t want = iov[i].iov_len;
2908         uint8_t *data = (void *)iov[i].iov_base;
2909
2910         /*
2911          * First, we hold on to the last SEND message we
2912          * were given and dish out the bytes until we run
2913          * out of bytes.
2914          */
2915         len = qemu_rdma_fill(rdma, data, want, 0);
2916         done += len;
2917         want -= len;
2918         /* Got what we needed, so go to next iovec */
2919         if (want == 0) {
2920             continue;
2921         }
2922
2923         /* If we got any data so far, then don't wait
2924          * for more, just return what we have */
2925         if (done > 0) {
2926             break;
2927         }
2928
2929
2930         /* We've got nothing at all, so lets wait for
2931          * more to arrive
2932          */
2933         ret = qemu_rdma_exchange_recv(rdma, &head, RDMA_CONTROL_QEMU_FILE);
2934
2935         if (ret < 0) {
2936             rdma->errored = true;
2937             error_setg(errp, "qemu_rdma_exchange_recv failed");
2938             return -1;
2939         }
2940
2941         /*
2942          * SEND was received with new bytes, now try again.
2943          */
2944         len = qemu_rdma_fill(rdma, data, want, 0);
2945         done += len;
2946         want -= len;
2947
2948         /* Still didn't get enough, so lets just return */
2949         if (want) {
2950             if (done == 0) {
2951                 return QIO_CHANNEL_ERR_BLOCK;
2952             } else {
2953                 break;
2954             }
2955         }
2956     }
2957     return done;
2958 }
2959
2960 /*
2961  * Block until all the outstanding chunks have been delivered by the hardware.
2962  */
2963 static int qemu_rdma_drain_cq(RDMAContext *rdma)
2964 {
2965     int ret;
2966
2967     if (qemu_rdma_write_flush(rdma) < 0) {
2968         return -1;
2969     }
2970
2971     while (rdma->nb_sent) {
2972         ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RDMA_WRITE, NULL);
2973         if (ret < 0) {
2974             error_report("rdma migration: complete polling error!");
2975             return -1;
2976         }
2977     }
2978
2979     qemu_rdma_unregister_waiting(rdma);
2980
2981     return 0;
2982 }
2983
2984
2985 static int qio_channel_rdma_set_blocking(QIOChannel *ioc,
2986                                          bool blocking,
2987                                          Error **errp)
2988 {
2989     QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
2990     /* XXX we should make readv/writev actually honour this :-) */
2991     rioc->blocking = blocking;
2992     return 0;
2993 }
2994
2995
2996 typedef struct QIOChannelRDMASource QIOChannelRDMASource;
2997 struct QIOChannelRDMASource {
2998     GSource parent;
2999     QIOChannelRDMA *rioc;
3000     GIOCondition condition;
3001 };
3002
3003 static gboolean
3004 qio_channel_rdma_source_prepare(GSource *source,
3005                                 gint *timeout)
3006 {
3007     QIOChannelRDMASource *rsource = (QIOChannelRDMASource *)source;
3008     RDMAContext *rdma;
3009     GIOCondition cond = 0;
3010     *timeout = -1;
3011
3012     RCU_READ_LOCK_GUARD();
3013     if (rsource->condition == G_IO_IN) {
3014         rdma = qatomic_rcu_read(&rsource->rioc->rdmain);
3015     } else {
3016         rdma = qatomic_rcu_read(&rsource->rioc->rdmaout);
3017     }
3018
3019     if (!rdma) {
3020         error_report("RDMAContext is NULL when prepare Gsource");
3021         return FALSE;
3022     }
3023
3024     if (rdma->wr_data[0].control_len) {
3025         cond |= G_IO_IN;
3026     }
3027     cond |= G_IO_OUT;
3028
3029     return cond & rsource->condition;
3030 }
3031
3032 static gboolean
3033 qio_channel_rdma_source_check(GSource *source)
3034 {
3035     QIOChannelRDMASource *rsource = (QIOChannelRDMASource *)source;
3036     RDMAContext *rdma;
3037     GIOCondition cond = 0;
3038
3039     RCU_READ_LOCK_GUARD();
3040     if (rsource->condition == G_IO_IN) {
3041         rdma = qatomic_rcu_read(&rsource->rioc->rdmain);
3042     } else {
3043         rdma = qatomic_rcu_read(&rsource->rioc->rdmaout);
3044     }
3045
3046     if (!rdma) {
3047         error_report("RDMAContext is NULL when check Gsource");
3048         return FALSE;
3049     }
3050
3051     if (rdma->wr_data[0].control_len) {
3052         cond |= G_IO_IN;
3053     }
3054     cond |= G_IO_OUT;
3055
3056     return cond & rsource->condition;
3057 }
3058
3059 static gboolean
3060 qio_channel_rdma_source_dispatch(GSource *source,
3061                                  GSourceFunc callback,
3062                                  gpointer user_data)
3063 {
3064     QIOChannelFunc func = (QIOChannelFunc)callback;
3065     QIOChannelRDMASource *rsource = (QIOChannelRDMASource *)source;
3066     RDMAContext *rdma;
3067     GIOCondition cond = 0;
3068
3069     RCU_READ_LOCK_GUARD();
3070     if (rsource->condition == G_IO_IN) {
3071         rdma = qatomic_rcu_read(&rsource->rioc->rdmain);
3072     } else {
3073         rdma = qatomic_rcu_read(&rsource->rioc->rdmaout);
3074     }
3075
3076     if (!rdma) {
3077         error_report("RDMAContext is NULL when dispatch Gsource");
3078         return FALSE;
3079     }
3080
3081     if (rdma->wr_data[0].control_len) {
3082         cond |= G_IO_IN;
3083     }
3084     cond |= G_IO_OUT;
3085
3086     return (*func)(QIO_CHANNEL(rsource->rioc),
3087                    (cond & rsource->condition),
3088                    user_data);
3089 }
3090
3091 static void
3092 qio_channel_rdma_source_finalize(GSource *source)
3093 {
3094     QIOChannelRDMASource *ssource = (QIOChannelRDMASource *)source;
3095
3096     object_unref(OBJECT(ssource->rioc));
3097 }
3098
3099 static GSourceFuncs qio_channel_rdma_source_funcs = {
3100     qio_channel_rdma_source_prepare,
3101     qio_channel_rdma_source_check,
3102     qio_channel_rdma_source_dispatch,
3103     qio_channel_rdma_source_finalize
3104 };
3105
3106 static GSource *qio_channel_rdma_create_watch(QIOChannel *ioc,
3107                                               GIOCondition condition)
3108 {
3109     QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
3110     QIOChannelRDMASource *ssource;
3111     GSource *source;
3112
3113     source = g_source_new(&qio_channel_rdma_source_funcs,
3114                           sizeof(QIOChannelRDMASource));
3115     ssource = (QIOChannelRDMASource *)source;
3116
3117     ssource->rioc = rioc;
3118     object_ref(OBJECT(rioc));
3119
3120     ssource->condition = condition;
3121
3122     return source;
3123 }
3124
3125 static void qio_channel_rdma_set_aio_fd_handler(QIOChannel *ioc,
3126                                                 AioContext *read_ctx,
3127                                                 IOHandler *io_read,
3128                                                 AioContext *write_ctx,
3129                                                 IOHandler *io_write,
3130                                                 void *opaque)
3131 {
3132     QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
3133     if (io_read) {
3134         aio_set_fd_handler(read_ctx, rioc->rdmain->recv_comp_channel->fd,
3135                            io_read, io_write, NULL, NULL, opaque);
3136         aio_set_fd_handler(read_ctx, rioc->rdmain->send_comp_channel->fd,
3137                            io_read, io_write, NULL, NULL, opaque);
3138     } else {
3139         aio_set_fd_handler(write_ctx, rioc->rdmaout->recv_comp_channel->fd,
3140                            io_read, io_write, NULL, NULL, opaque);
3141         aio_set_fd_handler(write_ctx, rioc->rdmaout->send_comp_channel->fd,
3142                            io_read, io_write, NULL, NULL, opaque);
3143     }
3144 }
3145
3146 struct rdma_close_rcu {
3147     struct rcu_head rcu;
3148     RDMAContext *rdmain;
3149     RDMAContext *rdmaout;
3150 };
3151
3152 /* callback from qio_channel_rdma_close via call_rcu */
3153 static void qio_channel_rdma_close_rcu(struct rdma_close_rcu *rcu)
3154 {
3155     if (rcu->rdmain) {
3156         qemu_rdma_cleanup(rcu->rdmain);
3157     }
3158
3159     if (rcu->rdmaout) {
3160         qemu_rdma_cleanup(rcu->rdmaout);
3161     }
3162
3163     g_free(rcu->rdmain);
3164     g_free(rcu->rdmaout);
3165     g_free(rcu);
3166 }
3167
3168 static int qio_channel_rdma_close(QIOChannel *ioc,
3169                                   Error **errp)
3170 {
3171     QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
3172     RDMAContext *rdmain, *rdmaout;
3173     struct rdma_close_rcu *rcu = g_new(struct rdma_close_rcu, 1);
3174
3175     trace_qemu_rdma_close();
3176
3177     rdmain = rioc->rdmain;
3178     if (rdmain) {
3179         qatomic_rcu_set(&rioc->rdmain, NULL);
3180     }
3181
3182     rdmaout = rioc->rdmaout;
3183     if (rdmaout) {
3184         qatomic_rcu_set(&rioc->rdmaout, NULL);
3185     }
3186
3187     rcu->rdmain = rdmain;
3188     rcu->rdmaout = rdmaout;
3189     call_rcu(rcu, qio_channel_rdma_close_rcu, rcu);
3190
3191     return 0;
3192 }
3193
3194 static int
3195 qio_channel_rdma_shutdown(QIOChannel *ioc,
3196                             QIOChannelShutdown how,
3197                             Error **errp)
3198 {
3199     QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
3200     RDMAContext *rdmain, *rdmaout;
3201
3202     RCU_READ_LOCK_GUARD();
3203
3204     rdmain = qatomic_rcu_read(&rioc->rdmain);
3205     rdmaout = qatomic_rcu_read(&rioc->rdmain);
3206
3207     switch (how) {
3208     case QIO_CHANNEL_SHUTDOWN_READ:
3209         if (rdmain) {
3210             rdmain->errored = true;
3211         }
3212         break;
3213     case QIO_CHANNEL_SHUTDOWN_WRITE:
3214         if (rdmaout) {
3215             rdmaout->errored = true;
3216         }
3217         break;
3218     case QIO_CHANNEL_SHUTDOWN_BOTH:
3219     default:
3220         if (rdmain) {
3221             rdmain->errored = true;
3222         }
3223         if (rdmaout) {
3224             rdmaout->errored = true;
3225         }
3226         break;
3227     }
3228
3229     return 0;
3230 }
3231
3232 /*
3233  * Parameters:
3234  *    @offset == 0 :
3235  *        This means that 'block_offset' is a full virtual address that does not
3236  *        belong to a RAMBlock of the virtual machine and instead
3237  *        represents a private malloc'd memory area that the caller wishes to
3238  *        transfer.
3239  *
3240  *    @offset != 0 :
3241  *        Offset is an offset to be added to block_offset and used
3242  *        to also lookup the corresponding RAMBlock.
3243  *
3244  *    @size : Number of bytes to transfer
3245  *
3246  *    @pages_sent : User-specificed pointer to indicate how many pages were
3247  *                  sent. Usually, this will not be more than a few bytes of
3248  *                  the protocol because most transfers are sent asynchronously.
3249  */
3250 static int qemu_rdma_save_page(QEMUFile *f, ram_addr_t block_offset,
3251                                ram_addr_t offset, size_t size)
3252 {
3253     QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(qemu_file_get_ioc(f));
3254     RDMAContext *rdma;
3255     int ret;
3256
3257     if (migration_in_postcopy()) {
3258         return RAM_SAVE_CONTROL_NOT_SUPP;
3259     }
3260
3261     RCU_READ_LOCK_GUARD();
3262     rdma = qatomic_rcu_read(&rioc->rdmaout);
3263
3264     if (!rdma) {
3265         return -1;
3266     }
3267
3268     if (rdma_errored(rdma)) {
3269         return -1;
3270     }
3271
3272     qemu_fflush(f);
3273
3274     /*
3275      * Add this page to the current 'chunk'. If the chunk
3276      * is full, or the page doesn't belong to the current chunk,
3277      * an actual RDMA write will occur and a new chunk will be formed.
3278      */
3279     ret = qemu_rdma_write(rdma, block_offset, offset, size);
3280     if (ret < 0) {
3281         error_report("rdma migration: write error");
3282         goto err;
3283     }
3284
3285     /*
3286      * Drain the Completion Queue if possible, but do not block,
3287      * just poll.
3288      *
3289      * If nothing to poll, the end of the iteration will do this
3290      * again to make sure we don't overflow the request queue.
3291      */
3292     while (1) {
3293         uint64_t wr_id, wr_id_in;
3294         ret = qemu_rdma_poll(rdma, rdma->recv_cq, &wr_id_in, NULL);
3295
3296         if (ret < 0) {
3297             error_report("rdma migration: polling error");
3298             goto err;
3299         }
3300
3301         wr_id = wr_id_in & RDMA_WRID_TYPE_MASK;
3302
3303         if (wr_id == RDMA_WRID_NONE) {
3304             break;
3305         }
3306     }
3307
3308     while (1) {
3309         uint64_t wr_id, wr_id_in;
3310         ret = qemu_rdma_poll(rdma, rdma->send_cq, &wr_id_in, NULL);
3311
3312         if (ret < 0) {
3313             error_report("rdma migration: polling error");
3314             goto err;
3315         }
3316
3317         wr_id = wr_id_in & RDMA_WRID_TYPE_MASK;
3318
3319         if (wr_id == RDMA_WRID_NONE) {
3320             break;
3321         }
3322     }
3323
3324     return RAM_SAVE_CONTROL_DELAYED;
3325
3326 err:
3327     rdma->errored = true;
3328     return -1;
3329 }
3330
3331 static void rdma_accept_incoming_migration(void *opaque);
3332
3333 static void rdma_cm_poll_handler(void *opaque)
3334 {
3335     RDMAContext *rdma = opaque;
3336     int ret;
3337     struct rdma_cm_event *cm_event;
3338     MigrationIncomingState *mis = migration_incoming_get_current();
3339
3340     ret = rdma_get_cm_event(rdma->channel, &cm_event);
3341     if (ret < 0) {
3342         error_report("get_cm_event failed %d", errno);
3343         return;
3344     }
3345
3346     if (cm_event->event == RDMA_CM_EVENT_DISCONNECTED ||
3347         cm_event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) {
3348         if (!rdma->errored &&
3349             migration_incoming_get_current()->state !=
3350               MIGRATION_STATUS_COMPLETED) {
3351             error_report("receive cm event, cm event is %d", cm_event->event);
3352             rdma->errored = true;
3353             if (rdma->return_path) {
3354                 rdma->return_path->errored = true;
3355             }
3356         }
3357         rdma_ack_cm_event(cm_event);
3358         if (mis->loadvm_co) {
3359             qemu_coroutine_enter(mis->loadvm_co);
3360         }
3361         return;
3362     }
3363     rdma_ack_cm_event(cm_event);
3364 }
3365
3366 static int qemu_rdma_accept(RDMAContext *rdma)
3367 {
3368     RDMACapabilities cap;
3369     struct rdma_conn_param conn_param = {
3370                                             .responder_resources = 2,
3371                                             .private_data = &cap,
3372                                             .private_data_len = sizeof(cap),
3373                                          };
3374     RDMAContext *rdma_return_path = NULL;
3375     struct rdma_cm_event *cm_event;
3376     struct ibv_context *verbs;
3377     int ret;
3378     int idx;
3379
3380     ret = rdma_get_cm_event(rdma->channel, &cm_event);
3381     if (ret < 0) {
3382         goto err_rdma_dest_wait;
3383     }
3384
3385     if (cm_event->event != RDMA_CM_EVENT_CONNECT_REQUEST) {
3386         rdma_ack_cm_event(cm_event);
3387         goto err_rdma_dest_wait;
3388     }
3389
3390     /*
3391      * initialize the RDMAContext for return path for postcopy after first
3392      * connection request reached.
3393      */
3394     if ((migrate_postcopy() || migrate_return_path())
3395         && !rdma->is_return_path) {
3396         rdma_return_path = qemu_rdma_data_init(rdma->host_port, NULL);
3397         if (rdma_return_path == NULL) {
3398             rdma_ack_cm_event(cm_event);
3399             goto err_rdma_dest_wait;
3400         }
3401
3402         qemu_rdma_return_path_dest_init(rdma_return_path, rdma);
3403     }
3404
3405     memcpy(&cap, cm_event->param.conn.private_data, sizeof(cap));
3406
3407     network_to_caps(&cap);
3408
3409     if (cap.version < 1 || cap.version > RDMA_CONTROL_VERSION_CURRENT) {
3410         error_report("Unknown source RDMA version: %d, bailing...",
3411                      cap.version);
3412         rdma_ack_cm_event(cm_event);
3413         goto err_rdma_dest_wait;
3414     }
3415
3416     /*
3417      * Respond with only the capabilities this version of QEMU knows about.
3418      */
3419     cap.flags &= known_capabilities;
3420
3421     /*
3422      * Enable the ones that we do know about.
3423      * Add other checks here as new ones are introduced.
3424      */
3425     if (cap.flags & RDMA_CAPABILITY_PIN_ALL) {
3426         rdma->pin_all = true;
3427     }
3428
3429     rdma->cm_id = cm_event->id;
3430     verbs = cm_event->id->verbs;
3431
3432     rdma_ack_cm_event(cm_event);
3433
3434     trace_qemu_rdma_accept_pin_state(rdma->pin_all);
3435
3436     caps_to_network(&cap);
3437
3438     trace_qemu_rdma_accept_pin_verbsc(verbs);
3439
3440     if (!rdma->verbs) {
3441         rdma->verbs = verbs;
3442     } else if (rdma->verbs != verbs) {
3443         error_report("ibv context not matching %p, %p!", rdma->verbs,
3444                      verbs);
3445         goto err_rdma_dest_wait;
3446     }
3447
3448     qemu_rdma_dump_id("dest_init", verbs);
3449
3450     ret = qemu_rdma_alloc_pd_cq(rdma);
3451     if (ret < 0) {
3452         error_report("rdma migration: error allocating pd and cq!");
3453         goto err_rdma_dest_wait;
3454     }
3455
3456     ret = qemu_rdma_alloc_qp(rdma);
3457     if (ret < 0) {
3458         error_report("rdma migration: error allocating qp!");
3459         goto err_rdma_dest_wait;
3460     }
3461
3462     qemu_rdma_init_ram_blocks(rdma);
3463
3464     for (idx = 0; idx < RDMA_WRID_MAX; idx++) {
3465         ret = qemu_rdma_reg_control(rdma, idx);
3466         if (ret < 0) {
3467             error_report("rdma: error registering %d control", idx);
3468             goto err_rdma_dest_wait;
3469         }
3470     }
3471
3472     /* Accept the second connection request for return path */
3473     if ((migrate_postcopy() || migrate_return_path())
3474         && !rdma->is_return_path) {
3475         qemu_set_fd_handler(rdma->channel->fd, rdma_accept_incoming_migration,
3476                             NULL,
3477                             (void *)(intptr_t)rdma->return_path);
3478     } else {
3479         qemu_set_fd_handler(rdma->channel->fd, rdma_cm_poll_handler,
3480                             NULL, rdma);
3481     }
3482
3483     ret = rdma_accept(rdma->cm_id, &conn_param);
3484     if (ret < 0) {
3485         error_report("rdma_accept failed");
3486         goto err_rdma_dest_wait;
3487     }
3488
3489     ret = rdma_get_cm_event(rdma->channel, &cm_event);
3490     if (ret < 0) {
3491         error_report("rdma_accept get_cm_event failed");
3492         goto err_rdma_dest_wait;
3493     }
3494
3495     if (cm_event->event != RDMA_CM_EVENT_ESTABLISHED) {
3496         error_report("rdma_accept not event established");
3497         rdma_ack_cm_event(cm_event);
3498         goto err_rdma_dest_wait;
3499     }
3500
3501     rdma_ack_cm_event(cm_event);
3502     rdma->connected = true;
3503
3504     ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY);
3505     if (ret < 0) {
3506         error_report("rdma migration: error posting second control recv");
3507         goto err_rdma_dest_wait;
3508     }
3509
3510     qemu_rdma_dump_gid("dest_connect", rdma->cm_id);
3511
3512     return 0;
3513
3514 err_rdma_dest_wait:
3515     rdma->errored = true;
3516     qemu_rdma_cleanup(rdma);
3517     g_free(rdma_return_path);
3518     return -1;
3519 }
3520
3521 static int dest_ram_sort_func(const void *a, const void *b)
3522 {
3523     unsigned int a_index = ((const RDMALocalBlock *)a)->src_index;
3524     unsigned int b_index = ((const RDMALocalBlock *)b)->src_index;
3525
3526     return (a_index < b_index) ? -1 : (a_index != b_index);
3527 }
3528
3529 /*
3530  * During each iteration of the migration, we listen for instructions
3531  * by the source VM to perform dynamic page registrations before they
3532  * can perform RDMA operations.
3533  *
3534  * We respond with the 'rkey'.
3535  *
3536  * Keep doing this until the source tells us to stop.
3537  */
3538 static int qemu_rdma_registration_handle(QEMUFile *f)
3539 {
3540     RDMAControlHeader reg_resp = { .len = sizeof(RDMARegisterResult),
3541                                .type = RDMA_CONTROL_REGISTER_RESULT,
3542                                .repeat = 0,
3543                              };
3544     RDMAControlHeader unreg_resp = { .len = 0,
3545                                .type = RDMA_CONTROL_UNREGISTER_FINISHED,
3546                                .repeat = 0,
3547                              };
3548     RDMAControlHeader blocks = { .type = RDMA_CONTROL_RAM_BLOCKS_RESULT,
3549                                  .repeat = 1 };
3550     QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(qemu_file_get_ioc(f));
3551     RDMAContext *rdma;
3552     RDMALocalBlocks *local;
3553     RDMAControlHeader head;
3554     RDMARegister *reg, *registers;
3555     RDMACompress *comp;
3556     RDMARegisterResult *reg_result;
3557     static RDMARegisterResult results[RDMA_CONTROL_MAX_COMMANDS_PER_MESSAGE];
3558     RDMALocalBlock *block;
3559     void *host_addr;
3560     int ret;
3561     int idx = 0;
3562     int count = 0;
3563     int i = 0;
3564
3565     RCU_READ_LOCK_GUARD();
3566     rdma = qatomic_rcu_read(&rioc->rdmain);
3567
3568     if (!rdma) {
3569         return -1;
3570     }
3571
3572     if (rdma_errored(rdma)) {
3573         return -1;
3574     }
3575
3576     local = &rdma->local_ram_blocks;
3577     do {
3578         trace_qemu_rdma_registration_handle_wait();
3579
3580         ret = qemu_rdma_exchange_recv(rdma, &head, RDMA_CONTROL_NONE);
3581
3582         if (ret < 0) {
3583             break;
3584         }
3585
3586         if (head.repeat > RDMA_CONTROL_MAX_COMMANDS_PER_MESSAGE) {
3587             error_report("rdma: Too many requests in this message (%d)."
3588                             "Bailing.", head.repeat);
3589             break;
3590         }
3591
3592         switch (head.type) {
3593         case RDMA_CONTROL_COMPRESS:
3594             comp = (RDMACompress *) rdma->wr_data[idx].control_curr;
3595             network_to_compress(comp);
3596
3597             trace_qemu_rdma_registration_handle_compress(comp->length,
3598                                                          comp->block_idx,
3599                                                          comp->offset);
3600             if (comp->block_idx >= rdma->local_ram_blocks.nb_blocks) {
3601                 error_report("rdma: 'compress' bad block index %u (vs %d)",
3602                              (unsigned int)comp->block_idx,
3603                              rdma->local_ram_blocks.nb_blocks);
3604                 goto err;
3605             }
3606             block = &(rdma->local_ram_blocks.block[comp->block_idx]);
3607
3608             host_addr = block->local_host_addr +
3609                             (comp->offset - block->offset);
3610
3611             ram_handle_compressed(host_addr, comp->value, comp->length);
3612             break;
3613
3614         case RDMA_CONTROL_REGISTER_FINISHED:
3615             trace_qemu_rdma_registration_handle_finished();
3616             return 0;
3617
3618         case RDMA_CONTROL_RAM_BLOCKS_REQUEST:
3619             trace_qemu_rdma_registration_handle_ram_blocks();
3620
3621             /* Sort our local RAM Block list so it's the same as the source,
3622              * we can do this since we've filled in a src_index in the list
3623              * as we received the RAMBlock list earlier.
3624              */
3625             qsort(rdma->local_ram_blocks.block,
3626                   rdma->local_ram_blocks.nb_blocks,
3627                   sizeof(RDMALocalBlock), dest_ram_sort_func);
3628             for (i = 0; i < local->nb_blocks; i++) {
3629                 local->block[i].index = i;
3630             }
3631
3632             if (rdma->pin_all) {
3633                 ret = qemu_rdma_reg_whole_ram_blocks(rdma);
3634                 if (ret < 0) {
3635                     error_report("rdma migration: error dest "
3636                                     "registering ram blocks");
3637                     goto err;
3638                 }
3639             }
3640
3641             /*
3642              * Dest uses this to prepare to transmit the RAMBlock descriptions
3643              * to the source VM after connection setup.
3644              * Both sides use the "remote" structure to communicate and update
3645              * their "local" descriptions with what was sent.
3646              */
3647             for (i = 0; i < local->nb_blocks; i++) {
3648                 rdma->dest_blocks[i].remote_host_addr =
3649                     (uintptr_t)(local->block[i].local_host_addr);
3650
3651                 if (rdma->pin_all) {
3652                     rdma->dest_blocks[i].remote_rkey = local->block[i].mr->rkey;
3653                 }
3654
3655                 rdma->dest_blocks[i].offset = local->block[i].offset;
3656                 rdma->dest_blocks[i].length = local->block[i].length;
3657
3658                 dest_block_to_network(&rdma->dest_blocks[i]);
3659                 trace_qemu_rdma_registration_handle_ram_blocks_loop(
3660                     local->block[i].block_name,
3661                     local->block[i].offset,
3662                     local->block[i].length,
3663                     local->block[i].local_host_addr,
3664                     local->block[i].src_index);
3665             }
3666
3667             blocks.len = rdma->local_ram_blocks.nb_blocks
3668                                                 * sizeof(RDMADestBlock);
3669
3670
3671             ret = qemu_rdma_post_send_control(rdma,
3672                                         (uint8_t *) rdma->dest_blocks, &blocks);
3673
3674             if (ret < 0) {
3675                 error_report("rdma migration: error sending remote info");
3676                 goto err;
3677             }
3678
3679             break;
3680         case RDMA_CONTROL_REGISTER_REQUEST:
3681             trace_qemu_rdma_registration_handle_register(head.repeat);
3682
3683             reg_resp.repeat = head.repeat;
3684             registers = (RDMARegister *) rdma->wr_data[idx].control_curr;
3685
3686             for (count = 0; count < head.repeat; count++) {
3687                 uint64_t chunk;
3688                 uint8_t *chunk_start, *chunk_end;
3689
3690                 reg = &registers[count];
3691                 network_to_register(reg);
3692
3693                 reg_result = &results[count];
3694
3695                 trace_qemu_rdma_registration_handle_register_loop(count,
3696                          reg->current_index, reg->key.current_addr, reg->chunks);
3697
3698                 if (reg->current_index >= rdma->local_ram_blocks.nb_blocks) {
3699                     error_report("rdma: 'register' bad block index %u (vs %d)",
3700                                  (unsigned int)reg->current_index,
3701                                  rdma->local_ram_blocks.nb_blocks);
3702                     goto err;
3703                 }
3704                 block = &(rdma->local_ram_blocks.block[reg->current_index]);
3705                 if (block->is_ram_block) {
3706                     if (block->offset > reg->key.current_addr) {
3707                         error_report("rdma: bad register address for block %s"
3708                             " offset: %" PRIx64 " current_addr: %" PRIx64,
3709                             block->block_name, block->offset,
3710                             reg->key.current_addr);
3711                         goto err;
3712                     }
3713                     host_addr = (block->local_host_addr +
3714                                 (reg->key.current_addr - block->offset));
3715                     chunk = ram_chunk_index(block->local_host_addr,
3716                                             (uint8_t *) host_addr);
3717                 } else {
3718                     chunk = reg->key.chunk;
3719                     host_addr = block->local_host_addr +
3720                         (reg->key.chunk * (1UL << RDMA_REG_CHUNK_SHIFT));
3721                     /* Check for particularly bad chunk value */
3722                     if (host_addr < (void *)block->local_host_addr) {
3723                         error_report("rdma: bad chunk for block %s"
3724                             " chunk: %" PRIx64,
3725                             block->block_name, reg->key.chunk);
3726                         goto err;
3727                     }
3728                 }
3729                 chunk_start = ram_chunk_start(block, chunk);
3730                 chunk_end = ram_chunk_end(block, chunk + reg->chunks);
3731                 /* avoid "-Waddress-of-packed-member" warning */
3732                 uint32_t tmp_rkey = 0;
3733                 if (qemu_rdma_register_and_get_keys(rdma, block,
3734                             (uintptr_t)host_addr, NULL, &tmp_rkey,
3735                             chunk, chunk_start, chunk_end)) {
3736                     error_report("cannot get rkey");
3737                     goto err;
3738                 }
3739                 reg_result->rkey = tmp_rkey;
3740
3741                 reg_result->host_addr = (uintptr_t)block->local_host_addr;
3742
3743                 trace_qemu_rdma_registration_handle_register_rkey(
3744                                                            reg_result->rkey);
3745
3746                 result_to_network(reg_result);
3747             }
3748
3749             ret = qemu_rdma_post_send_control(rdma,
3750                             (uint8_t *) results, &reg_resp);
3751
3752             if (ret < 0) {
3753                 error_report("Failed to send control buffer");
3754                 goto err;
3755             }
3756             break;
3757         case RDMA_CONTROL_UNREGISTER_REQUEST:
3758             trace_qemu_rdma_registration_handle_unregister(head.repeat);
3759             unreg_resp.repeat = head.repeat;
3760             registers = (RDMARegister *) rdma->wr_data[idx].control_curr;
3761
3762             for (count = 0; count < head.repeat; count++) {
3763                 reg = &registers[count];
3764                 network_to_register(reg);
3765
3766                 trace_qemu_rdma_registration_handle_unregister_loop(count,
3767                            reg->current_index, reg->key.chunk);
3768
3769                 block = &(rdma->local_ram_blocks.block[reg->current_index]);
3770
3771                 ret = ibv_dereg_mr(block->pmr[reg->key.chunk]);
3772                 block->pmr[reg->key.chunk] = NULL;
3773
3774                 if (ret != 0) {
3775                     perror("rdma unregistration chunk failed");
3776                     goto err;
3777                 }
3778
3779                 rdma->total_registrations--;
3780
3781                 trace_qemu_rdma_registration_handle_unregister_success(
3782                                                        reg->key.chunk);
3783             }
3784
3785             ret = qemu_rdma_post_send_control(rdma, NULL, &unreg_resp);
3786
3787             if (ret < 0) {
3788                 error_report("Failed to send control buffer");
3789                 goto err;
3790             }
3791             break;
3792         case RDMA_CONTROL_REGISTER_RESULT:
3793             error_report("Invalid RESULT message at dest.");
3794             goto err;
3795         default:
3796             error_report("Unknown control message %s", control_desc(head.type));
3797             goto err;
3798         }
3799     } while (1);
3800
3801 err:
3802     rdma->errored = true;
3803     return -1;
3804 }
3805
3806 /* Destination:
3807  * Called via a ram_control_load_hook during the initial RAM load section which
3808  * lists the RAMBlocks by name.  This lets us know the order of the RAMBlocks
3809  * on the source.
3810  * We've already built our local RAMBlock list, but not yet sent the list to
3811  * the source.
3812  */
3813 static int
3814 rdma_block_notification_handle(QEMUFile *f, const char *name)
3815 {
3816     RDMAContext *rdma;
3817     QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(qemu_file_get_ioc(f));
3818     int curr;
3819     int found = -1;
3820
3821     RCU_READ_LOCK_GUARD();
3822     rdma = qatomic_rcu_read(&rioc->rdmain);
3823
3824     if (!rdma) {
3825         return -1;
3826     }
3827
3828     /* Find the matching RAMBlock in our local list */
3829     for (curr = 0; curr < rdma->local_ram_blocks.nb_blocks; curr++) {
3830         if (!strcmp(rdma->local_ram_blocks.block[curr].block_name, name)) {
3831             found = curr;
3832             break;
3833         }
3834     }
3835
3836     if (found == -1) {
3837         error_report("RAMBlock '%s' not found on destination", name);
3838         return -1;
3839     }
3840
3841     rdma->local_ram_blocks.block[curr].src_index = rdma->next_src_index;
3842     trace_rdma_block_notification_handle(name, rdma->next_src_index);
3843     rdma->next_src_index++;
3844
3845     return 0;
3846 }
3847
3848 static int rdma_load_hook(QEMUFile *f, uint64_t flags, void *data)
3849 {
3850     switch (flags) {
3851     case RAM_CONTROL_BLOCK_REG:
3852         return rdma_block_notification_handle(f, data);
3853
3854     case RAM_CONTROL_HOOK:
3855         return qemu_rdma_registration_handle(f);
3856
3857     default:
3858         /* Shouldn't be called with any other values */
3859         abort();
3860     }
3861 }
3862
3863 static int qemu_rdma_registration_start(QEMUFile *f,
3864                                         uint64_t flags, void *data)
3865 {
3866     QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(qemu_file_get_ioc(f));
3867     RDMAContext *rdma;
3868
3869     if (migration_in_postcopy()) {
3870         return 0;
3871     }
3872
3873     RCU_READ_LOCK_GUARD();
3874     rdma = qatomic_rcu_read(&rioc->rdmaout);
3875     if (!rdma) {
3876         return -1;
3877     }
3878
3879     if (rdma_errored(rdma)) {
3880         return -1;
3881     }
3882
3883     trace_qemu_rdma_registration_start(flags);
3884     qemu_put_be64(f, RAM_SAVE_FLAG_HOOK);
3885     qemu_fflush(f);
3886
3887     return 0;
3888 }
3889
3890 /*
3891  * Inform dest that dynamic registrations are done for now.
3892  * First, flush writes, if any.
3893  */
3894 static int qemu_rdma_registration_stop(QEMUFile *f,
3895                                        uint64_t flags, void *data)
3896 {
3897     QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(qemu_file_get_ioc(f));
3898     RDMAContext *rdma;
3899     RDMAControlHeader head = { .len = 0, .repeat = 1 };
3900     int ret;
3901
3902     if (migration_in_postcopy()) {
3903         return 0;
3904     }
3905
3906     RCU_READ_LOCK_GUARD();
3907     rdma = qatomic_rcu_read(&rioc->rdmaout);
3908     if (!rdma) {
3909         return -1;
3910     }
3911
3912     if (rdma_errored(rdma)) {
3913         return -1;
3914     }
3915
3916     qemu_fflush(f);
3917     ret = qemu_rdma_drain_cq(rdma);
3918
3919     if (ret < 0) {
3920         goto err;
3921     }
3922
3923     if (flags == RAM_CONTROL_SETUP) {
3924         RDMAControlHeader resp = {.type = RDMA_CONTROL_RAM_BLOCKS_RESULT };
3925         RDMALocalBlocks *local = &rdma->local_ram_blocks;
3926         int reg_result_idx, i, nb_dest_blocks;
3927
3928         head.type = RDMA_CONTROL_RAM_BLOCKS_REQUEST;
3929         trace_qemu_rdma_registration_stop_ram();
3930
3931         /*
3932          * Make sure that we parallelize the pinning on both sides.
3933          * For very large guests, doing this serially takes a really
3934          * long time, so we have to 'interleave' the pinning locally
3935          * with the control messages by performing the pinning on this
3936          * side before we receive the control response from the other
3937          * side that the pinning has completed.
3938          */
3939         ret = qemu_rdma_exchange_send(rdma, &head, NULL, &resp,
3940                     &reg_result_idx, rdma->pin_all ?
3941                     qemu_rdma_reg_whole_ram_blocks : NULL);
3942         if (ret < 0) {
3943             fprintf(stderr, "receiving remote info!");
3944             return -1;
3945         }
3946
3947         nb_dest_blocks = resp.len / sizeof(RDMADestBlock);
3948
3949         /*
3950          * The protocol uses two different sets of rkeys (mutually exclusive):
3951          * 1. One key to represent the virtual address of the entire ram block.
3952          *    (dynamic chunk registration disabled - pin everything with one rkey.)
3953          * 2. One to represent individual chunks within a ram block.
3954          *    (dynamic chunk registration enabled - pin individual chunks.)
3955          *
3956          * Once the capability is successfully negotiated, the destination transmits
3957          * the keys to use (or sends them later) including the virtual addresses
3958          * and then propagates the remote ram block descriptions to his local copy.
3959          */
3960
3961         if (local->nb_blocks != nb_dest_blocks) {
3962             fprintf(stderr, "ram blocks mismatch (Number of blocks %d vs %d) "
3963                     "Your QEMU command line parameters are probably "
3964                     "not identical on both the source and destination.",
3965                     local->nb_blocks, nb_dest_blocks);
3966             rdma->errored = true;
3967             return -1;
3968         }
3969
3970         qemu_rdma_move_header(rdma, reg_result_idx, &resp);
3971         memcpy(rdma->dest_blocks,
3972             rdma->wr_data[reg_result_idx].control_curr, resp.len);
3973         for (i = 0; i < nb_dest_blocks; i++) {
3974             network_to_dest_block(&rdma->dest_blocks[i]);
3975
3976             /* We require that the blocks are in the same order */
3977             if (rdma->dest_blocks[i].length != local->block[i].length) {
3978                 fprintf(stderr, "Block %s/%d has a different length %" PRIu64
3979                         "vs %" PRIu64, local->block[i].block_name, i,
3980                         local->block[i].length,
3981                         rdma->dest_blocks[i].length);
3982                 rdma->errored = true;
3983                 return -1;
3984             }
3985             local->block[i].remote_host_addr =
3986                     rdma->dest_blocks[i].remote_host_addr;
3987             local->block[i].remote_rkey = rdma->dest_blocks[i].remote_rkey;
3988         }
3989     }
3990
3991     trace_qemu_rdma_registration_stop(flags);
3992
3993     head.type = RDMA_CONTROL_REGISTER_FINISHED;
3994     ret = qemu_rdma_exchange_send(rdma, &head, NULL, NULL, NULL, NULL);
3995
3996     if (ret < 0) {
3997         goto err;
3998     }
3999
4000     return 0;
4001 err:
4002     rdma->errored = true;
4003     return -1;
4004 }
4005
4006 static const QEMUFileHooks rdma_read_hooks = {
4007     .hook_ram_load = rdma_load_hook,
4008 };
4009
4010 static const QEMUFileHooks rdma_write_hooks = {
4011     .before_ram_iterate = qemu_rdma_registration_start,
4012     .after_ram_iterate  = qemu_rdma_registration_stop,
4013     .save_page          = qemu_rdma_save_page,
4014 };
4015
4016
4017 static void qio_channel_rdma_finalize(Object *obj)
4018 {
4019     QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(obj);
4020     if (rioc->rdmain) {
4021         qemu_rdma_cleanup(rioc->rdmain);
4022         g_free(rioc->rdmain);
4023         rioc->rdmain = NULL;
4024     }
4025     if (rioc->rdmaout) {
4026         qemu_rdma_cleanup(rioc->rdmaout);
4027         g_free(rioc->rdmaout);
4028         rioc->rdmaout = NULL;
4029     }
4030 }
4031
4032 static void qio_channel_rdma_class_init(ObjectClass *klass,
4033                                         void *class_data G_GNUC_UNUSED)
4034 {
4035     QIOChannelClass *ioc_klass = QIO_CHANNEL_CLASS(klass);
4036
4037     ioc_klass->io_writev = qio_channel_rdma_writev;
4038     ioc_klass->io_readv = qio_channel_rdma_readv;
4039     ioc_klass->io_set_blocking = qio_channel_rdma_set_blocking;
4040     ioc_klass->io_close = qio_channel_rdma_close;
4041     ioc_klass->io_create_watch = qio_channel_rdma_create_watch;
4042     ioc_klass->io_set_aio_fd_handler = qio_channel_rdma_set_aio_fd_handler;
4043     ioc_klass->io_shutdown = qio_channel_rdma_shutdown;
4044 }
4045
4046 static const TypeInfo qio_channel_rdma_info = {
4047     .parent = TYPE_QIO_CHANNEL,
4048     .name = TYPE_QIO_CHANNEL_RDMA,
4049     .instance_size = sizeof(QIOChannelRDMA),
4050     .instance_finalize = qio_channel_rdma_finalize,
4051     .class_init = qio_channel_rdma_class_init,
4052 };
4053
4054 static void qio_channel_rdma_register_types(void)
4055 {
4056     type_register_static(&qio_channel_rdma_info);
4057 }
4058
4059 type_init(qio_channel_rdma_register_types);
4060
4061 static QEMUFile *rdma_new_input(RDMAContext *rdma)
4062 {
4063     QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(object_new(TYPE_QIO_CHANNEL_RDMA));
4064
4065     rioc->file = qemu_file_new_input(QIO_CHANNEL(rioc));
4066     rioc->rdmain = rdma;
4067     rioc->rdmaout = rdma->return_path;
4068     qemu_file_set_hooks(rioc->file, &rdma_read_hooks);
4069
4070     return rioc->file;
4071 }
4072
4073 static QEMUFile *rdma_new_output(RDMAContext *rdma)
4074 {
4075     QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(object_new(TYPE_QIO_CHANNEL_RDMA));
4076
4077     rioc->file = qemu_file_new_output(QIO_CHANNEL(rioc));
4078     rioc->rdmaout = rdma;
4079     rioc->rdmain = rdma->return_path;
4080     qemu_file_set_hooks(rioc->file, &rdma_write_hooks);
4081
4082     return rioc->file;
4083 }
4084
4085 static void rdma_accept_incoming_migration(void *opaque)
4086 {
4087     RDMAContext *rdma = opaque;
4088     int ret;
4089     QEMUFile *f;
4090     Error *local_err = NULL;
4091
4092     trace_qemu_rdma_accept_incoming_migration();
4093     ret = qemu_rdma_accept(rdma);
4094
4095     if (ret < 0) {
4096         fprintf(stderr, "RDMA ERROR: Migration initialization failed\n");
4097         return;
4098     }
4099
4100     trace_qemu_rdma_accept_incoming_migration_accepted();
4101
4102     if (rdma->is_return_path) {
4103         return;
4104     }
4105
4106     f = rdma_new_input(rdma);
4107     if (f == NULL) {
4108         fprintf(stderr, "RDMA ERROR: could not open RDMA for input\n");
4109         qemu_rdma_cleanup(rdma);
4110         return;
4111     }
4112
4113     rdma->migration_started_on_destination = 1;
4114     migration_fd_process_incoming(f, &local_err);
4115     if (local_err) {
4116         error_reportf_err(local_err, "RDMA ERROR:");
4117     }
4118 }
4119
4120 void rdma_start_incoming_migration(const char *host_port, Error **errp)
4121 {
4122     int ret;
4123     RDMAContext *rdma;
4124
4125     trace_rdma_start_incoming_migration();
4126
4127     /* Avoid ram_block_discard_disable(), cannot change during migration. */
4128     if (ram_block_discard_is_required()) {
4129         error_setg(errp, "RDMA: cannot disable RAM discard");
4130         return;
4131     }
4132
4133     rdma = qemu_rdma_data_init(host_port, errp);
4134     if (rdma == NULL) {
4135         goto err;
4136     }
4137
4138     ret = qemu_rdma_dest_init(rdma, errp);
4139     if (ret < 0) {
4140         goto err;
4141     }
4142
4143     trace_rdma_start_incoming_migration_after_dest_init();
4144
4145     ret = rdma_listen(rdma->listen_id, 5);
4146
4147     if (ret < 0) {
4148         ERROR(errp, "listening on socket!");
4149         goto cleanup_rdma;
4150     }
4151
4152     trace_rdma_start_incoming_migration_after_rdma_listen();
4153
4154     qemu_set_fd_handler(rdma->channel->fd, rdma_accept_incoming_migration,
4155                         NULL, (void *)(intptr_t)rdma);
4156     return;
4157
4158 cleanup_rdma:
4159     qemu_rdma_cleanup(rdma);
4160 err:
4161     if (rdma) {
4162         g_free(rdma->host);
4163         g_free(rdma->host_port);
4164     }
4165     g_free(rdma);
4166 }
4167
4168 void rdma_start_outgoing_migration(void *opaque,
4169                             const char *host_port, Error **errp)
4170 {
4171     MigrationState *s = opaque;
4172     RDMAContext *rdma_return_path = NULL;
4173     RDMAContext *rdma;
4174     int ret;
4175
4176     /* Avoid ram_block_discard_disable(), cannot change during migration. */
4177     if (ram_block_discard_is_required()) {
4178         error_setg(errp, "RDMA: cannot disable RAM discard");
4179         return;
4180     }
4181
4182     rdma = qemu_rdma_data_init(host_port, errp);
4183     if (rdma == NULL) {
4184         goto err;
4185     }
4186
4187     ret = qemu_rdma_source_init(rdma, migrate_rdma_pin_all(), errp);
4188
4189     if (ret < 0) {
4190         goto err;
4191     }
4192
4193     trace_rdma_start_outgoing_migration_after_rdma_source_init();
4194     ret = qemu_rdma_connect(rdma, false, errp);
4195
4196     if (ret < 0) {
4197         goto err;
4198     }
4199
4200     /* RDMA postcopy need a separate queue pair for return path */
4201     if (migrate_postcopy() || migrate_return_path()) {
4202         rdma_return_path = qemu_rdma_data_init(host_port, errp);
4203
4204         if (rdma_return_path == NULL) {
4205             goto return_path_err;
4206         }
4207
4208         ret = qemu_rdma_source_init(rdma_return_path,
4209                                     migrate_rdma_pin_all(), errp);
4210
4211         if (ret < 0) {
4212             goto return_path_err;
4213         }
4214
4215         ret = qemu_rdma_connect(rdma_return_path, true, errp);
4216
4217         if (ret < 0) {
4218             goto return_path_err;
4219         }
4220
4221         rdma->return_path = rdma_return_path;
4222         rdma_return_path->return_path = rdma;
4223         rdma_return_path->is_return_path = true;
4224     }
4225
4226     trace_rdma_start_outgoing_migration_after_rdma_connect();
4227
4228     s->to_dst_file = rdma_new_output(rdma);
4229     migrate_fd_connect(s, NULL);
4230     return;
4231 return_path_err:
4232     qemu_rdma_cleanup(rdma);
4233 err:
4234     g_free(rdma);
4235     g_free(rdma_return_path);
4236 }