migration-rdma.c

   1 /*
   2  * RDMA protocol and interfaces
   3  *
   4  * Copyright IBM, Corp. 2010-2013
   5  *
   6  * Authors:
   7  *  Michael R. Hines <mrhines@us.ibm.com>
   8  *  Jiuxing Liu <jl@us.ibm.com>
   9  *
  10  * This work is licensed under the terms of the GNU GPL, version 2 or
  11  * later.  See the COPYING file in the top-level directory.
  12  *
  13  */
  14 #include "qemu-common.h"
  15 #include "migration/migration.h"
  16 #include "migration/qemu-file.h"
  17 #include "exec/cpu-common.h"
  18 #include "qemu/main-loop.h"
  19 #include "qemu/sockets.h"
  20 #include "qemu/bitmap.h"
  21 #include "block/coroutine.h"
  22 #include <stdio.h>
  23 #include <sys/types.h>
  24 #include <sys/socket.h>
  25 #include <netdb.h>
  26 #include <arpa/inet.h>
  27 #include <string.h>
  28 #include <rdma/rdma_cma.h>
  29
  30 #define DEBUG_RDMA
  31 //#define DEBUG_RDMA_VERBOSE
  32 //#define DEBUG_RDMA_REALLY_VERBOSE
  33
/*
 * Three levels of debug tracing.  Each macro prints to stdout with an
 * "rdma: " prefix when its controlling symbol is defined and expands to
 * an empty statement otherwise.
 */

/* Basic tracing: active when DEBUG_RDMA is defined. */
#ifdef DEBUG_RDMA
#define DPRINTF(fmt, ...) \
    do { printf("rdma: " fmt, ## __VA_ARGS__); } while (0)
#else
#define DPRINTF(fmt, ...) \
    do { } while (0)
#endif

/* Verbose tracing: active when DEBUG_RDMA_VERBOSE is defined. */
#ifdef DEBUG_RDMA_VERBOSE
#define DDPRINTF(fmt, ...) \
    do { printf("rdma: " fmt, ## __VA_ARGS__); } while (0)
#else
#define DDPRINTF(fmt, ...) \
    do { } while (0)
#endif

/* Most verbose tracing: active when DEBUG_RDMA_REALLY_VERBOSE is defined. */
#ifdef DEBUG_RDMA_REALLY_VERBOSE
#define DDDPRINTF(fmt, ...) \
    do { printf("rdma: " fmt, ## __VA_ARGS__); } while (0)
#else
#define DDDPRINTF(fmt, ...) \
    do { } while (0)
#endif
  57
  58 /*
  59  * Print and error on both the Monitor and the Log file.
  60  */
  61 #define ERROR(errp, fmt, ...) \
  62     do { \
  63         fprintf(stderr, "RDMA ERROR: " fmt, ## __VA_ARGS__); \
  64         if (errp && (*(errp) == NULL)) { \
  65             error_setg(errp, "RDMA ERROR: " fmt, ## __VA_ARGS__); \
  66         } \
  67     } while (0)
  68
/* Timeout in milliseconds handed to rdma_resolve_addr()/rdma_resolve_route(). */
#define RDMA_RESOLVE_TIMEOUT_MS 10000

/* Do not merge data if larger than this. */
#define RDMA_MERGE_MAX (2 * 1024 * 1024)
/* RDMA_MERGE_MAX expressed in 4096-byte units, i.e. 512. */
#define RDMA_SIGNALED_SEND_MAX (RDMA_MERGE_MAX / 4096)

/* log2 of the chunk size used by the ram_chunk_*() helpers. */
#define RDMA_REG_CHUNK_SHIFT 20 /* 1 MB */
  76
  77 /*
  78  * This is only for non-live state being migrated.
  79  * Instead of RDMA_WRITE messages, we use RDMA_SEND
  80  * messages for that state, which requires a different
  81  * delivery design than main memory.
  82  */
  83 #define RDMA_SEND_INCREMENT 32768
  84
  85 /*
  86  * Maximum size infiniband SEND message
  87  */
  88 #define RDMA_CONTROL_MAX_BUFFER (512 * 1024)
  89 #define RDMA_CONTROL_MAX_COMMANDS_PER_MESSAGE 4096
  90
  91 #define RDMA_CONTROL_VERSION_CURRENT 1
  92 /*
  93  * Capabilities for negotiation.
  94  */
  95 #define RDMA_CAPABILITY_PIN_ALL 0x01
  96
  97 /*
  98  * Add the other flags above to this list of known capabilities
  99  * as they are introduced.
 100  */
 101 static uint32_t known_capabilities = RDMA_CAPABILITY_PIN_ALL;
 102
/*
 * Leave the calling function early if the connection has already failed.
 *
 * Requires a variable named 'rdma' with 'error_state' and 'error_reported'
 * members to be in scope where the macro is expanded.  When
 * rdma->error_state is non-zero the warning is printed once (guarded by
 * rdma->error_reported) and the enclosing function returns
 * rdma->error_state.
 *
 * NOTE(review): the definition ends in "while (0);", so a call written as
 * "CHECK_ERROR_STATE();" expands to an additional empty statement and the
 * macro cannot be used as the sole body of an unbraced if/else.  Confirm
 * the call sites before removing the trailing semicolon.
 */
#define CHECK_ERROR_STATE() \
    do { \
        if (rdma->error_state) { \
            if (!rdma->error_reported) { \
                fprintf(stderr, "RDMA is in an error state waiting migration" \
                                " to abort!\n"); \
                rdma->error_reported = 1; \
            } \
            return rdma->error_state; \
        } \
    } while (0);
 114
 115 /*
 116  * A work request ID is 64-bits and we split up these bits
 117  * into 3 parts:
 118  *
 119  * bits 0-15 : type of control message, 2^16
 120  * bits 16-29: ram block index, 2^14
 121  * bits 30-63: ram block chunk number, 2^34
 122  *
 123  * The last two bit ranges are only used for RDMA writes,
 124  * in order to track their completion and potentially
 125  * also track unregistration status of the message.
 126  */
/* Bit position at which each of the three work-request-ID fields starts. */
#define RDMA_WRID_TYPE_SHIFT  0UL
#define RDMA_WRID_BLOCK_SHIFT 16UL
#define RDMA_WRID_CHUNK_SHIFT 30UL

/* Bits below RDMA_WRID_BLOCK_SHIFT: the message type. */
#define RDMA_WRID_TYPE_MASK \
    ((1UL << RDMA_WRID_BLOCK_SHIFT) - 1UL)

/* Bits from RDMA_WRID_BLOCK_SHIFT up to RDMA_WRID_CHUNK_SHIFT: block index. */
#define RDMA_WRID_BLOCK_MASK \
    (~RDMA_WRID_TYPE_MASK & ((1UL << RDMA_WRID_CHUNK_SHIFT) - 1UL))

/*
 * Everything not covered by the two masks above: the chunk number.
 * NOTE(review): all three masks have type 'unsigned long'; where that type
 * is 32 bits wide this mask cannot cover bits 32-63 of a 64-bit work
 * request ID -- confirm whether such hosts need to be supported.
 */
#define RDMA_WRID_CHUNK_MASK (~RDMA_WRID_BLOCK_MASK & ~RDMA_WRID_TYPE_MASK)
 138
 139 /*
 140  * RDMA migration protocol:
 141  * 1. RDMA Writes (data messages, i.e. RAM)
 142  * 2. IB Send/Recv (control channel messages)
 143  */
/* Work request types; the values are also the indices used by wrid_desc[]. */
enum {
    RDMA_WRID_NONE = 0,
    RDMA_WRID_RDMA_WRITE = 1,       /* RDMA write (data message) */
    RDMA_WRID_SEND_CONTROL = 2000,  /* control channel send */
    RDMA_WRID_RECV_CONTROL = 4000,  /* control channel receive */
};
 150
/*
 * Printable names for the RDMA_WRID_* work request types, indexed by the
 * enum value.  The largest designator is RDMA_WRID_RECV_CONTROL (4000), so
 * the array has 4001 elements and every element not listed here is NULL.
 */
const char *wrid_desc[] = {
    [RDMA_WRID_NONE] = "NONE",
    [RDMA_WRID_RDMA_WRITE] = "WRITE RDMA",
    [RDMA_WRID_SEND_CONTROL] = "CONTROL SEND",
    [RDMA_WRID_RECV_CONTROL] = "CONTROL RECV",
};
 157
 158 /*
 159  * Work request IDs for IB SEND messages only (not RDMA writes).
 160  * This is used by the migration protocol to transmit
 161  * control messages (such as device state and registration commands)
 162  *
 163  * We could use more WRs, but we have enough for now.
 164  */
enum {
    RDMA_WRID_READY = 0,
    RDMA_WRID_DATA,
    RDMA_WRID_CONTROL,
    /* Count of the IDs above; RDMAContext.wr_data[] has RDMA_WRID_MAX + 1 slots. */
    RDMA_WRID_MAX,
};
 171
 172 /*
 173  * SEND/RECV IB Control Messages.
 174  */
/* Message types; each value is also the index of its name in control_desc[]. */
enum {
    RDMA_CONTROL_NONE = 0,
    RDMA_CONTROL_ERROR,
    RDMA_CONTROL_READY,               /* ready to receive */
    RDMA_CONTROL_QEMU_FILE,           /* QEMUFile-transmitted bytes */
    RDMA_CONTROL_RAM_BLOCKS_REQUEST,  /* RAMBlock synchronization */
    RDMA_CONTROL_RAM_BLOCKS_RESULT,   /* RAMBlock synchronization */
    RDMA_CONTROL_COMPRESS,            /* page contains repeat values */
    RDMA_CONTROL_REGISTER_REQUEST,    /* dynamic page registration */
    RDMA_CONTROL_REGISTER_RESULT,     /* key to use after registration */
    RDMA_CONTROL_REGISTER_FINISHED,   /* current iteration finished */
    RDMA_CONTROL_UNREGISTER_REQUEST,  /* dynamic UN-registration */
    RDMA_CONTROL_UNREGISTER_FINISHED, /* unpinning finished */
};
 189
/*
 * Printable names for the RDMA_CONTROL_* message types, indexed by the
 * enum value.  Every enumerator declared above has an entry.
 */
const char *control_desc[] = {
    [RDMA_CONTROL_NONE] = "NONE",
    [RDMA_CONTROL_ERROR] = "ERROR",
    [RDMA_CONTROL_READY] = "READY",
    [RDMA_CONTROL_QEMU_FILE] = "QEMU FILE",
    [RDMA_CONTROL_RAM_BLOCKS_REQUEST] = "RAM BLOCKS REQUEST",
    [RDMA_CONTROL_RAM_BLOCKS_RESULT] = "RAM BLOCKS RESULT",
    [RDMA_CONTROL_COMPRESS] = "COMPRESS",
    [RDMA_CONTROL_REGISTER_REQUEST] = "REGISTER REQUEST",
    [RDMA_CONTROL_REGISTER_RESULT] = "REGISTER RESULT",
    [RDMA_CONTROL_REGISTER_FINISHED] = "REGISTER FINISHED",
    [RDMA_CONTROL_UNREGISTER_REQUEST] = "UNREGISTER REQUEST",
    [RDMA_CONTROL_UNREGISTER_FINISHED] = "UNREGISTER FINISHED",
};
 204
/*
 * Memory and MR structures used to represent an IB Send/Recv work request.
 * This is *not* used for RDMA writes, only IB Send/Recv.
 *
 * The buffer is embedded in the structure, so each instance occupies at
 * least RDMA_CONTROL_MAX_BUFFER (512 KiB) bytes.
 */
typedef struct {
    uint8_t  control[RDMA_CONTROL_MAX_BUFFER]; /* actual buffer to register */
    struct   ibv_mr *control_mr;               /* registration metadata */
    size_t   control_len;                      /* length of the message */
    uint8_t *control_curr;                     /* start of unconsumed bytes */
} RDMAWorkRequestData;
 215
/*
 * Negotiate RDMA capabilities during connection-setup time.
 *
 * Both members are byte-swapped together by caps_to_network() and
 * network_to_caps().
 */
typedef struct {
    /* presumably compared with RDMA_CONTROL_VERSION_CURRENT -- confirm */
    uint32_t version;
    /* presumably a mask of RDMA_CAPABILITY_* bits -- confirm */
    uint32_t flags;
} RDMACapabilities;
 223
 224 static void caps_to_network(RDMACapabilities *cap)
 225 {
 226     cap->version = htonl(cap->version);
 227     cap->flags = htonl(cap->flags);
 228 }
 229
 230 static void network_to_caps(RDMACapabilities *cap)
 231 {
 232     cap->version = ntohl(cap->version);
 233     cap->flags = ntohl(cap->flags);
 234 }
 235
 236 /*
 237  * Representation of a RAMBlock from an RDMA perspective.
 238  * This is not transmitted, only local.
 239  * This and subsequent structures cannot be linked lists
 240  * because we're using a single IB message to transmit
 241  * the information. It's small anyway, so a list is overkill.
 242  */
typedef struct RDMALocalBlock {
    uint8_t  *local_host_addr; /* local virtual address */
    uint64_t remote_host_addr; /* remote virtual address */
    uint64_t offset;           /* block offset; also the key in RDMAContext.blockmap */
    uint64_t length;           /* size of the block in bytes */
    struct   ibv_mr **pmr;     /* MRs for chunk-level registration */
    struct   ibv_mr *mr;       /* MR for non-chunk-level registration */
    uint32_t *remote_keys;     /* rkeys for chunk-level registration */
    uint32_t remote_rkey;      /* rkeys for non-chunk-level registration */
    int      index;            /* which block are we */
    bool     is_ram_block;     /* true when added before RDMALocalBlocks.init was set */
    int      nb_chunks;        /* chunk count; sizes both bitmaps and remote_keys[] */
    unsigned long *transit_bitmap;     /* one bit per chunk */
    unsigned long *unregister_bitmap;  /* one bit per chunk */
} RDMALocalBlock;
 258
 259 /*
 260  * Also represents a RAMblock, but only on the dest.
 261  * This gets transmitted by the dest during connection-time
 262  * to the source VM and then is used to populate the
 263  * corresponding RDMALocalBlock with
 264  * the information needed to perform the actual RDMA.
 265  */
/*
 * Byte order of every member except 'padding' is converted by
 * remote_block_to_network() / network_to_remote_block().
 */
typedef struct QEMU_PACKED RDMARemoteBlock {
    uint64_t remote_host_addr;
    uint64_t offset;
    uint64_t length;
    uint32_t remote_rkey;
    uint32_t padding;   /* never read or written by the conversion helpers */
} RDMARemoteBlock;
 273
 274 static uint64_t htonll(uint64_t v)
 275 {
 276     union { uint32_t lv[2]; uint64_t llv; } u;
 277     u.lv[0] = htonl(v >> 32);
 278     u.lv[1] = htonl(v & 0xFFFFFFFFULL);
 279     return u.llv;
 280 }
 281
 282 static uint64_t ntohll(uint64_t v) {
 283     union { uint32_t lv[2]; uint64_t llv; } u;
 284     u.llv = v;
 285     return ((uint64_t)ntohl(u.lv[0]) << 32) | (uint64_t) ntohl(u.lv[1]);
 286 }
 287
 288 static void remote_block_to_network(RDMARemoteBlock *rb)
 289 {
 290     rb->remote_host_addr = htonll(rb->remote_host_addr);
 291     rb->offset = htonll(rb->offset);
 292     rb->length = htonll(rb->length);
 293     rb->remote_rkey = htonl(rb->remote_rkey);
 294 }
 295
 296 static void network_to_remote_block(RDMARemoteBlock *rb)
 297 {
 298     rb->remote_host_addr = ntohll(rb->remote_host_addr);
 299     rb->offset = ntohll(rb->offset);
 300     rb->length = ntohll(rb->length);
 301     rb->remote_rkey = ntohl(rb->remote_rkey);
 302 }
 303
 304 /*
 305  * Virtual address of the above structures used for transmitting
 306  * the RAMBlock descriptions at connection-time.
 307  * This structure is *not* transmitted.
 308  */
typedef struct RDMALocalBlocks {
    int nb_blocks;             /* number of entries in block[] */
    bool     init;             /* main memory init complete */
    /* heap array; replaced by a new allocation on every add or delete */
    RDMALocalBlock *block;
} RDMALocalBlocks;
 314
 315 /*
 316  * Main data structure for RDMA state.
 317  * While there is only one copy of this structure being allocated right now,
 318  * this is the place where one would start if you wanted to consider
 319  * having more than one RDMA connection open at the same time.
 320  */
typedef struct RDMAContext {
    /* host name handed to getaddrinfo() by qemu_rdma_resolve_host() */
    char *host;
    /* port number, formatted as the getaddrinfo() service string */
    int port;

    /* one control buffer per RDMA_WRID_* slot, plus one extra entry */
    RDMAWorkRequestData wr_data[RDMA_WRID_MAX + 1];

    /*
     * This is used by *_exchange_send() to figure out whether or not
     * the initial "READY" message has already been received or not.
     * This is because other functions may potentially poll() and detect
     * the READY message before send() does, in which case we need to
     * know if it completed.
     */
    int control_ready_expected;

    /* number of outstanding writes */
    int nb_sent;

    /* store info about current buffer so that we can
       merge it with future sends */
    uint64_t current_addr;
    uint64_t current_length;
    /* index of ram block the current buffer belongs to */
    int current_index;
    /* index of the chunk in the current ram block */
    int current_chunk;

    /* presumably mirrors RDMA_CAPABILITY_PIN_ALL -- confirm */
    bool pin_all;

    /*
     * infiniband-specific variables for opening the device
     * and maintaining connection state and so forth.
     *
     * cm_id also has ibv_context, rdma_event_channel, and ibv_qp in
     * cm_id->verbs, cm_id->channel, and cm_id->qp.
     */
    struct rdma_cm_id *cm_id;               /* connection manager ID */
    /* presumably the listening CM id on the destination -- confirm */
    struct rdma_cm_id *listen_id;

    /* copied from cm_id->verbs once the route has been resolved */
    struct ibv_context          *verbs;
    /* CM event channel obtained from rdma_create_event_channel() */
    struct rdma_event_channel   *channel;
    struct ibv_qp *qp;                      /* queue pair */
    struct ibv_comp_channel *comp_channel;  /* completion channel */
    struct ibv_pd *pd;                      /* protection domain */
    struct ibv_cq *cq;                      /* completion queue */

    /*
     * If a previous write failed (perhaps because of a failed
     * memory registration, then do not attempt any future work
     * and remember the error state.
     */
    int error_state;
    /* set by CHECK_ERROR_STATE() once the warning has been printed */
    int error_reported;

    /*
     * Description of ram blocks used throughout the code.
     */
    RDMALocalBlocks local_ram_blocks;
    /* array with one RDMARemoteBlock per local ram block */
    RDMARemoteBlock *block;

    /*
     * Migration on *destination* started.
     * Then use coroutine yield function.
     * Source runs in a thread, so we don't care.
     */
    int migration_started_on_destination;

    /* incremented per successful ibv_reg_mr(), decremented per ibv_dereg_mr() */
    int total_registrations;
    int total_writes;

    /* presumably positions within unregistrations[] -- confirm */
    int unregister_current, unregister_next;
    uint64_t unregistrations[RDMA_SIGNALED_SEND_MAX];

    /* maps a block offset (as a direct pointer key) to its RDMALocalBlock */
    GHashTable *blockmap;
    /* presumably set when an IPv6 connection was requested -- confirm */
    bool ipv6;
} RDMAContext;
 397
 398 /*
 399  * Interface to the rest of the migration call stack.
 400  */
typedef struct QEMUFileRDMA {
    RDMAContext *rdma;  /* connection state used by this file */
    size_t len;         /* NOTE(review): meaning not visible here -- confirm */
    void *file;         /* presumably the owning QEMUFile -- confirm */
} QEMUFileRDMA;
 406
 407 /*
 408  * Main structure for IB Send/Recv control messages.
 409  * This gets prepended at the beginning of every Send/Recv.
 410  */
/*
 * Four packed 32-bit words.  control_to_network() and network_to_control()
 * convert len, type and repeat; padding is not converted.
 */
typedef struct QEMU_PACKED {
    uint32_t len;     /* Total length of data portion */
    uint32_t type;    /* which control command to perform */
    uint32_t repeat;  /* number of commands in data portion of same type */
    uint32_t padding;
} RDMAControlHeader;
 417
 418 static void control_to_network(RDMAControlHeader *control)
 419 {
 420     control->type = htonl(control->type);
 421     control->len = htonl(control->len);
 422     control->repeat = htonl(control->repeat);
 423 }
 424
 425 static void network_to_control(RDMAControlHeader *control)
 426 {
 427     control->type = ntohl(control->type);
 428     control->len = ntohl(control->len);
 429     control->repeat = ntohl(control->repeat);
 430 }
 431
 432 /*
 433  * Register a single Chunk.
 434  * Information sent by the source VM to inform the dest
 435  * to register an single chunk of memory before we can perform
 436  * the actual RDMA operation.
 437  */
typedef struct QEMU_PACKED {
    /*
     * Both members share the same 8 bytes, so converting the byte order of
     * key.current_addr converts key.chunk as well.
     */
    union QEMU_PACKED {
        uint64_t current_addr;  /* offset into the ramblock of the chunk */
        uint64_t chunk;         /* chunk to lookup if unregistering */
    } key;
    uint32_t current_index; /* which ramblock the chunk belongs to */
    uint32_t padding;       /* not touched by the conversion helpers */
    uint64_t chunks;            /* how many sequential chunks to register */
} RDMARegister;
 447
 448 static void register_to_network(RDMARegister *reg)
 449 {
 450     reg->key.current_addr = htonll(reg->key.current_addr);
 451     reg->current_index = htonl(reg->current_index);
 452     reg->chunks = htonll(reg->chunks);
 453 }
 454
 455 static void network_to_register(RDMARegister *reg)
 456 {
 457     reg->key.current_addr = ntohll(reg->key.current_addr);
 458     reg->current_index = ntohl(reg->current_index);
 459     reg->chunks = ntohll(reg->chunks);
 460 }
 461
/*
 * Payload that accompanies a compress command.  All four members are
 * byte-swapped by compress_to_network() / network_to_compress().
 */
typedef struct QEMU_PACKED {
    uint32_t value;     /* if zero, we will madvise() */
    uint32_t block_idx; /* which ram block index */
    uint64_t offset;    /* where in the remote ramblock this chunk */
    uint64_t length;    /* length of the chunk */
} RDMACompress;
 468
 469 static void compress_to_network(RDMACompress *comp)
 470 {
 471     comp->value = htonl(comp->value);
 472     comp->block_idx = htonl(comp->block_idx);
 473     comp->offset = htonll(comp->offset);
 474     comp->length = htonll(comp->length);
 475 }
 476
 477 static void network_to_compress(RDMACompress *comp)
 478 {
 479     comp->value = ntohl(comp->value);
 480     comp->block_idx = ntohl(comp->block_idx);
 481     comp->offset = ntohll(comp->offset);
 482     comp->length = ntohll(comp->length);
 483 }
 484
 485 /*
 486  * The result of the dest's memory registration produces an "rkey"
 487  * which the source VM must reference in order to perform
 488  * the RDMA operation.
 489  */
/*
 * rkey and host_addr are byte-swapped by result_to_network() and
 * network_to_result(); padding is not converted.
 */
typedef struct QEMU_PACKED {
    uint32_t rkey;
    uint32_t padding;
    uint64_t host_addr;
} RDMARegisterResult;
 495
 496 static void result_to_network(RDMARegisterResult *result)
 497 {
 498     result->rkey = htonl(result->rkey);
 499     result->host_addr = htonll(result->host_addr);
 500 };
 501
 502 static void network_to_result(RDMARegisterResult *result)
 503 {
 504     result->rkey = ntohl(result->rkey);
 505     result->host_addr = ntohll(result->host_addr);
 506 };
 507
 508 const char *print_wrid(int wrid);
 509 static int qemu_rdma_exchange_send(RDMAContext *rdma, RDMAControlHeader *head,
 510                                    uint8_t *data, RDMAControlHeader *resp,
 511                                    int *resp_idx,
 512                                    int (*callback)(RDMAContext *rdma));
 513
 514 static inline uint64_t ram_chunk_index(uint8_t *start, uint8_t *host)
 515 {
 516     return ((uintptr_t) host - (uintptr_t) start) >> RDMA_REG_CHUNK_SHIFT;
 517 }
 518
 519 static inline uint8_t *ram_chunk_start(RDMALocalBlock *rdma_ram_block,
 520                                        uint64_t i)
 521 {
 522     return (uint8_t *) (((uintptr_t) rdma_ram_block->local_host_addr)
 523                                     + (i << RDMA_REG_CHUNK_SHIFT));
 524 }
 525
 526 static inline uint8_t *ram_chunk_end(RDMALocalBlock *rdma_ram_block, uint64_t i)
 527 {
 528     uint8_t *result = ram_chunk_start(rdma_ram_block, i) +
 529                                          (1UL << RDMA_REG_CHUNK_SHIFT);
 530
 531     if (result > (rdma_ram_block->local_host_addr + rdma_ram_block->length)) {
 532         result = rdma_ram_block->local_host_addr + rdma_ram_block->length;
 533     }
 534
 535     return result;
 536 }
 537
 538 static int __qemu_rdma_add_block(RDMAContext *rdma, void *host_addr,
 539                          ram_addr_t block_offset, uint64_t length)
 540 {
 541     RDMALocalBlocks *local = &rdma->local_ram_blocks;
 542     RDMALocalBlock *block = g_hash_table_lookup(rdma->blockmap,
 543         (void *) block_offset);
 544     RDMALocalBlock *old = local->block;
 545
 546     assert(block == NULL);
 547
 548     local->block = g_malloc0(sizeof(RDMALocalBlock) * (local->nb_blocks + 1));
 549
 550     if (local->nb_blocks) {
 551         int x;
 552
 553         for (x = 0; x < local->nb_blocks; x++) {
 554             g_hash_table_remove(rdma->blockmap, (void *)old[x].offset);
 555             g_hash_table_insert(rdma->blockmap, (void *)old[x].offset,
 556                                                 &local->block[x]);
 557         }
 558         memcpy(local->block, old, sizeof(RDMALocalBlock) * local->nb_blocks);
 559         g_free(old);
 560     }
 561
 562     block = &local->block[local->nb_blocks];
 563
 564     block->local_host_addr = host_addr;
 565     block->offset = block_offset;
 566     block->length = length;
 567     block->index = local->nb_blocks;
 568     block->nb_chunks = ram_chunk_index(host_addr, host_addr + length) + 1UL;
 569     block->transit_bitmap = bitmap_new(block->nb_chunks);
 570     bitmap_clear(block->transit_bitmap, 0, block->nb_chunks);
 571     block->unregister_bitmap = bitmap_new(block->nb_chunks);
 572     bitmap_clear(block->unregister_bitmap, 0, block->nb_chunks);
 573     block->remote_keys = g_malloc0(block->nb_chunks * sizeof(uint32_t));
 574
 575     block->is_ram_block = local->init ? false : true;
 576
 577     g_hash_table_insert(rdma->blockmap, (void *) block_offset, block);
 578
 579     DDPRINTF("Added Block: %d, addr: %" PRIu64 ", offset: %" PRIu64
 580            " length: %" PRIu64 " end: %" PRIu64 " bits %" PRIu64 " chunks %d\n",
 581             local->nb_blocks, (uint64_t) block->local_host_addr, block->offset,
 582             block->length, (uint64_t) (block->local_host_addr + block->length),
 583                 BITS_TO_LONGS(block->nb_chunks) *
 584                     sizeof(unsigned long) * 8, block->nb_chunks);
 585
 586     local->nb_blocks++;
 587
 588     return 0;
 589 }
 590
 591 /*
 592  * Memory regions need to be registered with the device and queue pairs setup
 593  * in advanced before the migration starts. This tells us where the RAM blocks
 594  * are so that we can register them individually.
 595  */
 596 static void qemu_rdma_init_one_block(void *host_addr,
 597     ram_addr_t block_offset, ram_addr_t length, void *opaque)
 598 {
 599     __qemu_rdma_add_block(opaque, host_addr, block_offset, length);
 600 }
 601
 602 /*
 603  * Identify the RAMBlocks and their quantity. They will be references to
 604  * identify chunk boundaries inside each RAMBlock and also be referenced
 605  * during dynamic page registration.
 606  */
 607 static int qemu_rdma_init_ram_blocks(RDMAContext *rdma)
 608 {
 609     RDMALocalBlocks *local = &rdma->local_ram_blocks;
 610
 611     assert(rdma->blockmap == NULL);
 612     rdma->blockmap = g_hash_table_new(g_direct_hash, g_direct_equal);
 613     memset(local, 0, sizeof *local);
 614     qemu_ram_foreach_block(qemu_rdma_init_one_block, rdma);
 615     DPRINTF("Allocated %d local ram block structures\n", local->nb_blocks);
 616     rdma->block = (RDMARemoteBlock *) g_malloc0(sizeof(RDMARemoteBlock) *
 617                         rdma->local_ram_blocks.nb_blocks);
 618     local->init = true;
 619     return 0;
 620 }
 621
 622 static int __qemu_rdma_delete_block(RDMAContext *rdma, ram_addr_t block_offset)
 623 {
 624     RDMALocalBlocks *local = &rdma->local_ram_blocks;
 625     RDMALocalBlock *block = g_hash_table_lookup(rdma->blockmap,
 626         (void *) block_offset);
 627     RDMALocalBlock *old = local->block;
 628     int x;
 629
 630     assert(block);
 631
 632     if (block->pmr) {
 633         int j;
 634
 635         for (j = 0; j < block->nb_chunks; j++) {
 636             if (!block->pmr[j]) {
 637                 continue;
 638             }
 639             ibv_dereg_mr(block->pmr[j]);
 640             rdma->total_registrations--;
 641         }
 642         g_free(block->pmr);
 643         block->pmr = NULL;
 644     }
 645
 646     if (block->mr) {
 647         ibv_dereg_mr(block->mr);
 648         rdma->total_registrations--;
 649         block->mr = NULL;
 650     }
 651
 652     g_free(block->transit_bitmap);
 653     block->transit_bitmap = NULL;
 654
 655     g_free(block->unregister_bitmap);
 656     block->unregister_bitmap = NULL;
 657
 658     g_free(block->remote_keys);
 659     block->remote_keys = NULL;
 660
 661     for (x = 0; x < local->nb_blocks; x++) {
 662         g_hash_table_remove(rdma->blockmap, (void *)old[x].offset);
 663     }
 664
 665     if (local->nb_blocks > 1) {
 666
 667         local->block = g_malloc0(sizeof(RDMALocalBlock) *
 668                                     (local->nb_blocks - 1));
 669
 670         if (block->index) {
 671             memcpy(local->block, old, sizeof(RDMALocalBlock) * block->index);
 672         }
 673
 674         if (block->index < (local->nb_blocks - 1)) {
 675             memcpy(local->block + block->index, old + (block->index + 1),
 676                 sizeof(RDMALocalBlock) *
 677                     (local->nb_blocks - (block->index + 1)));
 678         }
 679     } else {
 680         assert(block == local->block);
 681         local->block = NULL;
 682     }
 683
 684     DDPRINTF("Deleted Block: %d, addr: %" PRIu64 ", offset: %" PRIu64
 685            " length: %" PRIu64 " end: %" PRIu64 " bits %" PRIu64 " chunks %d\n",
 686             local->nb_blocks, (uint64_t) block->local_host_addr, block->offset,
 687             block->length, (uint64_t) (block->local_host_addr + block->length),
 688                 BITS_TO_LONGS(block->nb_chunks) *
 689                     sizeof(unsigned long) * 8, block->nb_chunks);
 690
 691     g_free(old);
 692
 693     local->nb_blocks--;
 694
 695     if (local->nb_blocks) {
 696         for (x = 0; x < local->nb_blocks; x++) {
 697             g_hash_table_insert(rdma->blockmap, (void *)local->block[x].offset,
 698                                                 &local->block[x]);
 699         }
 700     }
 701
 702     return 0;
 703 }
 704
 705 /*
 706  * Put in the log file which RDMA device was opened and the details
 707  * associated with that device.
 708  */
 709 static void qemu_rdma_dump_id(const char *who, struct ibv_context *verbs)
 710 {
 711     printf("%s RDMA Device opened: kernel name %s "
 712            "uverbs device name %s, "
 713            "infiniband_verbs class device path %s,"
 714            " infiniband class device path %s\n",
 715                 who,
 716                 verbs->device->name,
 717                 verbs->device->dev_name,
 718                 verbs->device->dev_path,
 719                 verbs->device->ibdev_path);
 720 }
 721
 722 /*
 723  * Put in the log file the RDMA gid addressing information,
 724  * useful for folks who have trouble understanding the
 725  * RDMA device hierarchy in the kernel.
 726  */
 727 static void qemu_rdma_dump_gid(const char *who, struct rdma_cm_id *id)
 728 {
 729     char sgid[33];
 730     char dgid[33];
 731     inet_ntop(AF_INET6, &id->route.addr.addr.ibaddr.sgid, sgid, sizeof sgid);
 732     inet_ntop(AF_INET6, &id->route.addr.addr.ibaddr.dgid, dgid, sizeof dgid);
 733     DPRINTF("%s Source GID: %s, Dest GID: %s\n", who, sgid, dgid);
 734 }
 735
 736 /*
 737  * Figure out which RDMA device corresponds to the requested IP hostname
 738  * Also create the initial connection manager identifiers for opening
 739  * the connection.
 740  */
 741 static int qemu_rdma_resolve_host(RDMAContext *rdma, Error **errp)
 742 {
 743     int ret;
 744     struct addrinfo *res;
 745     char port_str[16];
 746     struct rdma_cm_event *cm_event;
 747     char ip[40] = "unknown";
 748     int af = rdma->ipv6 ? PF_INET6 : PF_INET;
 749
 750     if (rdma->host == NULL || !strcmp(rdma->host, "")) {
 751         ERROR(errp, "RDMA hostname has not been set\n");
 752         return -1;
 753     }
 754
 755     /* create CM channel */
 756     rdma->channel = rdma_create_event_channel();
 757     if (!rdma->channel) {
 758         ERROR(errp, "could not create CM channel\n");
 759         return -1;
 760     }
 761
 762     /* create CM id */
 763     ret = rdma_create_id(rdma->channel, &rdma->cm_id, NULL, RDMA_PS_TCP);
 764     if (ret) {
 765         ERROR(errp, "could not create channel id\n");
 766         goto err_resolve_create_id;
 767     }
 768
 769     snprintf(port_str, 16, "%d", rdma->port);
 770     port_str[15] = '\0';
 771
 772     ret = getaddrinfo(rdma->host, port_str, NULL, &res);
 773     if (ret < 0) {
 774         ERROR(errp, "could not getaddrinfo address %s\n", rdma->host);
 775         goto err_resolve_get_addr;
 776     }
 777
 778     inet_ntop(af, &((struct sockaddr_in *) res->ai_addr)->sin_addr,
 779                                 ip, sizeof ip);
 780     DPRINTF("%s => %s\n", rdma->host, ip);
 781
 782     /* resolve the first address */
 783     ret = rdma_resolve_addr(rdma->cm_id, NULL, res->ai_addr,
 784             RDMA_RESOLVE_TIMEOUT_MS);
 785     if (ret) {
 786         ERROR(errp, "could not resolve address %s\n", rdma->host);
 787         goto err_resolve_get_addr;
 788     }
 789
 790     qemu_rdma_dump_gid("source_resolve_addr", rdma->cm_id);
 791
 792     ret = rdma_get_cm_event(rdma->channel, &cm_event);
 793     if (ret) {
 794         ERROR(errp, "could not perform event_addr_resolved\n");
 795         goto err_resolve_get_addr;
 796     }
 797
 798     if (cm_event->event != RDMA_CM_EVENT_ADDR_RESOLVED) {
 799         ERROR(errp, "result not equal to event_addr_resolved %s\n",
 800                 rdma_event_str(cm_event->event));
 801         perror("rdma_resolve_addr");
 802         goto err_resolve_get_addr;
 803     }
 804     rdma_ack_cm_event(cm_event);
 805
 806     /* resolve route */
 807     ret = rdma_resolve_route(rdma->cm_id, RDMA_RESOLVE_TIMEOUT_MS);
 808     if (ret) {
 809         ERROR(errp, "could not resolve rdma route\n");
 810         goto err_resolve_get_addr;
 811     }
 812
 813     ret = rdma_get_cm_event(rdma->channel, &cm_event);
 814     if (ret) {
 815         ERROR(errp, "could not perform event_route_resolved\n");
 816         goto err_resolve_get_addr;
 817     }
 818     if (cm_event->event != RDMA_CM_EVENT_ROUTE_RESOLVED) {
 819         ERROR(errp, "result not equal to event_route_resolved: %s\n",
 820                         rdma_event_str(cm_event->event));
 821         rdma_ack_cm_event(cm_event);
 822         goto err_resolve_get_addr;
 823     }
 824     rdma_ack_cm_event(cm_event);
 825     rdma->verbs = rdma->cm_id->verbs;
 826     qemu_rdma_dump_id("source_resolve_host", rdma->cm_id->verbs);
 827     qemu_rdma_dump_gid("source_resolve_host", rdma->cm_id);
 828     return 0;
 829
 830 err_resolve_get_addr:
 831     rdma_destroy_id(rdma->cm_id);
 832     rdma->cm_id = NULL;
 833 err_resolve_create_id:
 834     rdma_destroy_event_channel(rdma->channel);
 835     rdma->channel = NULL;
 836
 837     return -1;
 838 }
 839
 840 /*
 841  * Create protection domain and completion queues
 842  */
 843 static int qemu_rdma_alloc_pd_cq(RDMAContext *rdma)
 844 {
 845     /* allocate pd */
 846     rdma->pd = ibv_alloc_pd(rdma->verbs);
 847     if (!rdma->pd) {
 848         fprintf(stderr, "failed to allocate protection domain\n");
 849         return -1;
 850     }
 851
 852     /* create completion channel */
 853     rdma->comp_channel = ibv_create_comp_channel(rdma->verbs);
 854     if (!rdma->comp_channel) {
 855         fprintf(stderr, "failed to allocate completion channel\n");
 856         goto err_alloc_pd_cq;
 857     }
 858
 859     /*
 860      * Completion queue can be filled by both read and write work requests,
 861      * so must reflect the sum of both possible queue sizes.
 862      */
 863     rdma->cq = ibv_create_cq(rdma->verbs, (RDMA_SIGNALED_SEND_MAX * 3),
 864             NULL, rdma->comp_channel, 0);
 865     if (!rdma->cq) {
 866         fprintf(stderr, "failed to allocate completion queue\n");
 867         goto err_alloc_pd_cq;
 868     }
 869
 870     return 0;
 871
 872 err_alloc_pd_cq:
 873     if (rdma->pd) {
 874         ibv_dealloc_pd(rdma->pd);
 875     }
 876     if (rdma->comp_channel) {
 877         ibv_destroy_comp_channel(rdma->comp_channel);
 878     }
 879     rdma->pd = NULL;
 880     rdma->comp_channel = NULL;
 881     return -1;
 882
 883 }
 884
 885 /*
 886  * Create queue pairs.
 887  */
 888 static int qemu_rdma_alloc_qp(RDMAContext *rdma)
 889 {
 890     struct ibv_qp_init_attr attr = { 0 };
 891     int ret;
 892
 893     attr.cap.max_send_wr = RDMA_SIGNALED_SEND_MAX;
 894     attr.cap.max_recv_wr = 3;
 895     attr.cap.max_send_sge = 1;
 896     attr.cap.max_recv_sge = 1;
 897     attr.send_cq = rdma->cq;
 898     attr.recv_cq = rdma->cq;
 899     attr.qp_type = IBV_QPT_RC;
 900
 901     ret = rdma_create_qp(rdma->cm_id, rdma->pd, &attr);
 902     if (ret) {
 903         return -1;
 904     }
 905
 906     rdma->qp = rdma->cm_id->qp;
 907     return 0;
 908 }
 909
 910 static int qemu_rdma_reg_whole_ram_blocks(RDMAContext *rdma)
 911 {
 912     int i;
 913     RDMALocalBlocks *local = &rdma->local_ram_blocks;
 914
 915     for (i = 0; i < local->nb_blocks; i++) {
 916         local->block[i].mr =
 917             ibv_reg_mr(rdma->pd,
 918                     local->block[i].local_host_addr,
 919                     local->block[i].length,
 920                     IBV_ACCESS_LOCAL_WRITE |
 921                     IBV_ACCESS_REMOTE_WRITE
 922                     );
 923         if (!local->block[i].mr) {
 924             perror("Failed to register local dest ram block!\n");
 925             break;
 926         }
 927         rdma->total_registrations++;
 928     }
 929
 930     if (i >= local->nb_blocks) {
 931         return 0;
 932     }
 933
 934     for (i--; i >= 0; i--) {
 935         ibv_dereg_mr(local->block[i].mr);
 936         rdma->total_registrations--;
 937     }
 938
 939     return -1;
 940
 941 }
 942
 943 /*
 944  * Find the ram block that corresponds to the page requested to be
 945  * transmitted by QEMU.
 946  *
 947  * Once the block is found, also identify which 'chunk' within that
 948  * block that the page belongs to.
 949  *
 950  * This search cannot fail or the migration will fail.
 951  */
 952 static int qemu_rdma_search_ram_block(RDMAContext *rdma,
 953                                       uint64_t block_offset,
 954                                       uint64_t offset,
 955                                       uint64_t length,
 956                                       uint64_t *block_index,
 957                                       uint64_t *chunk_index)
 958 {
 959     uint64_t current_addr = block_offset + offset;
 960     RDMALocalBlock *block = g_hash_table_lookup(rdma->blockmap,
 961                                                 (void *) block_offset);
 962     assert(block);
 963     assert(current_addr >= block->offset);
 964     assert((current_addr + length) <= (block->offset + block->length));
 965
 966     *block_index = block->index;
 967     *chunk_index = ram_chunk_index(block->local_host_addr,
 968                 block->local_host_addr + (current_addr - block->offset));
 969
 970     return 0;
 971 }
 972
 973 /*
 974  * Register a chunk with IB. If the chunk was already registered
 975  * previously, then skip.
 976  *
 977  * Also return the keys associated with the registration needed
 978  * to perform the actual RDMA operation.
 979  */
 980 static int qemu_rdma_register_and_get_keys(RDMAContext *rdma,
 981         RDMALocalBlock *block, uint8_t *host_addr,
 982         uint32_t *lkey, uint32_t *rkey, int chunk,
 983         uint8_t *chunk_start, uint8_t *chunk_end)
 984 {
 985     if (block->mr) {
 986         if (lkey) {
 987             *lkey = block->mr->lkey;
 988         }
 989         if (rkey) {
 990             *rkey = block->mr->rkey;
 991         }
 992         return 0;
 993     }
 994
 995     /* allocate memory to store chunk MRs */
 996     if (!block->pmr) {
 997         block->pmr = g_malloc0(block->nb_chunks * sizeof(struct ibv_mr *));
 998         if (!block->pmr) {
 999             return -1;
1000         }
1001     }
1002
1003     /*
1004      * If 'rkey', then we're the destination, so grant access to the source.
1005      *
1006      * If 'lkey', then we're the source VM, so grant access only to ourselves.
1007      */
1008     if (!block->pmr[chunk]) {
1009         uint64_t len = chunk_end - chunk_start;
1010
1011         DDPRINTF("Registering %" PRIu64 " bytes @ %p\n",
1012                  len, chunk_start);
1013
1014         block->pmr[chunk] = ibv_reg_mr(rdma->pd,
1015                 chunk_start, len,
1016                 (rkey ? (IBV_ACCESS_LOCAL_WRITE |
1017                         IBV_ACCESS_REMOTE_WRITE) : 0));
1018
1019         if (!block->pmr[chunk]) {
1020             perror("Failed to register chunk!");
1021             fprintf(stderr, "Chunk details: block: %d chunk index %d"
1022                             " start %" PRIu64 " end %" PRIu64 " host %" PRIu64
1023                             " local %" PRIu64 " registrations: %d\n",
1024                             block->index, chunk, (uint64_t) chunk_start,
1025                             (uint64_t) chunk_end, (uint64_t) host_addr,
1026                             (uint64_t) block->local_host_addr,
1027                             rdma->total_registrations);
1028             return -1;
1029         }
1030         rdma->total_registrations++;
1031     }
1032
1033     if (lkey) {
1034         *lkey = block->pmr[chunk]->lkey;
1035     }
1036     if (rkey) {
1037         *rkey = block->pmr[chunk]->rkey;
1038     }
1039     return 0;
1040 }
1041
1042 /*
1043  * Register (at connection time) the memory used for control
1044  * channel messages.
1045  */
1046 static int qemu_rdma_reg_control(RDMAContext *rdma, int idx)
1047 {
1048     rdma->wr_data[idx].control_mr = ibv_reg_mr(rdma->pd,
1049             rdma->wr_data[idx].control, RDMA_CONTROL_MAX_BUFFER,
1050             IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE);
1051     if (rdma->wr_data[idx].control_mr) {
1052         rdma->total_registrations++;
1053         return 0;
1054     }
1055     fprintf(stderr, "qemu_rdma_reg_control failed!\n");
1056     return -1;
1057 }
1058
1059 const char *print_wrid(int wrid)
1060 {
1061     if (wrid >= RDMA_WRID_RECV_CONTROL) {
1062         return wrid_desc[RDMA_WRID_RECV_CONTROL];
1063     }
1064     return wrid_desc[wrid];
1065 }
1066
1067 /*
1068  * RDMA requires memory registration (mlock/pinning), but this is not good for
1069  * overcommitment.
1070  *
 * In preparation for the future where LRU information or workload-specific
 * writable working set memory access behavior is available to QEMU,
 * it would be nice to have in place the ability to UN-register/UN-pin
 * particular memory regions from the RDMA hardware when it is determined that
 * those regions of memory will likely not be accessed again in the near future.
1076  *
1077  * While we do not yet have such information right now, the following
1078  * compile-time option allows us to perform a non-optimized version of this
1079  * behavior.
1080  *
1081  * By uncommenting this option, you will cause *all* RDMA transfers to be
1082  * unregistered immediately after the transfer completes on both sides of the
1083  * connection. This has no effect in 'rdma-pin-all' mode, only regular mode.
1084  *
1085  * This will have a terrible impact on migration performance, so until future
1086  * workload information or LRU information is available, do not attempt to use
1087  * this feature except for basic testing.
1088  */
1089 //#define RDMA_UNREGISTRATION_EXAMPLE
1090
1091 /*
1092  * Perform a non-optimized memory unregistration after every transfer
1093  * for demonsration purposes, only if pin-all is not requested.
1094  *
1095  * Potential optimizations:
1096  * 1. Start a new thread to run this function continuously
1097         - for bit clearing
1098         - and for receipt of unregister messages
1099  * 2. Use an LRU.
1100  * 3. Use workload hints.
1101  */
1102 static int qemu_rdma_unregister_waiting(RDMAContext *rdma)
1103 {
1104     while (rdma->unregistrations[rdma->unregister_current]) {
1105         int ret;
1106         uint64_t wr_id = rdma->unregistrations[rdma->unregister_current];
1107         uint64_t chunk =
1108             (wr_id & RDMA_WRID_CHUNK_MASK) >> RDMA_WRID_CHUNK_SHIFT;
1109         uint64_t index =
1110             (wr_id & RDMA_WRID_BLOCK_MASK) >> RDMA_WRID_BLOCK_SHIFT;
1111         RDMALocalBlock *block =
1112             &(rdma->local_ram_blocks.block[index]);
1113         RDMARegister reg = { .current_index = index };
1114         RDMAControlHeader resp = { .type = RDMA_CONTROL_UNREGISTER_FINISHED,
1115                                  };
1116         RDMAControlHeader head = { .len = sizeof(RDMARegister),
1117                                    .type = RDMA_CONTROL_UNREGISTER_REQUEST,
1118                                    .repeat = 1,
1119                                  };
1120
1121         DDPRINTF("Processing unregister for chunk: %" PRIu64
1122                  " at position %d\n", chunk, rdma->unregister_current);
1123
1124         rdma->unregistrations[rdma->unregister_current] = 0;
1125         rdma->unregister_current++;
1126
1127         if (rdma->unregister_current == RDMA_SIGNALED_SEND_MAX) {
1128             rdma->unregister_current = 0;
1129         }
1130
1131
1132         /*
1133          * Unregistration is speculative (because migration is single-threaded
1134          * and we cannot break the protocol's inifinband message ordering).
1135          * Thus, if the memory is currently being used for transmission,
1136          * then abort the attempt to unregister and try again
1137          * later the next time a completion is received for this memory.
1138          */
1139         clear_bit(chunk, block->unregister_bitmap);
1140
1141         if (test_bit(chunk, block->transit_bitmap)) {
1142             DDPRINTF("Cannot unregister inflight chunk: %" PRIu64 "\n", chunk);
1143             continue;
1144         }
1145
1146         DDPRINTF("Sending unregister for chunk: %" PRIu64 "\n", chunk);
1147
1148         ret = ibv_dereg_mr(block->pmr[chunk]);
1149         block->pmr[chunk] = NULL;
1150         block->remote_keys[chunk] = 0;
1151
1152         if (ret != 0) {
1153             perror("unregistration chunk failed");
1154             return -ret;
1155         }
1156         rdma->total_registrations--;
1157
1158         reg.key.chunk = chunk;
1159         register_to_network(&reg);
1160         ret = qemu_rdma_exchange_send(rdma, &head, (uint8_t *) &reg,
1161                                 &resp, NULL, NULL);
1162         if (ret < 0) {
1163             return ret;
1164         }
1165
1166         DDPRINTF("Unregister for chunk: %" PRIu64 " complete.\n", chunk);
1167     }
1168
1169     return 0;
1170 }
1171
1172 static uint64_t qemu_rdma_make_wrid(uint64_t wr_id, uint64_t index,
1173                                          uint64_t chunk)
1174 {
1175     uint64_t result = wr_id & RDMA_WRID_TYPE_MASK;
1176
1177     result |= (index << RDMA_WRID_BLOCK_SHIFT);
1178     result |= (chunk << RDMA_WRID_CHUNK_SHIFT);
1179
1180     return result;
1181 }
1182
1183 /*
1184  * Set bit for unregistration in the next iteration.
1185  * We cannot transmit right here, but will unpin later.
1186  */
1187 static void qemu_rdma_signal_unregister(RDMAContext *rdma, uint64_t index,
1188                                         uint64_t chunk, uint64_t wr_id)
1189 {
1190     if (rdma->unregistrations[rdma->unregister_next] != 0) {
1191         fprintf(stderr, "rdma migration: queue is full!\n");
1192     } else {
1193         RDMALocalBlock *block = &(rdma->local_ram_blocks.block[index]);
1194
1195         if (!test_and_set_bit(chunk, block->unregister_bitmap)) {
1196             DDPRINTF("Appending unregister chunk %" PRIu64
1197                     " at position %d\n", chunk, rdma->unregister_next);
1198
1199             rdma->unregistrations[rdma->unregister_next++] =
1200                     qemu_rdma_make_wrid(wr_id, index, chunk);
1201
1202             if (rdma->unregister_next == RDMA_SIGNALED_SEND_MAX) {
1203                 rdma->unregister_next = 0;
1204             }
1205         } else {
1206             DDPRINTF("Unregister chunk %" PRIu64 " already in queue.\n",
1207                     chunk);
1208         }
1209     }
1210 }
1211
1212 /*
1213  * Consult the connection manager to see a work request
1214  * (of any kind) has completed.
1215  * Return the work request ID that completed.
1216  */
1217 static uint64_t qemu_rdma_poll(RDMAContext *rdma, uint64_t *wr_id_out)
1218 {
1219     int ret;
1220     struct ibv_wc wc;
1221     uint64_t wr_id;
1222
1223     ret = ibv_poll_cq(rdma->cq, 1, &wc);
1224
1225     if (!ret) {
1226         *wr_id_out = RDMA_WRID_NONE;
1227         return 0;
1228     }
1229
1230     if (ret < 0) {
1231         fprintf(stderr, "ibv_poll_cq return %d!\n", ret);
1232         return ret;
1233     }
1234
1235     wr_id = wc.wr_id & RDMA_WRID_TYPE_MASK;
1236
1237     if (wc.status != IBV_WC_SUCCESS) {
1238         fprintf(stderr, "ibv_poll_cq wc.status=%d %s!\n",
1239                         wc.status, ibv_wc_status_str(wc.status));
1240         fprintf(stderr, "ibv_poll_cq wrid=%s!\n", wrid_desc[wr_id]);
1241
1242         return -1;
1243     }
1244
1245     if (rdma->control_ready_expected &&
1246         (wr_id >= RDMA_WRID_RECV_CONTROL)) {
1247         DDDPRINTF("completion %s #%" PRId64 " received (%" PRId64 ")"
1248                   " left %d\n", wrid_desc[RDMA_WRID_RECV_CONTROL],
1249                   wr_id - RDMA_WRID_RECV_CONTROL, wr_id, rdma->nb_sent);
1250         rdma->control_ready_expected = 0;
1251     }
1252
1253     if (wr_id == RDMA_WRID_RDMA_WRITE) {
1254         uint64_t chunk =
1255             (wc.wr_id & RDMA_WRID_CHUNK_MASK) >> RDMA_WRID_CHUNK_SHIFT;
1256         uint64_t index =
1257             (wc.wr_id & RDMA_WRID_BLOCK_MASK) >> RDMA_WRID_BLOCK_SHIFT;
1258         RDMALocalBlock *block = &(rdma->local_ram_blocks.block[index]);
1259
1260         DDDPRINTF("completions %s (%" PRId64 ") left %d, "
1261                  "block %" PRIu64 ", chunk: %" PRIu64 " %p %p\n",
1262                  print_wrid(wr_id), wr_id, rdma->nb_sent, index, chunk,
1263                  block->local_host_addr, (void *)block->remote_host_addr);
1264
1265         clear_bit(chunk, block->transit_bitmap);
1266
1267         if (rdma->nb_sent > 0) {
1268             rdma->nb_sent--;
1269         }
1270
1271         if (!rdma->pin_all) {
1272             /*
1273              * FYI: If one wanted to signal a specific chunk to be unregistered
1274              * using LRU or workload-specific information, this is the function
1275              * you would call to do so. That chunk would then get asynchronously
1276              * unregistered later.
1277              */
1278 #ifdef RDMA_UNREGISTRATION_EXAMPLE
1279             qemu_rdma_signal_unregister(rdma, index, chunk, wc.wr_id);
1280 #endif
1281         }
1282     } else {
1283         DDDPRINTF("other completion %s (%" PRId64 ") received left %d\n",
1284             print_wrid(wr_id), wr_id, rdma->nb_sent);
1285     }
1286
1287     *wr_id_out = wc.wr_id;
1288
1289     return  0;
1290 }
1291
1292 /*
1293  * Block until the next work request has completed.
1294  *
1295  * First poll to see if a work request has already completed,
1296  * otherwise block.
1297  *
1298  * If we encounter completed work requests for IDs other than
1299  * the one we're interested in, then that's generally an error.
1300  *
1301  * The only exception is actual RDMA Write completions. These
1302  * completions only need to be recorded, but do not actually
1303  * need further processing.
1304  */
1305 static int qemu_rdma_block_for_wrid(RDMAContext *rdma, int wrid_requested)
1306 {
1307     int num_cq_events = 0, ret = 0;
1308     struct ibv_cq *cq;
1309     void *cq_ctx;
1310     uint64_t wr_id = RDMA_WRID_NONE, wr_id_in;
1311
1312     if (ibv_req_notify_cq(rdma->cq, 0)) {
1313         return -1;
1314     }
1315     /* poll cq first */
1316     while (wr_id != wrid_requested) {
1317         ret = qemu_rdma_poll(rdma, &wr_id_in);
1318         if (ret < 0) {
1319             return ret;
1320         }
1321
1322         wr_id = wr_id_in & RDMA_WRID_TYPE_MASK;
1323
1324         if (wr_id == RDMA_WRID_NONE) {
1325             break;
1326         }
1327         if (wr_id != wrid_requested) {
1328             DDDPRINTF("A Wanted wrid %s (%d) but got %s (%" PRIu64 ")\n",
1329                 print_wrid(wrid_requested),
1330                 wrid_requested, print_wrid(wr_id), wr_id);
1331         }
1332     }
1333
1334     if (wr_id == wrid_requested) {
1335         return 0;
1336     }
1337
1338     while (1) {
1339         /*
1340          * Coroutine doesn't start until process_incoming_migration()
1341          * so don't yield unless we know we're running inside of a coroutine.
1342          */
1343         if (rdma->migration_started_on_destination) {
1344             yield_until_fd_readable(rdma->comp_channel->fd);
1345         }
1346
1347         if (ibv_get_cq_event(rdma->comp_channel, &cq, &cq_ctx)) {
1348             perror("ibv_get_cq_event");
1349             goto err_block_for_wrid;
1350         }
1351
1352         num_cq_events++;
1353
1354         if (ibv_req_notify_cq(cq, 0)) {
1355             goto err_block_for_wrid;
1356         }
1357
1358         while (wr_id != wrid_requested) {
1359             ret = qemu_rdma_poll(rdma, &wr_id_in);
1360             if (ret < 0) {
1361                 goto err_block_for_wrid;
1362             }
1363
1364             wr_id = wr_id_in & RDMA_WRID_TYPE_MASK;
1365
1366             if (wr_id == RDMA_WRID_NONE) {
1367                 break;
1368             }
1369             if (wr_id != wrid_requested) {
1370                 DDDPRINTF("B Wanted wrid %s (%d) but got %s (%" PRIu64 ")\n",
1371                     print_wrid(wrid_requested), wrid_requested,
1372                     print_wrid(wr_id), wr_id);
1373             }
1374         }
1375
1376         if (wr_id == wrid_requested) {
1377             goto success_block_for_wrid;
1378         }
1379     }
1380
1381 success_block_for_wrid:
1382     if (num_cq_events) {
1383         ibv_ack_cq_events(cq, num_cq_events);
1384     }
1385     return 0;
1386
1387 err_block_for_wrid:
1388     if (num_cq_events) {
1389         ibv_ack_cq_events(cq, num_cq_events);
1390     }
1391     return ret;
1392 }
1393
1394 /*
1395  * Post a SEND message work request for the control channel
1396  * containing some data and block until the post completes.
1397  */
1398 static int qemu_rdma_post_send_control(RDMAContext *rdma, uint8_t *buf,
1399                                        RDMAControlHeader *head)
1400 {
1401     int ret = 0;
1402     RDMAWorkRequestData *wr = &rdma->wr_data[RDMA_WRID_MAX];
1403     struct ibv_send_wr *bad_wr;
1404     struct ibv_sge sge = {
1405                            .addr = (uint64_t)(wr->control),
1406                            .length = head->len + sizeof(RDMAControlHeader),
1407                            .lkey = wr->control_mr->lkey,
1408                          };
1409     struct ibv_send_wr send_wr = {
1410                                    .wr_id = RDMA_WRID_SEND_CONTROL,
1411                                    .opcode = IBV_WR_SEND,
1412                                    .send_flags = IBV_SEND_SIGNALED,
1413                                    .sg_list = &sge,
1414                                    .num_sge = 1,
1415                                 };
1416
1417     DDDPRINTF("CONTROL: sending %s..\n", control_desc[head->type]);
1418
1419     /*
1420      * We don't actually need to do a memcpy() in here if we used
1421      * the "sge" properly, but since we're only sending control messages
1422      * (not RAM in a performance-critical path), then its OK for now.
1423      *
1424      * The copy makes the RDMAControlHeader simpler to manipulate
1425      * for the time being.
1426      */
1427     memcpy(wr->control, head, sizeof(RDMAControlHeader));
1428     control_to_network((void *) wr->control);
1429
1430     if (buf) {
1431         memcpy(wr->control + sizeof(RDMAControlHeader), buf, head->len);
1432     }
1433
1434
1435     if (ibv_post_send(rdma->qp, &send_wr, &bad_wr)) {
1436         return -1;
1437     }
1438
1439     if (ret < 0) {
1440         fprintf(stderr, "Failed to use post IB SEND for control!\n");
1441         return ret;
1442     }
1443
1444     ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_SEND_CONTROL);
1445     if (ret < 0) {
1446         fprintf(stderr, "rdma migration: send polling control error!\n");
1447     }
1448
1449     return ret;
1450 }
1451
1452 /*
1453  * Post a RECV work request in anticipation of some future receipt
1454  * of data on the control channel.
1455  */
1456 static int qemu_rdma_post_recv_control(RDMAContext *rdma, int idx)
1457 {
1458     struct ibv_recv_wr *bad_wr;
1459     struct ibv_sge sge = {
1460                             .addr = (uint64_t)(rdma->wr_data[idx].control),
1461                             .length = RDMA_CONTROL_MAX_BUFFER,
1462                             .lkey = rdma->wr_data[idx].control_mr->lkey,
1463                          };
1464
1465     struct ibv_recv_wr recv_wr = {
1466                                     .wr_id = RDMA_WRID_RECV_CONTROL + idx,
1467                                     .sg_list = &sge,
1468                                     .num_sge = 1,
1469                                  };
1470
1471
1472     if (ibv_post_recv(rdma->qp, &recv_wr, &bad_wr)) {
1473         return -1;
1474     }
1475
1476     return 0;
1477 }
1478
1479 /*
1480  * Block and wait for a RECV control channel message to arrive.
1481  */
1482 static int qemu_rdma_exchange_get_response(RDMAContext *rdma,
1483                 RDMAControlHeader *head, int expecting, int idx)
1484 {
1485     int ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RECV_CONTROL + idx);
1486
1487     if (ret < 0) {
1488         fprintf(stderr, "rdma migration: recv polling control error!\n");
1489         return ret;
1490     }
1491
1492     network_to_control((void *) rdma->wr_data[idx].control);
1493     memcpy(head, rdma->wr_data[idx].control, sizeof(RDMAControlHeader));
1494
1495     DDDPRINTF("CONTROL: %s receiving...\n", control_desc[expecting]);
1496
1497     if (expecting == RDMA_CONTROL_NONE) {
1498         DDDPRINTF("Surprise: got %s (%d)\n",
1499                   control_desc[head->type], head->type);
1500     } else if (head->type != expecting || head->type == RDMA_CONTROL_ERROR) {
1501         fprintf(stderr, "Was expecting a %s (%d) control message"
1502                 ", but got: %s (%d), length: %d\n",
1503                 control_desc[expecting], expecting,
1504                 control_desc[head->type], head->type, head->len);
1505         return -EIO;
1506     }
1507
1508     return 0;
1509 }
1510
1511 /*
1512  * When a RECV work request has completed, the work request's
1513  * buffer is pointed at the header.
1514  *
1515  * This will advance the pointer to the data portion
1516  * of the control message of the work request's buffer that
1517  * was populated after the work request finished.
1518  */
1519 static void qemu_rdma_move_header(RDMAContext *rdma, int idx,
1520                                   RDMAControlHeader *head)
1521 {
1522     rdma->wr_data[idx].control_len = head->len;
1523     rdma->wr_data[idx].control_curr =
1524         rdma->wr_data[idx].control + sizeof(RDMAControlHeader);
1525 }
1526
1527 /*
1528  * This is an 'atomic' high-level operation to deliver a single, unified
1529  * control-channel message.
1530  *
1531  * Additionally, if the user is expecting some kind of reply to this message,
1532  * they can request a 'resp' response message be filled in by posting an
1533  * additional work request on behalf of the user and waiting for an additional
1534  * completion.
1535  *
1536  * The extra (optional) response is used during registration to us from having
1537  * to perform an *additional* exchange of message just to provide a response by
1538  * instead piggy-backing on the acknowledgement.
1539  */
1540 static int qemu_rdma_exchange_send(RDMAContext *rdma, RDMAControlHeader *head,
1541                                    uint8_t *data, RDMAControlHeader *resp,
1542                                    int *resp_idx,
1543                                    int (*callback)(RDMAContext *rdma))
1544 {
1545     int ret = 0;
1546
1547     /*
1548      * Wait until the dest is ready before attempting to deliver the message
1549      * by waiting for a READY message.
1550      */
1551     if (rdma->control_ready_expected) {
1552         RDMAControlHeader resp;
1553         ret = qemu_rdma_exchange_get_response(rdma,
1554                                     &resp, RDMA_CONTROL_READY, RDMA_WRID_READY);
1555         if (ret < 0) {
1556             return ret;
1557         }
1558     }
1559
1560     /*
1561      * If the user is expecting a response, post a WR in anticipation of it.
1562      */
1563     if (resp) {
1564         ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_DATA);
1565         if (ret) {
1566             fprintf(stderr, "rdma migration: error posting"
1567                     " extra control recv for anticipated result!");
1568             return ret;
1569         }
1570     }
1571
1572     /*
1573      * Post a WR to replace the one we just consumed for the READY message.
1574      */
1575     ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY);
1576     if (ret) {
1577         fprintf(stderr, "rdma migration: error posting first control recv!");
1578         return ret;
1579     }
1580
1581     /*
1582      * Deliver the control message that was requested.
1583      */
1584     ret = qemu_rdma_post_send_control(rdma, data, head);
1585
1586     if (ret < 0) {
1587         fprintf(stderr, "Failed to send control buffer!\n");
1588         return ret;
1589     }
1590
1591     /*
1592      * If we're expecting a response, block and wait for it.
1593      */
1594     if (resp) {
1595         if (callback) {
1596             DDPRINTF("Issuing callback before receiving response...\n");
1597             ret = callback(rdma);
1598             if (ret < 0) {
1599                 return ret;
1600             }
1601         }
1602
1603         DDPRINTF("Waiting for response %s\n", control_desc[resp->type]);
1604         ret = qemu_rdma_exchange_get_response(rdma, resp,
1605                                               resp->type, RDMA_WRID_DATA);
1606
1607         if (ret < 0) {
1608             return ret;
1609         }
1610
1611         qemu_rdma_move_header(rdma, RDMA_WRID_DATA, resp);
1612         if (resp_idx) {
1613             *resp_idx = RDMA_WRID_DATA;
1614         }
1615         DDPRINTF("Response %s received.\n", control_desc[resp->type]);
1616     }
1617
1618     rdma->control_ready_expected = 1;
1619
1620     return 0;
1621 }
1622
1623 /*
1624  * This is an 'atomic' high-level operation to receive a single, unified
1625  * control-channel message.
1626  */
1627 static int qemu_rdma_exchange_recv(RDMAContext *rdma, RDMAControlHeader *head,
1628                                 int expecting)
1629 {
1630     RDMAControlHeader ready = {
1631                                 .len = 0,
1632                                 .type = RDMA_CONTROL_READY,
1633                                 .repeat = 1,
1634                               };
1635     int ret;
1636
1637     /*
1638      * Inform the source that we're ready to receive a message.
1639      */
1640     ret = qemu_rdma_post_send_control(rdma, NULL, &ready);
1641
1642     if (ret < 0) {
1643         fprintf(stderr, "Failed to send control buffer!\n");
1644         return ret;
1645     }
1646
1647     /*
1648      * Block and wait for the message.
1649      */
1650     ret = qemu_rdma_exchange_get_response(rdma, head,
1651                                           expecting, RDMA_WRID_READY);
1652
1653     if (ret < 0) {
1654         return ret;
1655     }
1656
1657     qemu_rdma_move_header(rdma, RDMA_WRID_READY, head);
1658
1659     /*
1660      * Post a new RECV work request to replace the one we just consumed.
1661      */
1662     ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY);
1663     if (ret) {
1664         fprintf(stderr, "rdma migration: error posting second control recv!");
1665         return ret;
1666     }
1667
1668     return 0;
1669 }
1670
1671 /*
1672  * Write an actual chunk of memory using RDMA.
1673  *
1674  * If we're using dynamic registration on the dest-side, we have to
1675  * send a registration command first.
1676  */
1677 static int qemu_rdma_write_one(QEMUFile *f, RDMAContext *rdma,
1678                                int current_index, uint64_t current_addr,
1679                                uint64_t length)
1680 {
1681     struct ibv_sge sge;
1682     struct ibv_send_wr send_wr = { 0 };
1683     struct ibv_send_wr *bad_wr;
1684     int reg_result_idx, ret, count = 0;
1685     uint64_t chunk, chunks;
1686     uint8_t *chunk_start, *chunk_end;
1687     RDMALocalBlock *block = &(rdma->local_ram_blocks.block[current_index]);
1688     RDMARegister reg;
1689     RDMARegisterResult *reg_result;
1690     RDMAControlHeader resp = { .type = RDMA_CONTROL_REGISTER_RESULT };
1691     RDMAControlHeader head = { .len = sizeof(RDMARegister),
1692                                .type = RDMA_CONTROL_REGISTER_REQUEST,
1693                                .repeat = 1,
1694                              };
1695
1696 retry:
1697     sge.addr = (uint64_t)(block->local_host_addr +
1698                             (current_addr - block->offset));
1699     sge.length = length;
1700
1701     chunk = ram_chunk_index(block->local_host_addr, (uint8_t *) sge.addr);
1702     chunk_start = ram_chunk_start(block, chunk);
1703
1704     if (block->is_ram_block) {
1705         chunks = length / (1UL << RDMA_REG_CHUNK_SHIFT);
1706
1707         if (chunks && ((length % (1UL << RDMA_REG_CHUNK_SHIFT)) == 0)) {
1708             chunks--;
1709         }
1710     } else {
1711         chunks = block->length / (1UL << RDMA_REG_CHUNK_SHIFT);
1712
1713         if (chunks && ((block->length % (1UL << RDMA_REG_CHUNK_SHIFT)) == 0)) {
1714             chunks--;
1715         }
1716     }
1717
1718     DDPRINTF("Writing %" PRIu64 " chunks, (%" PRIu64 " MB)\n",
1719         chunks + 1, (chunks + 1) * (1UL << RDMA_REG_CHUNK_SHIFT) / 1024 / 1024);
1720
1721     chunk_end = ram_chunk_end(block, chunk + chunks);
1722
1723     if (!rdma->pin_all) {
1724 #ifdef RDMA_UNREGISTRATION_EXAMPLE
1725         qemu_rdma_unregister_waiting(rdma);
1726 #endif
1727     }
1728
1729     while (test_bit(chunk, block->transit_bitmap)) {
1730         (void)count;
1731         DDPRINTF("(%d) Not clobbering: block: %d chunk %" PRIu64
1732                 " current %" PRIu64 " len %" PRIu64 " %d %d\n",
1733                 count++, current_index, chunk,
1734                 sge.addr, length, rdma->nb_sent, block->nb_chunks);
1735
1736         ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RDMA_WRITE);
1737
1738         if (ret < 0) {
1739             fprintf(stderr, "Failed to Wait for previous write to complete "
1740                     "block %d chunk %" PRIu64
1741                     " current %" PRIu64 " len %" PRIu64 " %d\n",
1742                     current_index, chunk, sge.addr, length, rdma->nb_sent);
1743             return ret;
1744         }
1745     }
1746
1747     if (!rdma->pin_all || !block->is_ram_block) {
1748         if (!block->remote_keys[chunk]) {
1749             /*
1750              * This chunk has not yet been registered, so first check to see
1751              * if the entire chunk is zero. If so, tell the other size to
1752              * memset() + madvise() the entire chunk without RDMA.
1753              */
1754
1755             if (can_use_buffer_find_nonzero_offset((void *)sge.addr, length)
1756                    && buffer_find_nonzero_offset((void *)sge.addr,
1757                                                     length) == length) {
1758                 RDMACompress comp = {
1759                                         .offset = current_addr,
1760                                         .value = 0,
1761                                         .block_idx = current_index,
1762                                         .length = length,
1763                                     };
1764
1765                 head.len = sizeof(comp);
1766                 head.type = RDMA_CONTROL_COMPRESS;
1767
1768                 DDPRINTF("Entire chunk is zero, sending compress: %"
1769                     PRIu64 " for %d "
1770                     "bytes, index: %d, offset: %" PRId64 "...\n",
1771                     chunk, sge.length, current_index, current_addr);
1772
1773                 compress_to_network(&comp);
1774                 ret = qemu_rdma_exchange_send(rdma, &head,
1775                                 (uint8_t *) &comp, NULL, NULL, NULL);
1776
1777                 if (ret < 0) {
1778                     return -EIO;
1779                 }
1780
1781                 acct_update_position(f, sge.length, true);
1782
1783                 return 1;
1784             }
1785
1786             /*
1787              * Otherwise, tell other side to register.
1788              */
1789             reg.current_index = current_index;
1790             if (block->is_ram_block) {
1791                 reg.key.current_addr = current_addr;
1792             } else {
1793                 reg.key.chunk = chunk;
1794             }
1795             reg.chunks = chunks;
1796
1797             DDPRINTF("Sending registration request chunk %" PRIu64 " for %d "
1798                     "bytes, index: %d, offset: %" PRId64 "...\n",
1799                     chunk, sge.length, current_index, current_addr);
1800
1801             register_to_network(&reg);
1802             ret = qemu_rdma_exchange_send(rdma, &head, (uint8_t *) &reg,
1803                                     &resp, &reg_result_idx, NULL);
1804             if (ret < 0) {
1805                 return ret;
1806             }
1807
1808             /* try to overlap this single registration with the one we sent. */
1809             if (qemu_rdma_register_and_get_keys(rdma, block,
1810                                                 (uint8_t *) sge.addr,
1811                                                 &sge.lkey, NULL, chunk,
1812                                                 chunk_start, chunk_end)) {
1813                 fprintf(stderr, "cannot get lkey!\n");
1814                 return -EINVAL;
1815             }
1816
1817             reg_result = (RDMARegisterResult *)
1818                     rdma->wr_data[reg_result_idx].control_curr;
1819
1820             network_to_result(reg_result);
1821
1822             DDPRINTF("Received registration result:"
1823                     " my key: %x their key %x, chunk %" PRIu64 "\n",
1824                     block->remote_keys[chunk], reg_result->rkey, chunk);
1825
1826             block->remote_keys[chunk] = reg_result->rkey;
1827             block->remote_host_addr = reg_result->host_addr;
1828         } else {
1829             /* already registered before */
1830             if (qemu_rdma_register_and_get_keys(rdma, block,
1831                                                 (uint8_t *)sge.addr,
1832                                                 &sge.lkey, NULL, chunk,
1833                                                 chunk_start, chunk_end)) {
1834                 fprintf(stderr, "cannot get lkey!\n");
1835                 return -EINVAL;
1836             }
1837         }
1838
1839         send_wr.wr.rdma.rkey = block->remote_keys[chunk];
1840     } else {
1841         send_wr.wr.rdma.rkey = block->remote_rkey;
1842
1843         if (qemu_rdma_register_and_get_keys(rdma, block, (uint8_t *)sge.addr,
1844                                                      &sge.lkey, NULL, chunk,
1845                                                      chunk_start, chunk_end)) {
1846             fprintf(stderr, "cannot get lkey!\n");
1847             return -EINVAL;
1848         }
1849     }
1850
1851     /*
1852      * Encode the ram block index and chunk within this wrid.
1853      * We will use this information at the time of completion
1854      * to figure out which bitmap to check against and then which
1855      * chunk in the bitmap to look for.
1856      */
1857     send_wr.wr_id = qemu_rdma_make_wrid(RDMA_WRID_RDMA_WRITE,
1858                                         current_index, chunk);
1859
1860     send_wr.opcode = IBV_WR_RDMA_WRITE;
1861     send_wr.send_flags = IBV_SEND_SIGNALED;
1862     send_wr.sg_list = &sge;
1863     send_wr.num_sge = 1;
1864     send_wr.wr.rdma.remote_addr = block->remote_host_addr +
1865                                 (current_addr - block->offset);
1866
1867     DDDPRINTF("Posting chunk: %" PRIu64 ", addr: %lx"
1868               " remote: %lx, bytes %" PRIu32 "\n",
1869               chunk, sge.addr, send_wr.wr.rdma.remote_addr,
1870               sge.length);
1871
1872     /*
1873      * ibv_post_send() does not return negative error numbers,
1874      * per the specification they are positive - no idea why.
1875      */
1876     ret = ibv_post_send(rdma->qp, &send_wr, &bad_wr);
1877
1878     if (ret == ENOMEM) {
1879         DDPRINTF("send queue is full. wait a little....\n");
1880         ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RDMA_WRITE);
1881         if (ret < 0) {
1882             fprintf(stderr, "rdma migration: failed to make "
1883                             "room in full send queue! %d\n", ret);
1884             return ret;
1885         }
1886
1887         goto retry;
1888
1889     } else if (ret > 0) {
1890         perror("rdma migration: post rdma write failed");
1891         return -ret;
1892     }
1893
1894     set_bit(chunk, block->transit_bitmap);
1895     acct_update_position(f, sge.length, false);
1896     rdma->total_writes++;
1897
1898     return 0;
1899 }
1900
1901 /*
1902  * Push out any unwritten RDMA operations.
1903  *
1904  * We support sending out multiple chunks at the same time.
1905  * Not all of them need to get signaled in the completion queue.
1906  */
1907 static int qemu_rdma_write_flush(QEMUFile *f, RDMAContext *rdma)
1908 {
1909     int ret;
1910
1911     if (!rdma->current_length) {
1912         return 0;
1913     }
1914
1915     ret = qemu_rdma_write_one(f, rdma,
1916             rdma->current_index, rdma->current_addr, rdma->current_length);
1917
1918     if (ret < 0) {
1919         return ret;
1920     }
1921
1922     if (ret == 0) {
1923         rdma->nb_sent++;
1924         DDDPRINTF("sent total: %d\n", rdma->nb_sent);
1925     }
1926
1927     rdma->current_length = 0;
1928     rdma->current_addr = 0;
1929
1930     return 0;
1931 }
1932
1933 static inline int qemu_rdma_buffer_mergable(RDMAContext *rdma,
1934                     uint64_t offset, uint64_t len)
1935 {
1936     RDMALocalBlock *block =
1937         &(rdma->local_ram_blocks.block[rdma->current_index]);
1938     uint8_t *host_addr = block->local_host_addr + (offset - block->offset);
1939     uint8_t *chunk_end = ram_chunk_end(block, rdma->current_chunk);
1940
1941     if (rdma->current_length == 0) {
1942         return 0;
1943     }
1944
1945     /*
1946      * Only merge into chunk sequentially.
1947      */
1948     if (offset != (rdma->current_addr + rdma->current_length)) {
1949         return 0;
1950     }
1951
1952     if (rdma->current_index < 0) {
1953         return 0;
1954     }
1955
1956     if (offset < block->offset) {
1957         return 0;
1958     }
1959
1960     if ((offset + len) > (block->offset + block->length)) {
1961         return 0;
1962     }
1963
1964     if (rdma->current_chunk < 0) {
1965         return 0;
1966     }
1967
1968     if ((host_addr + len) > chunk_end) {
1969         return 0;
1970     }
1971
1972     return 1;
1973 }
1974
1975 /*
1976  * We're not actually writing here, but doing three things:
1977  *
1978  * 1. Identify the chunk the buffer belongs to.
1979  * 2. If the chunk is full or the buffer doesn't belong to the current
1980  *    chunk, then start a new chunk and flush() the old chunk.
1981  * 3. To keep the hardware busy, we also group chunks into batches
1982  *    and only require that a batch gets acknowledged in the completion
1983  *    qeueue instead of each individual chunk.
1984  */
1985 static int qemu_rdma_write(QEMUFile *f, RDMAContext *rdma,
1986                            uint64_t block_offset, uint64_t offset,
1987                            uint64_t len)
1988 {
1989     uint64_t current_addr = block_offset + offset;
1990     uint64_t index = rdma->current_index;
1991     uint64_t chunk = rdma->current_chunk;
1992     int ret;
1993
1994     /* If we cannot merge it, we flush the current buffer first. */
1995     if (!qemu_rdma_buffer_mergable(rdma, current_addr, len)) {
1996         ret = qemu_rdma_write_flush(f, rdma);
1997         if (ret) {
1998             return ret;
1999         }
2000         rdma->current_length = 0;
2001         rdma->current_addr = current_addr;
2002
2003         ret = qemu_rdma_search_ram_block(rdma, block_offset,
2004                                          offset, len, &index, &chunk);
2005         if (ret) {
2006             fprintf(stderr, "ram block search failed\n");
2007             return ret;
2008         }
2009         rdma->current_index = index;
2010         rdma->current_chunk = chunk;
2011     }
2012
2013     /* merge it */
2014     rdma->current_length += len;
2015
2016     /* flush it if buffer is too large */
2017     if (rdma->current_length >= RDMA_MERGE_MAX) {
2018         return qemu_rdma_write_flush(f, rdma);
2019     }
2020
2021     return 0;
2022 }
2023
2024 static void qemu_rdma_cleanup(RDMAContext *rdma)
2025 {
2026     struct rdma_cm_event *cm_event;
2027     int ret, idx;
2028
2029     if (rdma->cm_id) {
2030         if (rdma->error_state) {
2031             RDMAControlHeader head = { .len = 0,
2032                                        .type = RDMA_CONTROL_ERROR,
2033                                        .repeat = 1,
2034                                      };
2035             fprintf(stderr, "Early error. Sending error.\n");
2036             qemu_rdma_post_send_control(rdma, NULL, &head);
2037         }
2038
2039         ret = rdma_disconnect(rdma->cm_id);
2040         if (!ret) {
2041             DDPRINTF("waiting for disconnect\n");
2042             ret = rdma_get_cm_event(rdma->channel, &cm_event);
2043             if (!ret) {
2044                 rdma_ack_cm_event(cm_event);
2045             }
2046         }
2047         DDPRINTF("Disconnected.\n");
2048         rdma->cm_id = NULL;
2049     }
2050
2051     g_free(rdma->block);
2052     rdma->block = NULL;
2053
2054     for (idx = 0; idx <= RDMA_WRID_MAX; idx++) {
2055         if (rdma->wr_data[idx].control_mr) {
2056             rdma->total_registrations--;
2057             ibv_dereg_mr(rdma->wr_data[idx].control_mr);
2058         }
2059         rdma->wr_data[idx].control_mr = NULL;
2060     }
2061
2062     if (rdma->local_ram_blocks.block) {
2063         while (rdma->local_ram_blocks.nb_blocks) {
2064             __qemu_rdma_delete_block(rdma,
2065                     rdma->local_ram_blocks.block->offset);
2066         }
2067     }
2068
2069     if (rdma->qp) {
2070         ibv_destroy_qp(rdma->qp);
2071         rdma->qp = NULL;
2072     }
2073     if (rdma->cq) {
2074         ibv_destroy_cq(rdma->cq);
2075         rdma->cq = NULL;
2076     }
2077     if (rdma->comp_channel) {
2078         ibv_destroy_comp_channel(rdma->comp_channel);
2079         rdma->comp_channel = NULL;
2080     }
2081     if (rdma->pd) {
2082         ibv_dealloc_pd(rdma->pd);
2083         rdma->pd = NULL;
2084     }
2085     if (rdma->listen_id) {
2086         rdma_destroy_id(rdma->listen_id);
2087         rdma->listen_id = NULL;
2088     }
2089     if (rdma->cm_id) {
2090         rdma_destroy_id(rdma->cm_id);
2091         rdma->cm_id = NULL;
2092     }
2093     if (rdma->channel) {
2094         rdma_destroy_event_channel(rdma->channel);
2095         rdma->channel = NULL;
2096     }
2097 }
2098
2099
2100 static int qemu_rdma_source_init(RDMAContext *rdma, Error **errp, bool pin_all)
2101 {
2102     int ret, idx;
2103     Error *local_err = NULL, **temp = &local_err;
2104
2105     /*
2106      * Will be validated against destination's actual capabilities
2107      * after the connect() completes.
2108      */
2109     rdma->pin_all = pin_all;
2110
2111     ret = qemu_rdma_resolve_host(rdma, temp);
2112     if (ret) {
2113         goto err_rdma_source_init;
2114     }
2115
2116     ret = qemu_rdma_alloc_pd_cq(rdma);
2117     if (ret) {
2118         ERROR(temp, "rdma migration: error allocating pd and cq! Your mlock()"
2119                     " limits may be too low. Please check $ ulimit -a # and "
2120                     "search for 'ulimit -l' in the output\n");
2121         goto err_rdma_source_init;
2122     }
2123
2124     ret = qemu_rdma_alloc_qp(rdma);
2125     if (ret) {
2126         ERROR(temp, "rdma migration: error allocating qp!\n");
2127         goto err_rdma_source_init;
2128     }
2129
2130     ret = qemu_rdma_init_ram_blocks(rdma);
2131     if (ret) {
2132         ERROR(temp, "rdma migration: error initializing ram blocks!\n");
2133         goto err_rdma_source_init;
2134     }
2135
2136     for (idx = 0; idx <= RDMA_WRID_MAX; idx++) {
2137         ret = qemu_rdma_reg_control(rdma, idx);
2138         if (ret) {
2139             ERROR(temp, "rdma migration: error registering %d control!\n",
2140                                                             idx);
2141             goto err_rdma_source_init;
2142         }
2143     }
2144
2145     return 0;
2146
2147 err_rdma_source_init:
2148     error_propagate(errp, local_err);
2149     qemu_rdma_cleanup(rdma);
2150     return -1;
2151 }
2152
2153 static int qemu_rdma_connect(RDMAContext *rdma, Error **errp)
2154 {
2155     RDMACapabilities cap = {
2156                                 .version = RDMA_CONTROL_VERSION_CURRENT,
2157                                 .flags = 0,
2158                            };
2159     struct rdma_conn_param conn_param = { .initiator_depth = 2,
2160                                           .retry_count = 5,
2161                                           .private_data = &cap,
2162                                           .private_data_len = sizeof(cap),
2163                                         };
2164     struct rdma_cm_event *cm_event;
2165     int ret;
2166
2167     /*
2168      * Only negotiate the capability with destination if the user
2169      * on the source first requested the capability.
2170      */
2171     if (rdma->pin_all) {
2172         DPRINTF("Server pin-all memory requested.\n");
2173         cap.flags |= RDMA_CAPABILITY_PIN_ALL;
2174     }
2175
2176     caps_to_network(&cap);
2177
2178     ret = rdma_connect(rdma->cm_id, &conn_param);
2179     if (ret) {
2180         perror("rdma_connect");
2181         ERROR(errp, "connecting to destination!\n");
2182         rdma_destroy_id(rdma->cm_id);
2183         rdma->cm_id = NULL;
2184         goto err_rdma_source_connect;
2185     }
2186
2187     ret = rdma_get_cm_event(rdma->channel, &cm_event);
2188     if (ret) {
2189         perror("rdma_get_cm_event after rdma_connect");
2190         ERROR(errp, "connecting to destination!\n");
2191         rdma_ack_cm_event(cm_event);
2192         rdma_destroy_id(rdma->cm_id);
2193         rdma->cm_id = NULL;
2194         goto err_rdma_source_connect;
2195     }
2196
2197     if (cm_event->event != RDMA_CM_EVENT_ESTABLISHED) {
2198         perror("rdma_get_cm_event != EVENT_ESTABLISHED after rdma_connect");
2199         ERROR(errp, "connecting to destination!\n");
2200         rdma_ack_cm_event(cm_event);
2201         rdma_destroy_id(rdma->cm_id);
2202         rdma->cm_id = NULL;
2203         goto err_rdma_source_connect;
2204     }
2205
2206     memcpy(&cap, cm_event->param.conn.private_data, sizeof(cap));
2207     network_to_caps(&cap);
2208
2209     /*
2210      * Verify that the *requested* capabilities are supported by the destination
2211      * and disable them otherwise.
2212      */
2213     if (rdma->pin_all && !(cap.flags & RDMA_CAPABILITY_PIN_ALL)) {
2214         ERROR(errp, "Server cannot support pinning all memory. "
2215                         "Will register memory dynamically.\n");
2216         rdma->pin_all = false;
2217     }
2218
2219     DPRINTF("Pin all memory: %s\n", rdma->pin_all ? "enabled" : "disabled");
2220
2221     rdma_ack_cm_event(cm_event);
2222
2223     ret = qemu_rdma_post_recv_control(rdma, 0);
2224     if (ret) {
2225         ERROR(errp, "posting second control recv!\n");
2226         goto err_rdma_source_connect;
2227     }
2228
2229     rdma->control_ready_expected = 1;
2230     rdma->nb_sent = 0;
2231     return 0;
2232
2233 err_rdma_source_connect:
2234     qemu_rdma_cleanup(rdma);
2235     return -1;
2236 }
2237
2238 static int qemu_rdma_dest_init(RDMAContext *rdma, Error **errp)
2239 {
2240     int ret = -EINVAL, idx;
2241     int af = rdma->ipv6 ? PF_INET6 : PF_INET;
2242     struct sockaddr_in sin;
2243     struct rdma_cm_id *listen_id;
2244     char ip[40] = "unknown";
2245     struct addrinfo *res;
2246     char port_str[16];
2247
2248     for (idx = 0; idx <= RDMA_WRID_MAX; idx++) {
2249         rdma->wr_data[idx].control_len = 0;
2250         rdma->wr_data[idx].control_curr = NULL;
2251     }
2252
2253     if (rdma->host == NULL) {
2254         ERROR(errp, "RDMA host is not set!\n");
2255         rdma->error_state = -EINVAL;
2256         return -1;
2257     }
2258     /* create CM channel */
2259     rdma->channel = rdma_create_event_channel();
2260     if (!rdma->channel) {
2261         ERROR(errp, "could not create rdma event channel\n");
2262         rdma->error_state = -EINVAL;
2263         return -1;
2264     }
2265
2266     /* create CM id */
2267     ret = rdma_create_id(rdma->channel, &listen_id, NULL, RDMA_PS_TCP);
2268     if (ret) {
2269         ERROR(errp, "could not create cm_id!\n");
2270         goto err_dest_init_create_listen_id;
2271     }
2272
2273     memset(&sin, 0, sizeof(sin));
2274     sin.sin_family = af;
2275     sin.sin_port = htons(rdma->port);
2276     snprintf(port_str, 16, "%d", rdma->port);
2277     port_str[15] = '\0';
2278
2279     if (rdma->host && strcmp("", rdma->host)) {
2280         ret = getaddrinfo(rdma->host, port_str, NULL, &res);
2281         if (ret < 0) {
2282             ERROR(errp, "could not getaddrinfo address %s\n", rdma->host);
2283             goto err_dest_init_bind_addr;
2284         }
2285
2286
2287         inet_ntop(af, &((struct sockaddr_in *) res->ai_addr)->sin_addr,
2288                                     ip, sizeof ip);
2289     } else {
2290         ERROR(errp, "migration host and port not specified!\n");
2291         ret = -EINVAL;
2292         goto err_dest_init_bind_addr;
2293     }
2294
2295     DPRINTF("%s => %s\n", rdma->host, ip);
2296
2297     ret = rdma_bind_addr(listen_id, res->ai_addr);
2298     if (ret) {
2299         ERROR(errp, "Error: could not rdma_bind_addr!\n");
2300         goto err_dest_init_bind_addr;
2301     }
2302
2303     rdma->listen_id = listen_id;
2304     qemu_rdma_dump_gid("dest_init", listen_id);
2305     return 0;
2306
2307 err_dest_init_bind_addr:
2308     rdma_destroy_id(listen_id);
2309 err_dest_init_create_listen_id:
2310     rdma_destroy_event_channel(rdma->channel);
2311     rdma->channel = NULL;
2312     rdma->error_state = ret;
2313     return ret;
2314
2315 }
2316
2317 static void *qemu_rdma_data_init(const char *host_port, Error **errp)
2318 {
2319     RDMAContext *rdma = NULL;
2320     InetSocketAddress *addr;
2321
2322     if (host_port) {
2323         rdma = g_malloc0(sizeof(RDMAContext));
2324         memset(rdma, 0, sizeof(RDMAContext));
2325         rdma->current_index = -1;
2326         rdma->current_chunk = -1;
2327
2328         addr = inet_parse(host_port, NULL);
2329         if (addr != NULL) {
2330             rdma->port = atoi(addr->port);
2331             rdma->host = g_strdup(addr->host);
2332             rdma->ipv6 = addr->ipv6;
2333         } else {
2334             ERROR(errp, "bad RDMA migration address '%s'", host_port);
2335             g_free(rdma);
2336             return NULL;
2337         }
2338     }
2339
2340     return rdma;
2341 }
2342
2343 /*
2344  * QEMUFile interface to the control channel.
2345  * SEND messages for control only.
2346  * pc.ram is handled with regular RDMA messages.
2347  */
2348 static int qemu_rdma_put_buffer(void *opaque, const uint8_t *buf,
2349                                 int64_t pos, int size)
2350 {
2351     QEMUFileRDMA *r = opaque;
2352     QEMUFile *f = r->file;
2353     RDMAContext *rdma = r->rdma;
2354     size_t remaining = size;
2355     uint8_t * data = (void *) buf;
2356     int ret;
2357
2358     CHECK_ERROR_STATE();
2359
2360     /*
2361      * Push out any writes that
2362      * we're queued up for pc.ram.
2363      */
2364     ret = qemu_rdma_write_flush(f, rdma);
2365     if (ret < 0) {
2366         rdma->error_state = ret;
2367         return ret;
2368     }
2369
2370     while (remaining) {
2371         RDMAControlHeader head;
2372
2373         r->len = MIN(remaining, RDMA_SEND_INCREMENT);
2374         remaining -= r->len;
2375
2376         head.len = r->len;
2377         head.type = RDMA_CONTROL_QEMU_FILE;
2378
2379         ret = qemu_rdma_exchange_send(rdma, &head, data, NULL, NULL, NULL);
2380
2381         if (ret < 0) {
2382             rdma->error_state = ret;
2383             return ret;
2384         }
2385
2386         data += r->len;
2387     }
2388
2389     return size;
2390 }
2391
2392 static size_t qemu_rdma_fill(RDMAContext *rdma, uint8_t *buf,
2393                              int size, int idx)
2394 {
2395     size_t len = 0;
2396
2397     if (rdma->wr_data[idx].control_len) {
2398         DDDPRINTF("RDMA %" PRId64 " of %d bytes already in buffer\n",
2399                     rdma->wr_data[idx].control_len, size);
2400
2401         len = MIN(size, rdma->wr_data[idx].control_len);
2402         memcpy(buf, rdma->wr_data[idx].control_curr, len);
2403         rdma->wr_data[idx].control_curr += len;
2404         rdma->wr_data[idx].control_len -= len;
2405     }
2406
2407     return len;
2408 }
2409
2410 /*
2411  * QEMUFile interface to the control channel.
2412  * RDMA links don't use bytestreams, so we have to
2413  * return bytes to QEMUFile opportunistically.
2414  */
2415 static int qemu_rdma_get_buffer(void *opaque, uint8_t *buf,
2416                                 int64_t pos, int size)
2417 {
2418     QEMUFileRDMA *r = opaque;
2419     RDMAContext *rdma = r->rdma;
2420     RDMAControlHeader head;
2421     int ret = 0;
2422
2423     CHECK_ERROR_STATE();
2424
2425     /*
2426      * First, we hold on to the last SEND message we
2427      * were given and dish out the bytes until we run
2428      * out of bytes.
2429      */
2430     r->len = qemu_rdma_fill(r->rdma, buf, size, 0);
2431     if (r->len) {
2432         return r->len;
2433     }
2434
2435     /*
2436      * Once we run out, we block and wait for another
2437      * SEND message to arrive.
2438      */
2439     ret = qemu_rdma_exchange_recv(rdma, &head, RDMA_CONTROL_QEMU_FILE);
2440
2441     if (ret < 0) {
2442         rdma->error_state = ret;
2443         return ret;
2444     }
2445
2446     /*
2447      * SEND was received with new bytes, now try again.
2448      */
2449     return qemu_rdma_fill(r->rdma, buf, size, 0);
2450 }
2451
2452 /*
2453  * Block until all the outstanding chunks have been delivered by the hardware.
2454  */
2455 static int qemu_rdma_drain_cq(QEMUFile *f, RDMAContext *rdma)
2456 {
2457     int ret;
2458
2459     if (qemu_rdma_write_flush(f, rdma) < 0) {
2460         return -EIO;
2461     }
2462
2463     while (rdma->nb_sent) {
2464         ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RDMA_WRITE);
2465         if (ret < 0) {
2466             fprintf(stderr, "rdma migration: complete polling error!\n");
2467             return -EIO;
2468         }
2469     }
2470
2471     qemu_rdma_unregister_waiting(rdma);
2472
2473     return 0;
2474 }
2475
2476 static int qemu_rdma_close(void *opaque)
2477 {
2478     DPRINTF("Shutting down connection.\n");
2479     QEMUFileRDMA *r = opaque;
2480     if (r->rdma) {
2481         qemu_rdma_cleanup(r->rdma);
2482         g_free(r->rdma);
2483     }
2484     g_free(r);
2485     return 0;
2486 }
2487
2488 /*
2489  * Parameters:
2490  *    @offset == 0 :
2491  *        This means that 'block_offset' is a full virtual address that does not
2492  *        belong to a RAMBlock of the virtual machine and instead
2493  *        represents a private malloc'd memory area that the caller wishes to
2494  *        transfer.
2495  *
2496  *    @offset != 0 :
2497  *        Offset is an offset to be added to block_offset and used
2498  *        to also lookup the corresponding RAMBlock.
2499  *
2500  *    @size > 0 :
2501  *        Initiate an transfer this size.
2502  *
2503  *    @size == 0 :
2504  *        A 'hint' or 'advice' that means that we wish to speculatively
2505  *        and asynchronously unregister this memory. In this case, there is no
2506  *        guarantee that the unregister will actually happen, for example,
2507  *        if the memory is being actively transmitted. Additionally, the memory
2508  *        may be re-registered at any future time if a write within the same
2509  *        chunk was requested again, even if you attempted to unregister it
2510  *        here.
2511  *
2512  *    @size < 0 : TODO, not yet supported
2513  *        Unregister the memory NOW. This means that the caller does not
2514  *        expect there to be any future RDMA transfers and we just want to clean
2515  *        things up. This is used in case the upper layer owns the memory and
2516  *        cannot wait for qemu_fclose() to occur.
2517  *
2518  *    @bytes_sent : User-specificed pointer to indicate how many bytes were
2519  *                  sent. Usually, this will not be more than a few bytes of
2520  *                  the protocol because most transfers are sent asynchronously.
2521  */
2522 static size_t qemu_rdma_save_page(QEMUFile *f, void *opaque,
2523                                   ram_addr_t block_offset, ram_addr_t offset,
2524                                   size_t size, int *bytes_sent)
2525 {
2526     QEMUFileRDMA *rfile = opaque;
2527     RDMAContext *rdma = rfile->rdma;
2528     int ret;
2529
2530     CHECK_ERROR_STATE();
2531
2532     qemu_fflush(f);
2533
2534     if (size > 0) {
2535         /*
2536          * Add this page to the current 'chunk'. If the chunk
2537          * is full, or the page doen't belong to the current chunk,
2538          * an actual RDMA write will occur and a new chunk will be formed.
2539          */
2540         ret = qemu_rdma_write(f, rdma, block_offset, offset, size);
2541         if (ret < 0) {
2542             fprintf(stderr, "rdma migration: write error! %d\n", ret);
2543             goto err;
2544         }
2545
2546         /*
2547          * We always return 1 bytes because the RDMA
2548          * protocol is completely asynchronous. We do not yet know
2549          * whether an  identified chunk is zero or not because we're
2550          * waiting for other pages to potentially be merged with
2551          * the current chunk. So, we have to call qemu_update_position()
2552          * later on when the actual write occurs.
2553          */
2554         if (bytes_sent) {
2555             *bytes_sent = 1;
2556         }
2557     } else {
2558         uint64_t index, chunk;
2559
2560         /* TODO: Change QEMUFileOps prototype to be signed: size_t => long
2561         if (size < 0) {
2562             ret = qemu_rdma_drain_cq(f, rdma);
2563             if (ret < 0) {
2564                 fprintf(stderr, "rdma: failed to synchronously drain"
2565                                 " completion queue before unregistration.\n");
2566                 goto err;
2567             }
2568         }
2569         */
2570
2571         ret = qemu_rdma_search_ram_block(rdma, block_offset,
2572                                          offset, size, &index, &chunk);
2573
2574         if (ret) {
2575             fprintf(stderr, "ram block search failed\n");
2576             goto err;
2577         }
2578
2579         qemu_rdma_signal_unregister(rdma, index, chunk, 0);
2580
2581         /*
2582          * TODO: Synchronous, guaranteed unregistration (should not occur during
2583          * fast-path). Otherwise, unregisters will process on the next call to
2584          * qemu_rdma_drain_cq()
2585         if (size < 0) {
2586             qemu_rdma_unregister_waiting(rdma);
2587         }
2588         */
2589     }
2590
2591     /*
2592      * Drain the Completion Queue if possible, but do not block,
2593      * just poll.
2594      *
2595      * If nothing to poll, the end of the iteration will do this
2596      * again to make sure we don't overflow the request queue.
2597      */
2598     while (1) {
2599         uint64_t wr_id, wr_id_in;
2600         int ret = qemu_rdma_poll(rdma, &wr_id_in);
2601         if (ret < 0) {
2602             fprintf(stderr, "rdma migration: polling error! %d\n", ret);
2603             goto err;
2604         }
2605
2606         wr_id = wr_id_in & RDMA_WRID_TYPE_MASK;
2607
2608         if (wr_id == RDMA_WRID_NONE) {
2609             break;
2610         }
2611     }
2612
2613     return RAM_SAVE_CONTROL_DELAYED;
2614 err:
2615     rdma->error_state = ret;
2616     return ret;
2617 }
2618
2619 static int qemu_rdma_accept(RDMAContext *rdma)
2620 {
2621     RDMACapabilities cap;
2622     struct rdma_conn_param conn_param = {
2623                                             .responder_resources = 2,
2624                                             .private_data = &cap,
2625                                             .private_data_len = sizeof(cap),
2626                                          };
2627     struct rdma_cm_event *cm_event;
2628     struct ibv_context *verbs;
2629     int ret = -EINVAL;
2630     int idx;
2631
2632     ret = rdma_get_cm_event(rdma->channel, &cm_event);
2633     if (ret) {
2634         goto err_rdma_dest_wait;
2635     }
2636
2637     if (cm_event->event != RDMA_CM_EVENT_CONNECT_REQUEST) {
2638         rdma_ack_cm_event(cm_event);
2639         goto err_rdma_dest_wait;
2640     }
2641
2642     memcpy(&cap, cm_event->param.conn.private_data, sizeof(cap));
2643
2644     network_to_caps(&cap);
2645
2646     if (cap.version < 1 || cap.version > RDMA_CONTROL_VERSION_CURRENT) {
2647             fprintf(stderr, "Unknown source RDMA version: %d, bailing...\n",
2648                             cap.version);
2649             rdma_ack_cm_event(cm_event);
2650             goto err_rdma_dest_wait;
2651     }
2652
2653     /*
2654      * Respond with only the capabilities this version of QEMU knows about.
2655      */
2656     cap.flags &= known_capabilities;
2657
2658     /*
2659      * Enable the ones that we do know about.
2660      * Add other checks here as new ones are introduced.
2661      */
2662     if (cap.flags & RDMA_CAPABILITY_PIN_ALL) {
2663         rdma->pin_all = true;
2664     }
2665
2666     rdma->cm_id = cm_event->id;
2667     verbs = cm_event->id->verbs;
2668
2669     rdma_ack_cm_event(cm_event);
2670
2671     DPRINTF("Memory pin all: %s\n", rdma->pin_all ? "enabled" : "disabled");
2672
2673     caps_to_network(&cap);
2674
2675     DPRINTF("verbs context after listen: %p\n", verbs);
2676
2677     if (!rdma->verbs) {
2678         rdma->verbs = verbs;
2679     } else if (rdma->verbs != verbs) {
2680             fprintf(stderr, "ibv context not matching %p, %p!\n",
2681                     rdma->verbs, verbs);
2682             goto err_rdma_dest_wait;
2683     }
2684
2685     qemu_rdma_dump_id("dest_init", verbs);
2686
2687     ret = qemu_rdma_alloc_pd_cq(rdma);
2688     if (ret) {
2689         fprintf(stderr, "rdma migration: error allocating pd and cq!\n");
2690         goto err_rdma_dest_wait;
2691     }
2692
2693     ret = qemu_rdma_alloc_qp(rdma);
2694     if (ret) {
2695         fprintf(stderr, "rdma migration: error allocating qp!\n");
2696         goto err_rdma_dest_wait;
2697     }
2698
2699     ret = qemu_rdma_init_ram_blocks(rdma);
2700     if (ret) {
2701         fprintf(stderr, "rdma migration: error initializing ram blocks!\n");
2702         goto err_rdma_dest_wait;
2703     }
2704
2705     for (idx = 0; idx <= RDMA_WRID_MAX; idx++) {
2706         ret = qemu_rdma_reg_control(rdma, idx);
2707         if (ret) {
2708             fprintf(stderr, "rdma: error registering %d control!\n", idx);
2709             goto err_rdma_dest_wait;
2710         }
2711     }
2712
2713     qemu_set_fd_handler2(rdma->channel->fd, NULL, NULL, NULL, NULL);
2714
2715     ret = rdma_accept(rdma->cm_id, &conn_param);
2716     if (ret) {
2717         fprintf(stderr, "rdma_accept returns %d!\n", ret);
2718         goto err_rdma_dest_wait;
2719     }
2720
2721     ret = rdma_get_cm_event(rdma->channel, &cm_event);
2722     if (ret) {
2723         fprintf(stderr, "rdma_accept get_cm_event failed %d!\n", ret);
2724         goto err_rdma_dest_wait;
2725     }
2726
2727     if (cm_event->event != RDMA_CM_EVENT_ESTABLISHED) {
2728         fprintf(stderr, "rdma_accept not event established!\n");
2729         rdma_ack_cm_event(cm_event);
2730         goto err_rdma_dest_wait;
2731     }
2732
2733     rdma_ack_cm_event(cm_event);
2734
2735     ret = qemu_rdma_post_recv_control(rdma, 0);
2736     if (ret) {
2737         fprintf(stderr, "rdma migration: error posting second control recv!\n");
2738         goto err_rdma_dest_wait;
2739     }
2740
2741     qemu_rdma_dump_gid("dest_connect", rdma->cm_id);
2742
2743     return 0;
2744
2745 err_rdma_dest_wait:
2746     rdma->error_state = ret;
2747     qemu_rdma_cleanup(rdma);
2748     return ret;
2749 }
2750
2751 /*
2752  * During each iteration of the migration, we listen for instructions
2753  * by the source VM to perform dynamic page registrations before they
2754  * can perform RDMA operations.
2755  *
2756  * We respond with the 'rkey'.
2757  *
2758  * Keep doing this until the source tells us to stop.
2759  */
2760 static int qemu_rdma_registration_handle(QEMUFile *f, void *opaque,
2761                                          uint64_t flags)
2762 {
2763     RDMAControlHeader reg_resp = { .len = sizeof(RDMARegisterResult),
2764                                .type = RDMA_CONTROL_REGISTER_RESULT,
2765                                .repeat = 0,
2766                              };
2767     RDMAControlHeader unreg_resp = { .len = 0,
2768                                .type = RDMA_CONTROL_UNREGISTER_FINISHED,
2769                                .repeat = 0,
2770                              };
2771     RDMAControlHeader blocks = { .type = RDMA_CONTROL_RAM_BLOCKS_RESULT,
2772                                  .repeat = 1 };
2773     QEMUFileRDMA *rfile = opaque;
2774     RDMAContext *rdma = rfile->rdma;
2775     RDMALocalBlocks *local = &rdma->local_ram_blocks;
2776     RDMAControlHeader head;
2777     RDMARegister *reg, *registers;
2778     RDMACompress *comp;
2779     RDMARegisterResult *reg_result;
2780     static RDMARegisterResult results[RDMA_CONTROL_MAX_COMMANDS_PER_MESSAGE];
2781     RDMALocalBlock *block;
2782     void *host_addr;
2783     int ret = 0;
2784     int idx = 0;
2785     int count = 0;
2786     int i = 0;
2787
2788     CHECK_ERROR_STATE();
2789
2790     do {
2791         DDDPRINTF("Waiting for next request %" PRIu64 "...\n", flags);
2792
2793         ret = qemu_rdma_exchange_recv(rdma, &head, RDMA_CONTROL_NONE);
2794
2795         if (ret < 0) {
2796             break;
2797         }
2798
2799         if (head.repeat > RDMA_CONTROL_MAX_COMMANDS_PER_MESSAGE) {
2800             fprintf(stderr, "rdma: Too many requests in this message (%d)."
2801                             "Bailing.\n", head.repeat);
2802             ret = -EIO;
2803             break;
2804         }
2805
2806         switch (head.type) {
2807         case RDMA_CONTROL_COMPRESS:
2808             comp = (RDMACompress *) rdma->wr_data[idx].control_curr;
2809             network_to_compress(comp);
2810
2811             DDPRINTF("Zapping zero chunk: %" PRId64
2812                     " bytes, index %d, offset %" PRId64 "\n",
2813                     comp->length, comp->block_idx, comp->offset);
2814             block = &(rdma->local_ram_blocks.block[comp->block_idx]);
2815
2816             host_addr = block->local_host_addr +
2817                             (comp->offset - block->offset);
2818
2819             ram_handle_compressed(host_addr, comp->value, comp->length);
2820             break;
2821
2822         case RDMA_CONTROL_REGISTER_FINISHED:
2823             DDDPRINTF("Current registrations complete.\n");
2824             goto out;
2825
2826         case RDMA_CONTROL_RAM_BLOCKS_REQUEST:
2827             DPRINTF("Initial setup info requested.\n");
2828
2829             if (rdma->pin_all) {
2830                 ret = qemu_rdma_reg_whole_ram_blocks(rdma);
2831                 if (ret) {
2832                     fprintf(stderr, "rdma migration: error dest "
2833                                     "registering ram blocks!\n");
2834                     goto out;
2835                 }
2836             }
2837
2838             /*
2839              * Dest uses this to prepare to transmit the RAMBlock descriptions
2840              * to the source VM after connection setup.
2841              * Both sides use the "remote" structure to communicate and update
2842              * their "local" descriptions with what was sent.
2843              */
2844             for (i = 0; i < local->nb_blocks; i++) {
2845                 rdma->block[i].remote_host_addr =
2846                     (uint64_t)(local->block[i].local_host_addr);
2847
2848                 if (rdma->pin_all) {
2849                     rdma->block[i].remote_rkey = local->block[i].mr->rkey;
2850                 }
2851
2852                 rdma->block[i].offset = local->block[i].offset;
2853                 rdma->block[i].length = local->block[i].length;
2854
2855                 remote_block_to_network(&rdma->block[i]);
2856             }
2857
2858             blocks.len = rdma->local_ram_blocks.nb_blocks
2859                                                 * sizeof(RDMARemoteBlock);
2860
2861
2862             ret = qemu_rdma_post_send_control(rdma,
2863                                         (uint8_t *) rdma->block, &blocks);
2864
2865             if (ret < 0) {
2866                 fprintf(stderr, "rdma migration: error sending remote info!\n");
2867                 goto out;
2868             }
2869
2870             break;
2871         case RDMA_CONTROL_REGISTER_REQUEST:
2872             DDPRINTF("There are %d registration requests\n", head.repeat);
2873
2874             reg_resp.repeat = head.repeat;
2875             registers = (RDMARegister *) rdma->wr_data[idx].control_curr;
2876
2877             for (count = 0; count < head.repeat; count++) {
2878                 uint64_t chunk;
2879                 uint8_t *chunk_start, *chunk_end;
2880
2881                 reg = &registers[count];
2882                 network_to_register(reg);
2883
2884                 reg_result = &results[count];
2885
2886                 DDPRINTF("Registration request (%d): index %d, current_addr %"
2887                          PRIu64 " chunks: %" PRIu64 "\n", count,
2888                          reg->current_index, reg->key.current_addr, reg->chunks);
2889
2890                 block = &(rdma->local_ram_blocks.block[reg->current_index]);
2891                 if (block->is_ram_block) {
2892                     host_addr = (block->local_host_addr +
2893                                 (reg->key.current_addr - block->offset));
2894                     chunk = ram_chunk_index(block->local_host_addr,
2895                                             (uint8_t *) host_addr);
2896                 } else {
2897                     chunk = reg->key.chunk;
2898                     host_addr = block->local_host_addr +
2899                         (reg->key.chunk * (1UL << RDMA_REG_CHUNK_SHIFT));
2900                 }
2901                 chunk_start = ram_chunk_start(block, chunk);
2902                 chunk_end = ram_chunk_end(block, chunk + reg->chunks);
2903                 if (qemu_rdma_register_and_get_keys(rdma, block,
2904                             (uint8_t *)host_addr, NULL, &reg_result->rkey,
2905                             chunk, chunk_start, chunk_end)) {
2906                     fprintf(stderr, "cannot get rkey!\n");
2907                     ret = -EINVAL;
2908                     goto out;
2909                 }
2910
2911                 reg_result->host_addr = (uint64_t) block->local_host_addr;
2912
2913                 DDPRINTF("Registered rkey for this request: %x\n",
2914                                 reg_result->rkey);
2915
2916                 result_to_network(reg_result);
2917             }
2918
2919             ret = qemu_rdma_post_send_control(rdma,
2920                             (uint8_t *) results, &reg_resp);
2921
2922             if (ret < 0) {
2923                 fprintf(stderr, "Failed to send control buffer!\n");
2924                 goto out;
2925             }
2926             break;
2927         case RDMA_CONTROL_UNREGISTER_REQUEST:
2928             DDPRINTF("There are %d unregistration requests\n", head.repeat);
2929             unreg_resp.repeat = head.repeat;
2930             registers = (RDMARegister *) rdma->wr_data[idx].control_curr;
2931
2932             for (count = 0; count < head.repeat; count++) {
2933                 reg = &registers[count];
2934                 network_to_register(reg);
2935
2936                 DDPRINTF("Unregistration request (%d): "
2937                          " index %d, chunk %" PRIu64 "\n",
2938                          count, reg->current_index, reg->key.chunk);
2939
2940                 block = &(rdma->local_ram_blocks.block[reg->current_index]);
2941
2942                 ret = ibv_dereg_mr(block->pmr[reg->key.chunk]);
2943                 block->pmr[reg->key.chunk] = NULL;
2944
2945                 if (ret != 0) {
2946                     perror("rdma unregistration chunk failed");
2947                     ret = -ret;
2948                     goto out;
2949                 }
2950
2951                 rdma->total_registrations--;
2952
2953                 DDPRINTF("Unregistered chunk %" PRIu64 " successfully.\n",
2954                             reg->key.chunk);
2955             }
2956
2957             ret = qemu_rdma_post_send_control(rdma, NULL, &unreg_resp);
2958
2959             if (ret < 0) {
2960                 fprintf(stderr, "Failed to send control buffer!\n");
2961                 goto out;
2962             }
2963             break;
2964         case RDMA_CONTROL_REGISTER_RESULT:
2965             fprintf(stderr, "Invalid RESULT message at dest.\n");
2966             ret = -EIO;
2967             goto out;
2968         default:
2969             fprintf(stderr, "Unknown control message %s\n",
2970                                 control_desc[head.type]);
2971             ret = -EIO;
2972             goto out;
2973         }
2974     } while (1);
2975 out:
2976     if (ret < 0) {
2977         rdma->error_state = ret;
2978     }
2979     return ret;
2980 }
2981
2982 static int qemu_rdma_registration_start(QEMUFile *f, void *opaque,
2983                                         uint64_t flags)
2984 {
2985     QEMUFileRDMA *rfile = opaque;
2986     RDMAContext *rdma = rfile->rdma;
2987
2988     CHECK_ERROR_STATE();
2989
2990     DDDPRINTF("start section: %" PRIu64 "\n", flags);
2991     qemu_put_be64(f, RAM_SAVE_FLAG_HOOK);
2992     qemu_fflush(f);
2993
2994     return 0;
2995 }
2996
2997 /*
2998  * Inform dest that dynamic registrations are done for now.
2999  * First, flush writes, if any.
3000  */
3001 static int qemu_rdma_registration_stop(QEMUFile *f, void *opaque,
3002                                        uint64_t flags)
3003 {
3004     Error *local_err = NULL, **errp = &local_err;
3005     QEMUFileRDMA *rfile = opaque;
3006     RDMAContext *rdma = rfile->rdma;
3007     RDMAControlHeader head = { .len = 0, .repeat = 1 };
3008     int ret = 0;
3009
3010     CHECK_ERROR_STATE();
3011
3012     qemu_fflush(f);
3013     ret = qemu_rdma_drain_cq(f, rdma);
3014
3015     if (ret < 0) {
3016         goto err;
3017     }
3018
3019     if (flags == RAM_CONTROL_SETUP) {
3020         RDMAControlHeader resp = {.type = RDMA_CONTROL_RAM_BLOCKS_RESULT };
3021         RDMALocalBlocks *local = &rdma->local_ram_blocks;
3022         int reg_result_idx, i, j, nb_remote_blocks;
3023
3024         head.type = RDMA_CONTROL_RAM_BLOCKS_REQUEST;
3025         DPRINTF("Sending registration setup for ram blocks...\n");
3026
3027         /*
3028          * Make sure that we parallelize the pinning on both sides.
3029          * For very large guests, doing this serially takes a really
3030          * long time, so we have to 'interleave' the pinning locally
3031          * with the control messages by performing the pinning on this
3032          * side before we receive the control response from the other
3033          * side that the pinning has completed.
3034          */
3035         ret = qemu_rdma_exchange_send(rdma, &head, NULL, &resp,
3036                     &reg_result_idx, rdma->pin_all ?
3037                     qemu_rdma_reg_whole_ram_blocks : NULL);
3038         if (ret < 0) {
3039             ERROR(errp, "receiving remote info!\n");
3040             return ret;
3041         }
3042
3043         qemu_rdma_move_header(rdma, reg_result_idx, &resp);
3044         memcpy(rdma->block,
3045             rdma->wr_data[reg_result_idx].control_curr, resp.len);
3046
3047         nb_remote_blocks = resp.len / sizeof(RDMARemoteBlock);
3048
3049         /*
3050          * The protocol uses two different sets of rkeys (mutually exclusive):
3051          * 1. One key to represent the virtual address of the entire ram block.
3052          *    (dynamic chunk registration disabled - pin everything with one rkey.)
3053          * 2. One to represent individual chunks within a ram block.
3054          *    (dynamic chunk registration enabled - pin individual chunks.)
3055          *
3056          * Once the capability is successfully negotiated, the destination transmits
3057          * the keys to use (or sends them later) including the virtual addresses
3058          * and then propagates the remote ram block descriptions to his local copy.
3059          */
3060
3061         if (local->nb_blocks != nb_remote_blocks) {
3062             ERROR(errp, "ram blocks mismatch #1! "
3063                         "Your QEMU command line parameters are probably "
3064                         "not identical on both the source and destination.\n");
3065             return -EINVAL;
3066         }
3067
3068         for (i = 0; i < nb_remote_blocks; i++) {
3069             network_to_remote_block(&rdma->block[i]);
3070
3071             /* search local ram blocks */
3072             for (j = 0; j < local->nb_blocks; j++) {
3073                 if (rdma->block[i].offset != local->block[j].offset) {
3074                     continue;
3075                 }
3076
3077                 if (rdma->block[i].length != local->block[j].length) {
3078                     ERROR(errp, "ram blocks mismatch #2! "
3079                         "Your QEMU command line parameters are probably "
3080                         "not identical on both the source and destination.\n");
3081                     return -EINVAL;
3082                 }
3083                 local->block[j].remote_host_addr =
3084                         rdma->block[i].remote_host_addr;
3085                 local->block[j].remote_rkey = rdma->block[i].remote_rkey;
3086                 break;
3087             }
3088
3089             if (j >= local->nb_blocks) {
3090                 ERROR(errp, "ram blocks mismatch #3! "
3091                         "Your QEMU command line parameters are probably "
3092                         "not identical on both the source and destination.\n");
3093                 return -EINVAL;
3094             }
3095         }
3096     }
3097
3098     DDDPRINTF("Sending registration finish %" PRIu64 "...\n", flags);
3099
3100     head.type = RDMA_CONTROL_REGISTER_FINISHED;
3101     ret = qemu_rdma_exchange_send(rdma, &head, NULL, NULL, NULL, NULL);
3102
3103     if (ret < 0) {
3104         goto err;
3105     }
3106
3107     return 0;
3108 err:
3109     rdma->error_state = ret;
3110     return ret;
3111 }
3112
3113 static int qemu_rdma_get_fd(void *opaque)
3114 {
3115     QEMUFileRDMA *rfile = opaque;
3116     RDMAContext *rdma = rfile->rdma;
3117
3118     return rdma->comp_channel->fd;
3119 }
3120
3121 const QEMUFileOps rdma_read_ops = {
3122     .get_buffer    = qemu_rdma_get_buffer,
3123     .get_fd        = qemu_rdma_get_fd,
3124     .close         = qemu_rdma_close,
3125     .hook_ram_load = qemu_rdma_registration_handle,
3126 };
3127
3128 const QEMUFileOps rdma_write_ops = {
3129     .put_buffer         = qemu_rdma_put_buffer,
3130     .close              = qemu_rdma_close,
3131     .before_ram_iterate = qemu_rdma_registration_start,
3132     .after_ram_iterate  = qemu_rdma_registration_stop,
3133     .save_page          = qemu_rdma_save_page,
3134 };
3135
3136 static void *qemu_fopen_rdma(RDMAContext *rdma, const char *mode)
3137 {
3138     QEMUFileRDMA *r = g_malloc0(sizeof(QEMUFileRDMA));
3139
3140     if (qemu_file_mode_is_not_valid(mode)) {
3141         return NULL;
3142     }
3143
3144     r->rdma = rdma;
3145
3146     if (mode[0] == 'w') {
3147         r->file = qemu_fopen_ops(r, &rdma_write_ops);
3148     } else {
3149         r->file = qemu_fopen_ops(r, &rdma_read_ops);
3150     }
3151
3152     return r->file;
3153 }
3154
3155 static void rdma_accept_incoming_migration(void *opaque)
3156 {
3157     RDMAContext *rdma = opaque;
3158     int ret;
3159     QEMUFile *f;
3160     Error *local_err = NULL, **errp = &local_err;
3161
3162     DPRINTF("Accepting rdma connection...\n");
3163     ret = qemu_rdma_accept(rdma);
3164
3165     if (ret) {
3166         ERROR(errp, "RDMA Migration initialization failed!\n");
3167         return;
3168     }
3169
3170     DPRINTF("Accepted migration\n");
3171
3172     f = qemu_fopen_rdma(rdma, "rb");
3173     if (f == NULL) {
3174         ERROR(errp, "could not qemu_fopen_rdma!\n");
3175         qemu_rdma_cleanup(rdma);
3176         return;
3177     }
3178
3179     rdma->migration_started_on_destination = 1;
3180     process_incoming_migration(f);
3181 }
3182
3183 void rdma_start_incoming_migration(const char *host_port, Error **errp)
3184 {
3185     int ret;
3186     RDMAContext *rdma;
3187     Error *local_err = NULL;
3188
3189     DPRINTF("Starting RDMA-based incoming migration\n");
3190     rdma = qemu_rdma_data_init(host_port, &local_err);
3191
3192     if (rdma == NULL) {
3193         goto err;
3194     }
3195
3196     ret = qemu_rdma_dest_init(rdma, &local_err);
3197
3198     if (ret) {
3199         goto err;
3200     }
3201
3202     DPRINTF("qemu_rdma_dest_init success\n");
3203
3204     ret = rdma_listen(rdma->listen_id, 5);
3205
3206     if (ret) {
3207         ERROR(errp, "listening on socket!\n");
3208         goto err;
3209     }
3210
3211     DPRINTF("rdma_listen success\n");
3212
3213     qemu_set_fd_handler2(rdma->channel->fd, NULL,
3214                          rdma_accept_incoming_migration, NULL,
3215                             (void *)(intptr_t) rdma);
3216     return;
3217 err:
3218     error_propagate(errp, local_err);
3219     g_free(rdma);
3220 }
3221
3222 void rdma_start_outgoing_migration(void *opaque,
3223                             const char *host_port, Error **errp)
3224 {
3225     MigrationState *s = opaque;
3226     Error *local_err = NULL, **temp = &local_err;
3227     RDMAContext *rdma = qemu_rdma_data_init(host_port, &local_err);
3228     int ret = 0;
3229
3230     if (rdma == NULL) {
3231         ERROR(temp, "Failed to initialize RDMA data structures! %d\n", ret);
3232         goto err;
3233     }
3234
3235     ret = qemu_rdma_source_init(rdma, &local_err,
3236         s->enabled_capabilities[MIGRATION_CAPABILITY_X_RDMA_PIN_ALL]);
3237
3238     if (ret) {
3239         goto err;
3240     }
3241
3242     DPRINTF("qemu_rdma_source_init success\n");
3243     ret = qemu_rdma_connect(rdma, &local_err);
3244
3245     if (ret) {
3246         goto err;
3247     }
3248
3249     DPRINTF("qemu_rdma_source_connect success\n");
3250
3251     s->file = qemu_fopen_rdma(rdma, "wb");
3252     migrate_fd_connect(s);
3253     return;
3254 err:
3255     error_propagate(errp, local_err);
3256     g_free(rdma);
3257     migrate_fd_error(s);
3258 }