drivers/infiniband/hw/hfi1/pio.c

   1 /*
   2  * Copyright(c) 2015-2017 Intel Corporation.
   3  *
   4  * This file is provided under a dual BSD/GPLv2 license.  When using or
   5  * redistributing this file, you may do so under either license.
   6  *
   7  * GPL LICENSE SUMMARY
   8  *
   9  * This program is free software; you can redistribute it and/or modify
  10  * it under the terms of version 2 of the GNU General Public License as
  11  * published by the Free Software Foundation.
  12  *
  13  * This program is distributed in the hope that it will be useful, but
  14  * WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  16  * General Public License for more details.
  17  *
  18  * BSD LICENSE
  19  *
  20  * Redistribution and use in source and binary forms, with or without
  21  * modification, are permitted provided that the following conditions
  22  * are met:
  23  *
  24  *  - Redistributions of source code must retain the above copyright
  25  *    notice, this list of conditions and the following disclaimer.
  26  *  - Redistributions in binary form must reproduce the above copyright
  27  *    notice, this list of conditions and the following disclaimer in
  28  *    the documentation and/or other materials provided with the
  29  *    distribution.
  30  *  - Neither the name of Intel Corporation nor the names of its
  31  *    contributors may be used to endorse or promote products derived
  32  *    from this software without specific prior written permission.
  33  *
  34  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  35  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  36  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  37  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  38  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  39  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  40  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  41  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  42  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  43  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  44  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  45  *
  46  */
  47
  48 #include <linux/delay.h>
  49 #include "hfi.h"
  50 #include "qp.h"
  51 #include "trace.h"
  52
  53 #define SC(name) SEND_CTXT_##name
  54 /*
  55  * Send Context functions
  56  */
  57 static void sc_wait_for_packet_egress(struct send_context *sc, int pause);
  58
  59 /*
  60  * Set the CM reset bit and wait for it to clear.  Use the provided
  61  * sendctrl register.  This routine has no locking.
  62  */
  63 void __cm_reset(struct hfi1_devdata *dd, u64 sendctrl)
  64 {
  65         write_csr(dd, SEND_CTRL, sendctrl | SEND_CTRL_CM_RESET_SMASK);
  66         while (1) {
  67                 udelay(1);
  68                 sendctrl = read_csr(dd, SEND_CTRL);
  69                 if ((sendctrl & SEND_CTRL_CM_RESET_SMASK) == 0)
  70                         break;
  71         }
  72 }
  73
  74 /* defined in header release 48 and higher */
  75 #ifndef SEND_CTRL_UNSUPPORTED_VL_SHIFT
  76 #define SEND_CTRL_UNSUPPORTED_VL_SHIFT 3
  77 #define SEND_CTRL_UNSUPPORTED_VL_MASK 0xffull
  78 #define SEND_CTRL_UNSUPPORTED_VL_SMASK (SEND_CTRL_UNSUPPORTED_VL_MASK \
  79                 << SEND_CTRL_UNSUPPORTED_VL_SHIFT)
  80 #endif
  81
  82 /* global control of PIO send */
  83 void pio_send_control(struct hfi1_devdata *dd, int op)
  84 {
  85         u64 reg, mask;
  86         unsigned long flags;
  87         int write = 1;  /* write sendctrl back */
  88         int flush = 0;  /* re-read sendctrl to make sure it is flushed */
  89
  90         spin_lock_irqsave(&dd->sendctrl_lock, flags);
  91
  92         reg = read_csr(dd, SEND_CTRL);
  93         switch (op) {
  94         case PSC_GLOBAL_ENABLE:
  95                 reg |= SEND_CTRL_SEND_ENABLE_SMASK;
  96         /* Fall through */
  97         case PSC_DATA_VL_ENABLE:
  98                 /* Disallow sending on VLs not enabled */
  99                 mask = (((~0ull) << num_vls) & SEND_CTRL_UNSUPPORTED_VL_MASK) <<
 100                                 SEND_CTRL_UNSUPPORTED_VL_SHIFT;
 101                 reg = (reg & ~SEND_CTRL_UNSUPPORTED_VL_SMASK) | mask;
 102                 break;
 103         case PSC_GLOBAL_DISABLE:
 104                 reg &= ~SEND_CTRL_SEND_ENABLE_SMASK;
 105                 break;
 106         case PSC_GLOBAL_VLARB_ENABLE:
 107                 reg |= SEND_CTRL_VL_ARBITER_ENABLE_SMASK;
 108                 break;
 109         case PSC_GLOBAL_VLARB_DISABLE:
 110                 reg &= ~SEND_CTRL_VL_ARBITER_ENABLE_SMASK;
 111                 break;
 112         case PSC_CM_RESET:
 113                 __cm_reset(dd, reg);
 114                 write = 0; /* CSR already written (and flushed) */
 115                 break;
 116         case PSC_DATA_VL_DISABLE:
 117                 reg |= SEND_CTRL_UNSUPPORTED_VL_SMASK;
 118                 flush = 1;
 119                 break;
 120         default:
 121                 dd_dev_err(dd, "%s: invalid control %d\n", __func__, op);
 122                 break;
 123         }
 124
 125         if (write) {
 126                 write_csr(dd, SEND_CTRL, reg);
 127                 if (flush)
 128                         (void)read_csr(dd, SEND_CTRL); /* flush write */
 129         }
 130
 131         spin_unlock_irqrestore(&dd->sendctrl_lock, flags);
 132 }
 133
 134 /* number of send context memory pools */
 135 #define NUM_SC_POOLS 2
 136
 137 /* Send Context Size (SCS) wildcards */
 138 #define SCS_POOL_0 -1
 139 #define SCS_POOL_1 -2
 140
 141 /* Send Context Count (SCC) wildcards */
 142 #define SCC_PER_VL -1
 143 #define SCC_PER_CPU  -2
 144 #define SCC_PER_KRCVQ  -3
 145
 146 /* Send Context Size (SCS) constants */
 147 #define SCS_ACK_CREDITS  32
 148 #define SCS_VL15_CREDITS 102    /* 3 pkts of 2048B data + 128B header */
 149
 150 #define PIO_THRESHOLD_CEILING 4096
 151
 152 #define PIO_WAIT_BATCH_SIZE 5
 153
 154 /* default send context sizes */
 155 static struct sc_config_sizes sc_config_sizes[SC_MAX] = {
 156         [SC_KERNEL] = { .size  = SCS_POOL_0,    /* even divide, pool 0 */
 157                         .count = SCC_PER_VL },  /* one per NUMA */
 158         [SC_ACK]    = { .size  = SCS_ACK_CREDITS,
 159                         .count = SCC_PER_KRCVQ },
 160         [SC_USER]   = { .size  = SCS_POOL_0,    /* even divide, pool 0 */
 161                         .count = SCC_PER_CPU }, /* one per CPU */
 162         [SC_VL15]   = { .size  = SCS_VL15_CREDITS,
 163                         .count = 1 },
 164
 165 };
 166
 167 /* send context memory pool configuration */
 168 struct mem_pool_config {
 169         int centipercent;       /* % of memory, in 100ths of 1% */
 170         int absolute_blocks;    /* absolute block count */
 171 };
 172
 173 /* default memory pool configuration: 100% in pool 0 */
 174 static struct mem_pool_config sc_mem_pool_config[NUM_SC_POOLS] = {
 175         /* centi%, abs blocks */
 176         {  10000,     -1 },             /* pool 0 */
 177         {      0,     -1 },             /* pool 1 */
 178 };
 179
 180 /* memory pool information, used when calculating final sizes */
 181 struct mem_pool_info {
 182         int centipercent;       /*
 183                                  * 100th of 1% of memory to use, -1 if blocks
 184                                  * already set
 185                                  */
 186         int count;              /* count of contexts in the pool */
 187         int blocks;             /* block size of the pool */
 188         int size;               /* context size, in blocks */
 189 };
 190
 191 /*
 192  * Convert a pool wildcard to a valid pool index.  The wildcards
 193  * start at -1 and increase negatively.  Map them as:
 194  *      -1 => 0
 195  *      -2 => 1
 196  *      etc.
 197  *
 198  * Return -1 on non-wildcard input, otherwise convert to a pool number.
 199  */
 200 static int wildcard_to_pool(int wc)
 201 {
 202         if (wc >= 0)
 203                 return -1;      /* non-wildcard */
 204         return -wc - 1;
 205 }
 206
 207 static const char *sc_type_names[SC_MAX] = {
 208         "kernel",
 209         "ack",
 210         "user",
 211         "vl15"
 212 };
 213
 214 static const char *sc_type_name(int index)
 215 {
 216         if (index < 0 || index >= SC_MAX)
 217                 return "unknown";
 218         return sc_type_names[index];
 219 }
 220
 221 /*
 222  * Read the send context memory pool configuration and send context
 223  * size configuration.  Replace any wildcards and come up with final
 224  * counts and sizes for the send context types.
 225  */
 226 int init_sc_pools_and_sizes(struct hfi1_devdata *dd)
 227 {
 228         struct mem_pool_info mem_pool_info[NUM_SC_POOLS] = { { 0 } };
 229         int total_blocks = (dd->chip_pio_mem_size / PIO_BLOCK_SIZE) - 1;
 230         int total_contexts = 0;
 231         int fixed_blocks;
 232         int pool_blocks;
 233         int used_blocks;
 234         int cp_total;           /* centipercent total */
 235         int ab_total;           /* absolute block total */
 236         int extra;
 237         int i;
 238
 239         /*
 240          * When SDMA is enabled, kernel context pio packet size is capped by
 241          * "piothreshold". Reduce pio buffer allocation for kernel context by
 242          * setting it to a fixed size. The allocation allows 3-deep buffering
 243          * of the largest pio packets plus up to 128 bytes header, sufficient
 244          * to maintain verbs performance.
 245          *
 246          * When SDMA is disabled, keep the default pooling allocation.
 247          */
 248         if (HFI1_CAP_IS_KSET(SDMA)) {
 249                 u16 max_pkt_size = (piothreshold < PIO_THRESHOLD_CEILING) ?
 250                                          piothreshold : PIO_THRESHOLD_CEILING;
 251                 sc_config_sizes[SC_KERNEL].size =
 252                         3 * (max_pkt_size + 128) / PIO_BLOCK_SIZE;
 253         }
 254
 255         /*
 256          * Step 0:
 257          *      - copy the centipercents/absolute sizes from the pool config
 258          *      - sanity check these values
 259          *      - add up centipercents, then later check for full value
 260          *      - add up absolute blocks, then later check for over-commit
 261          */
 262         cp_total = 0;
 263         ab_total = 0;
 264         for (i = 0; i < NUM_SC_POOLS; i++) {
 265                 int cp = sc_mem_pool_config[i].centipercent;
 266                 int ab = sc_mem_pool_config[i].absolute_blocks;
 267
 268                 /*
 269                  * A negative value is "unused" or "invalid".  Both *can*
 270                  * be valid, but centipercent wins, so check that first
 271                  */
 272                 if (cp >= 0) {                  /* centipercent valid */
 273                         cp_total += cp;
 274                 } else if (ab >= 0) {           /* absolute blocks valid */
 275                         ab_total += ab;
 276                 } else {                        /* neither valid */
 277                         dd_dev_err(
 278                                 dd,
 279                                 "Send context memory pool %d: both the block count and centipercent are invalid\n",
 280                                 i);
 281                         return -EINVAL;
 282                 }
 283
 284                 mem_pool_info[i].centipercent = cp;
 285                 mem_pool_info[i].blocks = ab;
 286         }
 287
 288         /* do not use both % and absolute blocks for different pools */
 289         if (cp_total != 0 && ab_total != 0) {
 290                 dd_dev_err(
 291                         dd,
 292                         "All send context memory pools must be described as either centipercent or blocks, no mixing between pools\n");
 293                 return -EINVAL;
 294         }
 295
 296         /* if any percentages are present, they must add up to 100% x 100 */
 297         if (cp_total != 0 && cp_total != 10000) {
 298                 dd_dev_err(
 299                         dd,
 300                         "Send context memory pool centipercent is %d, expecting 10000\n",
 301                         cp_total);
 302                 return -EINVAL;
 303         }
 304
 305         /* the absolute pool total cannot be more than the mem total */
 306         if (ab_total > total_blocks) {
 307                 dd_dev_err(
 308                         dd,
 309                         "Send context memory pool absolute block count %d is larger than the memory size %d\n",
 310                         ab_total, total_blocks);
 311                 return -EINVAL;
 312         }
 313
 314         /*
 315          * Step 2:
 316          *      - copy from the context size config
 317          *      - replace context type wildcard counts with real values
 318          *      - add up non-memory pool block sizes
 319          *      - add up memory pool user counts
 320          */
 321         fixed_blocks = 0;
 322         for (i = 0; i < SC_MAX; i++) {
 323                 int count = sc_config_sizes[i].count;
 324                 int size = sc_config_sizes[i].size;
 325                 int pool;
 326
 327                 /*
 328                  * Sanity check count: Either a positive value or
 329                  * one of the expected wildcards is valid.  The positive
 330                  * value is checked later when we compare against total
 331                  * memory available.
 332                  */
 333                 if (i == SC_ACK) {
 334                         count = dd->n_krcv_queues;
 335                 } else if (i == SC_KERNEL) {
 336                         count = INIT_SC_PER_VL * num_vls;
 337                 } else if (count == SCC_PER_CPU) {
 338                         count = dd->num_rcv_contexts - dd->n_krcv_queues;
 339                 } else if (count < 0) {
 340                         dd_dev_err(
 341                                 dd,
 342                                 "%s send context invalid count wildcard %d\n",
 343                                 sc_type_name(i), count);
 344                         return -EINVAL;
 345                 }
 346                 if (total_contexts + count > dd->chip_send_contexts)
 347                         count = dd->chip_send_contexts - total_contexts;
 348
 349                 total_contexts += count;
 350
 351                 /*
 352                  * Sanity check pool: The conversion will return a pool
 353                  * number or -1 if a fixed (non-negative) value.  The fixed
 354                  * value is checked later when we compare against
 355                  * total memory available.
 356                  */
 357                 pool = wildcard_to_pool(size);
 358                 if (pool == -1) {                       /* non-wildcard */
 359                         fixed_blocks += size * count;
 360                 } else if (pool < NUM_SC_POOLS) {       /* valid wildcard */
 361                         mem_pool_info[pool].count += count;
 362                 } else {                                /* invalid wildcard */
 363                         dd_dev_err(
 364                                 dd,
 365                                 "%s send context invalid pool wildcard %d\n",
 366                                 sc_type_name(i), size);
 367                         return -EINVAL;
 368                 }
 369
 370                 dd->sc_sizes[i].count = count;
 371                 dd->sc_sizes[i].size = size;
 372         }
 373         if (fixed_blocks > total_blocks) {
 374                 dd_dev_err(
 375                         dd,
 376                         "Send context fixed block count, %u, larger than total block count %u\n",
 377                         fixed_blocks, total_blocks);
 378                 return -EINVAL;
 379         }
 380
 381         /* step 3: calculate the blocks in the pools, and pool context sizes */
 382         pool_blocks = total_blocks - fixed_blocks;
 383         if (ab_total > pool_blocks) {
 384                 dd_dev_err(
 385                         dd,
 386                         "Send context fixed pool sizes, %u, larger than pool block count %u\n",
 387                         ab_total, pool_blocks);
 388                 return -EINVAL;
 389         }
 390         /* subtract off the fixed pool blocks */
 391         pool_blocks -= ab_total;
 392
 393         for (i = 0; i < NUM_SC_POOLS; i++) {
 394                 struct mem_pool_info *pi = &mem_pool_info[i];
 395
 396                 /* % beats absolute blocks */
 397                 if (pi->centipercent >= 0)
 398                         pi->blocks = (pool_blocks * pi->centipercent) / 10000;
 399
 400                 if (pi->blocks == 0 && pi->count != 0) {
 401                         dd_dev_err(
 402                                 dd,
 403                                 "Send context memory pool %d has %u contexts, but no blocks\n",
 404                                 i, pi->count);
 405                         return -EINVAL;
 406                 }
 407                 if (pi->count == 0) {
 408                         /* warn about wasted blocks */
 409                         if (pi->blocks != 0)
 410                                 dd_dev_err(
 411                                         dd,
 412                                         "Send context memory pool %d has %u blocks, but zero contexts\n",
 413                                         i, pi->blocks);
 414                         pi->size = 0;
 415                 } else {
 416                         pi->size = pi->blocks / pi->count;
 417                 }
 418         }
 419
 420         /* step 4: fill in the context type sizes from the pool sizes */
 421         used_blocks = 0;
 422         for (i = 0; i < SC_MAX; i++) {
 423                 if (dd->sc_sizes[i].size < 0) {
 424                         unsigned pool = wildcard_to_pool(dd->sc_sizes[i].size);
 425
 426                         WARN_ON_ONCE(pool >= NUM_SC_POOLS);
 427                         dd->sc_sizes[i].size = mem_pool_info[pool].size;
 428                 }
 429                 /* make sure we are not larger than what is allowed by the HW */
 430 #define PIO_MAX_BLOCKS 1024
 431                 if (dd->sc_sizes[i].size > PIO_MAX_BLOCKS)
 432                         dd->sc_sizes[i].size = PIO_MAX_BLOCKS;
 433
 434                 /* calculate our total usage */
 435                 used_blocks += dd->sc_sizes[i].size * dd->sc_sizes[i].count;
 436         }
 437         extra = total_blocks - used_blocks;
 438         if (extra != 0)
 439                 dd_dev_info(dd, "unused send context blocks: %d\n", extra);
 440
 441         return total_contexts;
 442 }
 443
 444 int init_send_contexts(struct hfi1_devdata *dd)
 445 {
 446         u16 base;
 447         int ret, i, j, context;
 448
 449         ret = init_credit_return(dd);
 450         if (ret)
 451                 return ret;
 452
 453         dd->hw_to_sw = kmalloc_array(TXE_NUM_CONTEXTS, sizeof(u8),
 454                                         GFP_KERNEL);
 455         dd->send_contexts = kcalloc(dd->num_send_contexts,
 456                                         sizeof(struct send_context_info),
 457                                         GFP_KERNEL);
 458         if (!dd->send_contexts || !dd->hw_to_sw) {
 459                 kfree(dd->hw_to_sw);
 460                 kfree(dd->send_contexts);
 461                 free_credit_return(dd);
 462                 return -ENOMEM;
 463         }
 464
 465         /* hardware context map starts with invalid send context indices */
 466         for (i = 0; i < TXE_NUM_CONTEXTS; i++)
 467                 dd->hw_to_sw[i] = INVALID_SCI;
 468
 469         /*
 470          * All send contexts have their credit sizes.  Allocate credits
 471          * for each context one after another from the global space.
 472          */
 473         context = 0;
 474         base = 1;
 475         for (i = 0; i < SC_MAX; i++) {
 476                 struct sc_config_sizes *scs = &dd->sc_sizes[i];
 477
 478                 for (j = 0; j < scs->count; j++) {
 479                         struct send_context_info *sci =
 480                                                 &dd->send_contexts[context];
 481                         sci->type = i;
 482                         sci->base = base;
 483                         sci->credits = scs->size;
 484
 485                         context++;
 486                         base += scs->size;
 487                 }
 488         }
 489
 490         return 0;
 491 }
 492
 493 /*
 494  * Allocate a software index and hardware context of the given type.
 495  *
 496  * Must be called with dd->sc_lock held.
 497  */
 498 static int sc_hw_alloc(struct hfi1_devdata *dd, int type, u32 *sw_index,
 499                        u32 *hw_context)
 500 {
 501         struct send_context_info *sci;
 502         u32 index;
 503         u32 context;
 504
 505         for (index = 0, sci = &dd->send_contexts[0];
 506                         index < dd->num_send_contexts; index++, sci++) {
 507                 if (sci->type == type && sci->allocated == 0) {
 508                         sci->allocated = 1;
 509                         /* use a 1:1 mapping, but make them non-equal */
 510                         context = dd->chip_send_contexts - index - 1;
 511                         dd->hw_to_sw[context] = index;
 512                         *sw_index = index;
 513                         *hw_context = context;
 514                         return 0; /* success */
 515                 }
 516         }
 517         dd_dev_err(dd, "Unable to locate a free type %d send context\n", type);
 518         return -ENOSPC;
 519 }
 520
 521 /*
 522  * Free the send context given by its software index.
 523  *
 524  * Must be called with dd->sc_lock held.
 525  */
 526 static void sc_hw_free(struct hfi1_devdata *dd, u32 sw_index, u32 hw_context)
 527 {
 528         struct send_context_info *sci;
 529
 530         sci = &dd->send_contexts[sw_index];
 531         if (!sci->allocated) {
 532                 dd_dev_err(dd, "%s: sw_index %u not allocated? hw_context %u\n",
 533                            __func__, sw_index, hw_context);
 534         }
 535         sci->allocated = 0;
 536         dd->hw_to_sw[hw_context] = INVALID_SCI;
 537 }
 538
 539 /* return the base context of a context in a group */
 540 static inline u32 group_context(u32 context, u32 group)
 541 {
 542         return (context >> group) << group;
 543 }
 544
 545 /* return the size of a group */
 546 static inline u32 group_size(u32 group)
 547 {
 548         return 1 << group;
 549 }
 550
 551 /*
 552  * Obtain the credit return addresses, kernel virtual and bus, for the
 553  * given sc.
 554  *
 555  * To understand this routine:
 556  * o va and dma are arrays of struct credit_return.  One for each physical
 557  *   send context, per NUMA.
 558  * o Each send context always looks in its relative location in a struct
 559  *   credit_return for its credit return.
 560  * o Each send context in a group must have its return address CSR programmed
 561  *   with the same value.  Use the address of the first send context in the
 562  *   group.
 563  */
 564 static void cr_group_addresses(struct send_context *sc, dma_addr_t *dma)
 565 {
 566         u32 gc = group_context(sc->hw_context, sc->group);
 567         u32 index = sc->hw_context & 0x7;
 568
 569         sc->hw_free = &sc->dd->cr_base[sc->node].va[gc].cr[index];
 570         *dma = (unsigned long)
 571                &((struct credit_return *)sc->dd->cr_base[sc->node].dma)[gc];
 572 }
 573
 574 /*
 575  * Work queue function triggered in error interrupt routine for
 576  * kernel contexts.
 577  */
 578 static void sc_halted(struct work_struct *work)
 579 {
 580         struct send_context *sc;
 581
 582         sc = container_of(work, struct send_context, halt_work);
 583         sc_restart(sc);
 584 }
 585
 586 /*
 587  * Calculate PIO block threshold for this send context using the given MTU.
 588  * Trigger a return when one MTU plus optional header of credits remain.
 589  *
 590  * Parameter mtu is in bytes.
 591  * Parameter hdrqentsize is in DWORDs.
 592  *
 593  * Return value is what to write into the CSR: trigger return when
 594  * unreturned credits pass this count.
 595  */
 596 u32 sc_mtu_to_threshold(struct send_context *sc, u32 mtu, u32 hdrqentsize)
 597 {
 598         u32 release_credits;
 599         u32 threshold;
 600
 601         /* add in the header size, then divide by the PIO block size */
 602         mtu += hdrqentsize << 2;
 603         release_credits = DIV_ROUND_UP(mtu, PIO_BLOCK_SIZE);
 604
 605         /* check against this context's credits */
 606         if (sc->credits <= release_credits)
 607                 threshold = 1;
 608         else
 609                 threshold = sc->credits - release_credits;
 610
 611         return threshold;
 612 }
 613
 614 /*
 615  * Calculate credit threshold in terms of percent of the allocated credits.
 616  * Trigger when unreturned credits equal or exceed the percentage of the whole.
 617  *
 618  * Return value is what to write into the CSR: trigger return when
 619  * unreturned credits pass this count.
 620  */
 621 u32 sc_percent_to_threshold(struct send_context *sc, u32 percent)
 622 {
 623         return (sc->credits * percent) / 100;
 624 }
 625
 626 /*
 627  * Set the credit return threshold.
 628  */
 629 void sc_set_cr_threshold(struct send_context *sc, u32 new_threshold)
 630 {
 631         unsigned long flags;
 632         u32 old_threshold;
 633         int force_return = 0;
 634
 635         spin_lock_irqsave(&sc->credit_ctrl_lock, flags);
 636
 637         old_threshold = (sc->credit_ctrl >>
 638                                 SC(CREDIT_CTRL_THRESHOLD_SHIFT))
 639                          & SC(CREDIT_CTRL_THRESHOLD_MASK);
 640
 641         if (new_threshold != old_threshold) {
 642                 sc->credit_ctrl =
 643                         (sc->credit_ctrl
 644                                 & ~SC(CREDIT_CTRL_THRESHOLD_SMASK))
 645                         | ((new_threshold
 646                                 & SC(CREDIT_CTRL_THRESHOLD_MASK))
 647                            << SC(CREDIT_CTRL_THRESHOLD_SHIFT));
 648                 write_kctxt_csr(sc->dd, sc->hw_context,
 649                                 SC(CREDIT_CTRL), sc->credit_ctrl);
 650
 651                 /* force a credit return on change to avoid a possible stall */
 652                 force_return = 1;
 653         }
 654
 655         spin_unlock_irqrestore(&sc->credit_ctrl_lock, flags);
 656
 657         if (force_return)
 658                 sc_return_credits(sc);
 659 }
 660
 661 /*
 662  * set_pio_integrity
 663  *
 664  * Set the CHECK_ENABLE register for the send context 'sc'.
 665  */
 666 void set_pio_integrity(struct send_context *sc)
 667 {
 668         struct hfi1_devdata *dd = sc->dd;
 669         u32 hw_context = sc->hw_context;
 670         int type = sc->type;
 671
 672         write_kctxt_csr(dd, hw_context,
 673                         SC(CHECK_ENABLE),
 674                         hfi1_pkt_default_send_ctxt_mask(dd, type));
 675 }
 676
 677 static u32 get_buffers_allocated(struct send_context *sc)
 678 {
 679         int cpu;
 680         u32 ret = 0;
 681
 682         for_each_possible_cpu(cpu)
 683                 ret += *per_cpu_ptr(sc->buffers_allocated, cpu);
 684         return ret;
 685 }
 686
 687 static void reset_buffers_allocated(struct send_context *sc)
 688 {
 689         int cpu;
 690
 691         for_each_possible_cpu(cpu)
 692                 (*per_cpu_ptr(sc->buffers_allocated, cpu)) = 0;
 693 }
 694
 695 /*
 696  * Allocate a NUMA relative send context structure of the given type along
 697  * with a HW context.
 698  */
 699 struct send_context *sc_alloc(struct hfi1_devdata *dd, int type,
 700                               uint hdrqentsize, int numa)
 701 {
 702         struct send_context_info *sci;
 703         struct send_context *sc = NULL;
 704         dma_addr_t dma;
 705         unsigned long flags;
 706         u64 reg;
 707         u32 thresh;
 708         u32 sw_index;
 709         u32 hw_context;
 710         int ret;
 711         u8 opval, opmask;
 712
 713         /* do not allocate while frozen */
 714         if (dd->flags & HFI1_FROZEN)
 715                 return NULL;
 716
 717         sc = kzalloc_node(sizeof(*sc), GFP_KERNEL, numa);
 718         if (!sc)
 719                 return NULL;
 720
 721         sc->buffers_allocated = alloc_percpu(u32);
 722         if (!sc->buffers_allocated) {
 723                 kfree(sc);
 724                 dd_dev_err(dd,
 725                            "Cannot allocate buffers_allocated per cpu counters\n"
 726                           );
 727                 return NULL;
 728         }
 729
 730         spin_lock_irqsave(&dd->sc_lock, flags);
 731         ret = sc_hw_alloc(dd, type, &sw_index, &hw_context);
 732         if (ret) {
 733                 spin_unlock_irqrestore(&dd->sc_lock, flags);
 734                 free_percpu(sc->buffers_allocated);
 735                 kfree(sc);
 736                 return NULL;
 737         }
 738
 739         sci = &dd->send_contexts[sw_index];
 740         sci->sc = sc;
 741
 742         sc->dd = dd;
 743         sc->node = numa;
 744         sc->type = type;
 745         spin_lock_init(&sc->alloc_lock);
 746         spin_lock_init(&sc->release_lock);
 747         spin_lock_init(&sc->credit_ctrl_lock);
 748         INIT_LIST_HEAD(&sc->piowait);
 749         INIT_WORK(&sc->halt_work, sc_halted);
 750         init_waitqueue_head(&sc->halt_wait);
 751
 752         /* grouping is always single context for now */
 753         sc->group = 0;
 754
 755         sc->sw_index = sw_index;
 756         sc->hw_context = hw_context;
 757         cr_group_addresses(sc, &dma);
 758         sc->credits = sci->credits;
 759         sc->size = sc->credits * PIO_BLOCK_SIZE;
 760
 761 /* PIO Send Memory Address details */
 762 #define PIO_ADDR_CONTEXT_MASK 0xfful
 763 #define PIO_ADDR_CONTEXT_SHIFT 16
 764         sc->base_addr = dd->piobase + ((hw_context & PIO_ADDR_CONTEXT_MASK)
 765                                         << PIO_ADDR_CONTEXT_SHIFT);
 766
 767         /* set base and credits */
 768         reg = ((sci->credits & SC(CTRL_CTXT_DEPTH_MASK))
 769                                         << SC(CTRL_CTXT_DEPTH_SHIFT))
 770                 | ((sci->base & SC(CTRL_CTXT_BASE_MASK))
 771                                         << SC(CTRL_CTXT_BASE_SHIFT));
 772         write_kctxt_csr(dd, hw_context, SC(CTRL), reg);
 773
 774         set_pio_integrity(sc);
 775
 776         /* unmask all errors */
 777         write_kctxt_csr(dd, hw_context, SC(ERR_MASK), (u64)-1);
 778
 779         /* set the default partition key */
 780         write_kctxt_csr(dd, hw_context, SC(CHECK_PARTITION_KEY),
 781                         (SC(CHECK_PARTITION_KEY_VALUE_MASK) &
 782                          DEFAULT_PKEY) <<
 783                         SC(CHECK_PARTITION_KEY_VALUE_SHIFT));
 784
 785         /* per context type checks */
 786         if (type == SC_USER) {
 787                 opval = USER_OPCODE_CHECK_VAL;
 788                 opmask = USER_OPCODE_CHECK_MASK;
 789         } else {
 790                 opval = OPCODE_CHECK_VAL_DISABLED;
 791                 opmask = OPCODE_CHECK_MASK_DISABLED;
 792         }
 793
 794         /* set the send context check opcode mask and value */
 795         write_kctxt_csr(dd, hw_context, SC(CHECK_OPCODE),
 796                         ((u64)opmask << SC(CHECK_OPCODE_MASK_SHIFT)) |
 797                         ((u64)opval << SC(CHECK_OPCODE_VALUE_SHIFT)));
 798
 799         /* set up credit return */
 800         reg = dma & SC(CREDIT_RETURN_ADDR_ADDRESS_SMASK);
 801         write_kctxt_csr(dd, hw_context, SC(CREDIT_RETURN_ADDR), reg);
 802
 803         /*
 804          * Calculate the initial credit return threshold.
 805          *
 806          * For Ack contexts, set a threshold for half the credits.
 807          * For User contexts use the given percentage.  This has been
 808          * sanitized on driver start-up.
 809          * For Kernel contexts, use the default MTU plus a header
 810          * or half the credits, whichever is smaller. This should
 811          * work for both the 3-deep buffering allocation and the
 812          * pooling allocation.
 813          */
 814         if (type == SC_ACK) {
 815                 thresh = sc_percent_to_threshold(sc, 50);
 816         } else if (type == SC_USER) {
 817                 thresh = sc_percent_to_threshold(sc,
 818                                                  user_credit_return_threshold);
 819         } else { /* kernel */
 820                 thresh = min(sc_percent_to_threshold(sc, 50),
 821                              sc_mtu_to_threshold(sc, hfi1_max_mtu,
 822                                                  hdrqentsize));
 823         }
 824         reg = thresh << SC(CREDIT_CTRL_THRESHOLD_SHIFT);
 825         /* add in early return */
 826         if (type == SC_USER && HFI1_CAP_IS_USET(EARLY_CREDIT_RETURN))
 827                 reg |= SC(CREDIT_CTRL_EARLY_RETURN_SMASK);
 828         else if (HFI1_CAP_IS_KSET(EARLY_CREDIT_RETURN)) /* kernel, ack */
 829                 reg |= SC(CREDIT_CTRL_EARLY_RETURN_SMASK);
 830
 831         /* set up write-through credit_ctrl */
 832         sc->credit_ctrl = reg;
 833         write_kctxt_csr(dd, hw_context, SC(CREDIT_CTRL), reg);
 834
 835         /* User send contexts should not allow sending on VL15 */
 836         if (type == SC_USER) {
 837                 reg = 1ULL << 15;
 838                 write_kctxt_csr(dd, hw_context, SC(CHECK_VL), reg);
 839         }
 840
 841         spin_unlock_irqrestore(&dd->sc_lock, flags);
 842
 843         /*
 844          * Allocate shadow ring to track outstanding PIO buffers _after_
 845          * unlocking.  We don't know the size until the lock is held and
 846          * we can't allocate while the lock is held.  No one is using
 847          * the context yet, so allocate it now.
 848          *
 849          * User contexts do not get a shadow ring.
 850          */
 851         if (type != SC_USER) {
 852                 /*
 853                  * Size the shadow ring 1 larger than the number of credits
 854                  * so head == tail can mean empty.
 855                  */
 856                 sc->sr_size = sci->credits + 1;
 857                 sc->sr = kzalloc_node(sizeof(union pio_shadow_ring) *
 858                                 sc->sr_size, GFP_KERNEL, numa);
 859                 if (!sc->sr) {
 860                         sc_free(sc);
 861                         return NULL;
 862                 }
 863         }
 864
 865         hfi1_cdbg(PIO,
 866                   "Send context %u(%u) %s group %u credits %u credit_ctrl 0x%llx threshold %u\n",
 867                   sw_index,
 868                   hw_context,
 869                   sc_type_name(type),
 870                   sc->group,
 871                   sc->credits,
 872                   sc->credit_ctrl,
 873                   thresh);
 874
 875         return sc;
 876 }
 877
 878 /* free a per-NUMA send context structure */
 879 void sc_free(struct send_context *sc)
 880 {
 881         struct hfi1_devdata *dd;
 882         unsigned long flags;
 883         u32 sw_index;
 884         u32 hw_context;
 885
 886         if (!sc)
 887                 return;
 888
 889         sc->flags |= SCF_IN_FREE;       /* ensure no restarts */
 890         dd = sc->dd;
 891         if (!list_empty(&sc->piowait))
 892                 dd_dev_err(dd, "piowait list not empty!\n");
 893         sw_index = sc->sw_index;
 894         hw_context = sc->hw_context;
 895         sc_disable(sc); /* make sure the HW is disabled */
 896         flush_work(&sc->halt_work);
 897
 898         spin_lock_irqsave(&dd->sc_lock, flags);
 899         dd->send_contexts[sw_index].sc = NULL;
 900
 901         /* clear/disable all registers set in sc_alloc */
 902         write_kctxt_csr(dd, hw_context, SC(CTRL), 0);
 903         write_kctxt_csr(dd, hw_context, SC(CHECK_ENABLE), 0);
 904         write_kctxt_csr(dd, hw_context, SC(ERR_MASK), 0);
 905         write_kctxt_csr(dd, hw_context, SC(CHECK_PARTITION_KEY), 0);
 906         write_kctxt_csr(dd, hw_context, SC(CHECK_OPCODE), 0);
 907         write_kctxt_csr(dd, hw_context, SC(CREDIT_RETURN_ADDR), 0);
 908         write_kctxt_csr(dd, hw_context, SC(CREDIT_CTRL), 0);
 909
 910         /* release the index and context for re-use */
 911         sc_hw_free(dd, sw_index, hw_context);
 912         spin_unlock_irqrestore(&dd->sc_lock, flags);
 913
 914         kfree(sc->sr);
 915         free_percpu(sc->buffers_allocated);
 916         kfree(sc);
 917 }
 918
 919 /* disable the context */
 920 void sc_disable(struct send_context *sc)
 921 {
 922         u64 reg;
 923         unsigned long flags;
 924         struct pio_buf *pbuf;
 925
 926         if (!sc)
 927                 return;
 928
 929         /* do all steps, even if already disabled */
 930         spin_lock_irqsave(&sc->alloc_lock, flags);
 931         reg = read_kctxt_csr(sc->dd, sc->hw_context, SC(CTRL));
 932         reg &= ~SC(CTRL_CTXT_ENABLE_SMASK);
 933         sc->flags &= ~SCF_ENABLED;
 934         sc_wait_for_packet_egress(sc, 1);
 935         write_kctxt_csr(sc->dd, sc->hw_context, SC(CTRL), reg);
 936         spin_unlock_irqrestore(&sc->alloc_lock, flags);
 937
 938         /*
 939          * Flush any waiters.  Once the context is disabled,
 940          * credit return interrupts are stopped (although there
 941          * could be one in-process when the context is disabled).
 942          * Wait one microsecond for any lingering interrupts, then
 943          * proceed with the flush.
 944          */
 945         udelay(1);
 946         spin_lock_irqsave(&sc->release_lock, flags);
 947         if (sc->sr) {   /* this context has a shadow ring */
 948                 while (sc->sr_tail != sc->sr_head) {
 949                         pbuf = &sc->sr[sc->sr_tail].pbuf;
 950                         if (pbuf->cb)
 951                                 (*pbuf->cb)(pbuf->arg, PRC_SC_DISABLE);
 952                         sc->sr_tail++;
 953                         if (sc->sr_tail >= sc->sr_size)
 954                                 sc->sr_tail = 0;
 955                 }
 956         }
 957         spin_unlock_irqrestore(&sc->release_lock, flags);
 958 }
 959
 960 /* return SendEgressCtxtStatus.PacketOccupancy */
 961 static u64 packet_occupancy(u64 reg)
 962 {
 963         return (reg &
 964                 SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_PACKET_OCCUPANCY_SMASK)
 965                 >> SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_PACKET_OCCUPANCY_SHIFT;
 966 }
 967
 968 /* is egress halted on the context? */
 969 static bool egress_halted(u64 reg)
 970 {
 971         return !!(reg & SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_HALT_STATUS_SMASK);
 972 }
 973
 974 /* is the send context halted? */
 975 static bool is_sc_halted(struct hfi1_devdata *dd, u32 hw_context)
 976 {
 977         return !!(read_kctxt_csr(dd, hw_context, SC(STATUS)) &
 978                   SC(STATUS_CTXT_HALTED_SMASK));
 979 }
 980
 981 /**
 982  * sc_wait_for_packet_egress
 983  * @sc: valid send context
 984  * @pause: wait for credit return
 985  *
 986  * Wait for packet egress, optionally pause for credit return
 987  *
 988  * Egress halt and Context halt are not necessarily the same thing, so
 989  * check for both.
 990  *
 991  * NOTE: The context halt bit may not be set immediately.  Because of this,
 992  * it is necessary to check the SW SFC_HALTED bit (set in the IRQ) and the HW
 993  * context bit to determine if the context is halted.
 994  */
 995 static void sc_wait_for_packet_egress(struct send_context *sc, int pause)
 996 {
 997         struct hfi1_devdata *dd = sc->dd;
 998         u64 reg = 0;
 999         u64 reg_prev;
1000         u32 loop = 0;
1001
1002         while (1) {
1003                 reg_prev = reg;
1004                 reg = read_csr(dd, sc->hw_context * 8 +
1005                                SEND_EGRESS_CTXT_STATUS);
1006                 /* done if any halt bits, SW or HW are set */
1007                 if (sc->flags & SCF_HALTED ||
1008                     is_sc_halted(dd, sc->hw_context) || egress_halted(reg))
1009                         break;
1010                 reg = packet_occupancy(reg);
1011                 if (reg == 0)
1012                         break;
1013                 /* counter is reset if occupancy count changes */
1014                 if (reg != reg_prev)
1015                         loop = 0;
1016                 if (loop > 50000) {
1017                         /* timed out - bounce the link */
1018                         dd_dev_err(dd,
1019                                    "%s: context %u(%u) timeout waiting for packets to egress, remaining count %u, bouncing link\n",
1020                                    __func__, sc->sw_index,
1021                                    sc->hw_context, (u32)reg);
1022                         queue_work(dd->pport->link_wq,
1023                                    &dd->pport->link_bounce_work);
1024                         break;
1025                 }
1026                 loop++;
1027                 udelay(1);
1028         }
1029
1030         if (pause)
1031                 /* Add additional delay to ensure chip returns all credits */
1032                 pause_for_credit_return(dd);
1033 }
1034
1035 void sc_wait(struct hfi1_devdata *dd)
1036 {
1037         int i;
1038
1039         for (i = 0; i < dd->num_send_contexts; i++) {
1040                 struct send_context *sc = dd->send_contexts[i].sc;
1041
1042                 if (!sc)
1043                         continue;
1044                 sc_wait_for_packet_egress(sc, 0);
1045         }
1046 }
1047
1048 /*
1049  * Restart a context after it has been halted due to error.
1050  *
1051  * If the first step fails - wait for the halt to be asserted, return early.
1052  * Otherwise complain about timeouts but keep going.
1053  *
1054  * It is expected that allocations (enabled flag bit) have been shut off
1055  * already (only applies to kernel contexts).
1056  */
1057 int sc_restart(struct send_context *sc)
1058 {
1059         struct hfi1_devdata *dd = sc->dd;
1060         u64 reg;
1061         u32 loop;
1062         int count;
1063
1064         /* bounce off if not halted, or being free'd */
1065         if (!(sc->flags & SCF_HALTED) || (sc->flags & SCF_IN_FREE))
1066                 return -EINVAL;
1067
1068         dd_dev_info(dd, "restarting send context %u(%u)\n", sc->sw_index,
1069                     sc->hw_context);
1070
1071         /*
1072          * Step 1: Wait for the context to actually halt.
1073          *
1074          * The error interrupt is asynchronous to actually setting halt
1075          * on the context.
1076          */
1077         loop = 0;
1078         while (1) {
1079                 reg = read_kctxt_csr(dd, sc->hw_context, SC(STATUS));
1080                 if (reg & SC(STATUS_CTXT_HALTED_SMASK))
1081                         break;
1082                 if (loop > 100) {
1083                         dd_dev_err(dd, "%s: context %u(%u) not halting, skipping\n",
1084                                    __func__, sc->sw_index, sc->hw_context);
1085                         return -ETIME;
1086                 }
1087                 loop++;
1088                 udelay(1);
1089         }
1090
1091         /*
1092          * Step 2: Ensure no users are still trying to write to PIO.
1093          *
1094          * For kernel contexts, we have already turned off buffer allocation.
1095          * Now wait for the buffer count to go to zero.
1096          *
1097          * For user contexts, the user handling code has cut off write access
1098          * to the context's PIO pages before calling this routine and will
1099          * restore write access after this routine returns.
1100          */
1101         if (sc->type != SC_USER) {
1102                 /* kernel context */
1103                 loop = 0;
1104                 while (1) {
1105                         count = get_buffers_allocated(sc);
1106                         if (count == 0)
1107                                 break;
1108                         if (loop > 100) {
1109                                 dd_dev_err(dd,
1110                                            "%s: context %u(%u) timeout waiting for PIO buffers to zero, remaining %d\n",
1111                                            __func__, sc->sw_index,
1112                                            sc->hw_context, count);
1113                         }
1114                         loop++;
1115                         udelay(1);
1116                 }
1117         }
1118
1119         /*
1120          * Step 3: Wait for all packets to egress.
1121          * This is done while disabling the send context
1122          *
1123          * Step 4: Disable the context
1124          *
1125          * This is a superset of the halt.  After the disable, the
1126          * errors can be cleared.
1127          */
1128         sc_disable(sc);
1129
1130         /*
1131          * Step 5: Enable the context
1132          *
1133          * This enable will clear the halted flag and per-send context
1134          * error flags.
1135          */
1136         return sc_enable(sc);
1137 }
1138
1139 /*
1140  * PIO freeze processing.  To be called after the TXE block is fully frozen.
1141  * Go through all frozen send contexts and disable them.  The contexts are
1142  * already stopped by the freeze.
1143  */
1144 void pio_freeze(struct hfi1_devdata *dd)
1145 {
1146         struct send_context *sc;
1147         int i;
1148
1149         for (i = 0; i < dd->num_send_contexts; i++) {
1150                 sc = dd->send_contexts[i].sc;
1151                 /*
1152                  * Don't disable unallocated, unfrozen, or user send contexts.
1153                  * User send contexts will be disabled when the process
1154                  * calls into the driver to reset its context.
1155                  */
1156                 if (!sc || !(sc->flags & SCF_FROZEN) || sc->type == SC_USER)
1157                         continue;
1158
1159                 /* only need to disable, the context is already stopped */
1160                 sc_disable(sc);
1161         }
1162 }
1163
1164 /*
1165  * Unfreeze PIO for kernel send contexts.  The precondition for calling this
1166  * is that all PIO send contexts have been disabled and the SPC freeze has
1167  * been cleared.  Now perform the last step and re-enable each kernel context.
1168  * User (PSM) processing will occur when PSM calls into the kernel to
1169  * acknowledge the freeze.
1170  */
1171 void pio_kernel_unfreeze(struct hfi1_devdata *dd)
1172 {
1173         struct send_context *sc;
1174         int i;
1175
1176         for (i = 0; i < dd->num_send_contexts; i++) {
1177                 sc = dd->send_contexts[i].sc;
1178                 if (!sc || !(sc->flags & SCF_FROZEN) || sc->type == SC_USER)
1179                         continue;
1180
1181                 sc_enable(sc);  /* will clear the sc frozen flag */
1182         }
1183 }
1184
1185 /*
1186  * Wait for the SendPioInitCtxt.PioInitInProgress bit to clear.
1187  * Returns:
1188  *      -ETIMEDOUT - if we wait too long
1189  *      -EIO       - if there was an error
1190  */
1191 static int pio_init_wait_progress(struct hfi1_devdata *dd)
1192 {
1193         u64 reg;
1194         int max, count = 0;
1195
1196         /* max is the longest possible HW init time / delay */
1197         max = (dd->icode == ICODE_FPGA_EMULATION) ? 120 : 5;
1198         while (1) {
1199                 reg = read_csr(dd, SEND_PIO_INIT_CTXT);
1200                 if (!(reg & SEND_PIO_INIT_CTXT_PIO_INIT_IN_PROGRESS_SMASK))
1201                         break;
1202                 if (count >= max)
1203                         return -ETIMEDOUT;
1204                 udelay(5);
1205                 count++;
1206         }
1207
1208         return reg & SEND_PIO_INIT_CTXT_PIO_INIT_ERR_SMASK ? -EIO : 0;
1209 }
1210
1211 /*
1212  * Reset all of the send contexts to their power-on state.  Used
1213  * only during manual init - no lock against sc_enable needed.
1214  */
1215 void pio_reset_all(struct hfi1_devdata *dd)
1216 {
1217         int ret;
1218
1219         /* make sure the init engine is not busy */
1220         ret = pio_init_wait_progress(dd);
1221         /* ignore any timeout */
1222         if (ret == -EIO) {
1223                 /* clear the error */
1224                 write_csr(dd, SEND_PIO_ERR_CLEAR,
1225                           SEND_PIO_ERR_CLEAR_PIO_INIT_SM_IN_ERR_SMASK);
1226         }
1227
1228         /* reset init all */
1229         write_csr(dd, SEND_PIO_INIT_CTXT,
1230                   SEND_PIO_INIT_CTXT_PIO_ALL_CTXT_INIT_SMASK);
1231         udelay(2);
1232         ret = pio_init_wait_progress(dd);
1233         if (ret < 0) {
1234                 dd_dev_err(dd,
1235                            "PIO send context init %s while initializing all PIO blocks\n",
1236                            ret == -ETIMEDOUT ? "is stuck" : "had an error");
1237         }
1238 }
1239
1240 /* enable the context */
1241 int sc_enable(struct send_context *sc)
1242 {
1243         u64 sc_ctrl, reg, pio;
1244         struct hfi1_devdata *dd;
1245         unsigned long flags;
1246         int ret = 0;
1247
1248         if (!sc)
1249                 return -EINVAL;
1250         dd = sc->dd;
1251
1252         /*
1253          * Obtain the allocator lock to guard against any allocation
1254          * attempts (which should not happen prior to context being
1255          * enabled). On the release/disable side we don't need to
1256          * worry about locking since the releaser will not do anything
1257          * if the context accounting values have not changed.
1258          */
1259         spin_lock_irqsave(&sc->alloc_lock, flags);
1260         sc_ctrl = read_kctxt_csr(dd, sc->hw_context, SC(CTRL));
1261         if ((sc_ctrl & SC(CTRL_CTXT_ENABLE_SMASK)))
1262                 goto unlock; /* already enabled */
1263
1264         /* IMPORTANT: only clear free and fill if transitioning 0 -> 1 */
1265
1266         *sc->hw_free = 0;
1267         sc->free = 0;
1268         sc->alloc_free = 0;
1269         sc->fill = 0;
1270         sc->fill_wrap = 0;
1271         sc->sr_head = 0;
1272         sc->sr_tail = 0;
1273         sc->flags = 0;
1274         /* the alloc lock insures no fast path allocation */
1275         reset_buffers_allocated(sc);
1276
1277         /*
1278          * Clear all per-context errors.  Some of these will be set when
1279          * we are re-enabling after a context halt.  Now that the context
1280          * is disabled, the halt will not clear until after the PIO init
1281          * engine runs below.
1282          */
1283         reg = read_kctxt_csr(dd, sc->hw_context, SC(ERR_STATUS));
1284         if (reg)
1285                 write_kctxt_csr(dd, sc->hw_context, SC(ERR_CLEAR), reg);
1286
1287         /*
1288          * The HW PIO initialization engine can handle only one init
1289          * request at a time. Serialize access to each device's engine.
1290          */
1291         spin_lock(&dd->sc_init_lock);
1292         /*
1293          * Since access to this code block is serialized and
1294          * each access waits for the initialization to complete
1295          * before releasing the lock, the PIO initialization engine
1296          * should not be in use, so we don't have to wait for the
1297          * InProgress bit to go down.
1298          */
1299         pio = ((sc->hw_context & SEND_PIO_INIT_CTXT_PIO_CTXT_NUM_MASK) <<
1300                SEND_PIO_INIT_CTXT_PIO_CTXT_NUM_SHIFT) |
1301                 SEND_PIO_INIT_CTXT_PIO_SINGLE_CTXT_INIT_SMASK;
1302         write_csr(dd, SEND_PIO_INIT_CTXT, pio);
1303         /*
1304          * Wait until the engine is done.  Give the chip the required time
1305          * so, hopefully, we read the register just once.
1306          */
1307         udelay(2);
1308         ret = pio_init_wait_progress(dd);
1309         spin_unlock(&dd->sc_init_lock);
1310         if (ret) {
1311                 dd_dev_err(dd,
1312                            "sctxt%u(%u): Context not enabled due to init failure %d\n",
1313                            sc->sw_index, sc->hw_context, ret);
1314                 goto unlock;
1315         }
1316
1317         /*
1318          * All is well. Enable the context.
1319          */
1320         sc_ctrl |= SC(CTRL_CTXT_ENABLE_SMASK);
1321         write_kctxt_csr(dd, sc->hw_context, SC(CTRL), sc_ctrl);
1322         /*
1323          * Read SendCtxtCtrl to force the write out and prevent a timing
1324          * hazard where a PIO write may reach the context before the enable.
1325          */
1326         read_kctxt_csr(dd, sc->hw_context, SC(CTRL));
1327         sc->flags |= SCF_ENABLED;
1328
1329 unlock:
1330         spin_unlock_irqrestore(&sc->alloc_lock, flags);
1331
1332         return ret;
1333 }
1334
1335 /* force a credit return on the context */
1336 void sc_return_credits(struct send_context *sc)
1337 {
1338         if (!sc)
1339                 return;
1340
1341         /* a 0->1 transition schedules a credit return */
1342         write_kctxt_csr(sc->dd, sc->hw_context, SC(CREDIT_FORCE),
1343                         SC(CREDIT_FORCE_FORCE_RETURN_SMASK));
1344         /*
1345          * Ensure that the write is flushed and the credit return is
1346          * scheduled. We care more about the 0 -> 1 transition.
1347          */
1348         read_kctxt_csr(sc->dd, sc->hw_context, SC(CREDIT_FORCE));
1349         /* set back to 0 for next time */
1350         write_kctxt_csr(sc->dd, sc->hw_context, SC(CREDIT_FORCE), 0);
1351 }
1352
1353 /* allow all in-flight packets to drain on the context */
1354 void sc_flush(struct send_context *sc)
1355 {
1356         if (!sc)
1357                 return;
1358
1359         sc_wait_for_packet_egress(sc, 1);
1360 }
1361
1362 /* drop all packets on the context, no waiting until they are sent */
1363 void sc_drop(struct send_context *sc)
1364 {
1365         if (!sc)
1366                 return;
1367
1368         dd_dev_info(sc->dd, "%s: context %u(%u) - not implemented\n",
1369                     __func__, sc->sw_index, sc->hw_context);
1370 }
1371
1372 /*
1373  * Start the software reaction to a context halt or SPC freeze:
1374  *      - mark the context as halted or frozen
1375  *      - stop buffer allocations
1376  *
1377  * Called from the error interrupt.  Other work is deferred until
1378  * out of the interrupt.
1379  */
1380 void sc_stop(struct send_context *sc, int flag)
1381 {
1382         unsigned long flags;
1383
1384         /* mark the context */
1385         sc->flags |= flag;
1386
1387         /* stop buffer allocations */
1388         spin_lock_irqsave(&sc->alloc_lock, flags);
1389         sc->flags &= ~SCF_ENABLED;
1390         spin_unlock_irqrestore(&sc->alloc_lock, flags);
1391         wake_up(&sc->halt_wait);
1392 }
1393
1394 #define BLOCK_DWORDS (PIO_BLOCK_SIZE / sizeof(u32))
1395 #define dwords_to_blocks(x) DIV_ROUND_UP(x, BLOCK_DWORDS)
1396
1397 /*
1398  * The send context buffer "allocator".
1399  *
1400  * @sc: the PIO send context we are allocating from
1401  * @len: length of whole packet - including PBC - in dwords
1402  * @cb: optional callback to call when the buffer is finished sending
1403  * @arg: argument for cb
1404  *
1405  * Return a pointer to a PIO buffer if successful, NULL if not enough room.
1406  */
1407 struct pio_buf *sc_buffer_alloc(struct send_context *sc, u32 dw_len,
1408                                 pio_release_cb cb, void *arg)
1409 {
1410         struct pio_buf *pbuf = NULL;
1411         unsigned long flags;
1412         unsigned long avail;
1413         unsigned long blocks = dwords_to_blocks(dw_len);
1414         u32 fill_wrap;
1415         int trycount = 0;
1416         u32 head, next;
1417
1418         spin_lock_irqsave(&sc->alloc_lock, flags);
1419         if (!(sc->flags & SCF_ENABLED)) {
1420                 spin_unlock_irqrestore(&sc->alloc_lock, flags);
1421                 goto done;
1422         }
1423
1424 retry:
1425         avail = (unsigned long)sc->credits - (sc->fill - sc->alloc_free);
1426         if (blocks > avail) {
1427                 /* not enough room */
1428                 if (unlikely(trycount)) { /* already tried to get more room */
1429                         spin_unlock_irqrestore(&sc->alloc_lock, flags);
1430                         goto done;
1431                 }
1432                 /* copy from receiver cache line and recalculate */
1433                 sc->alloc_free = READ_ONCE(sc->free);
1434                 avail =
1435                         (unsigned long)sc->credits -
1436                         (sc->fill - sc->alloc_free);
1437                 if (blocks > avail) {
1438                         /* still no room, actively update */
1439                         sc_release_update(sc);
1440                         sc->alloc_free = READ_ONCE(sc->free);
1441                         trycount++;
1442                         goto retry;
1443                 }
1444         }
1445
1446         /* there is enough room */
1447
1448         preempt_disable();
1449         this_cpu_inc(*sc->buffers_allocated);
1450
1451         /* read this once */
1452         head = sc->sr_head;
1453
1454         /* "allocate" the buffer */
1455         sc->fill += blocks;
1456         fill_wrap = sc->fill_wrap;
1457         sc->fill_wrap += blocks;
1458         if (sc->fill_wrap >= sc->credits)
1459                 sc->fill_wrap = sc->fill_wrap - sc->credits;
1460
1461         /*
1462          * Fill the parts that the releaser looks at before moving the head.
1463          * The only necessary piece is the sent_at field.  The credits
1464          * we have just allocated cannot have been returned yet, so the
1465          * cb and arg will not be looked at for a "while".  Put them
1466          * on this side of the memory barrier anyway.
1467          */
1468         pbuf = &sc->sr[head].pbuf;
1469         pbuf->sent_at = sc->fill;
1470         pbuf->cb = cb;
1471         pbuf->arg = arg;
1472         pbuf->sc = sc;  /* could be filled in at sc->sr init time */
1473         /* make sure this is in memory before updating the head */
1474
1475         /* calculate next head index, do not store */
1476         next = head + 1;
1477         if (next >= sc->sr_size)
1478                 next = 0;
1479         /*
1480          * update the head - must be last! - the releaser can look at fields
1481          * in pbuf once we move the head
1482          */
1483         smp_wmb();
1484         sc->sr_head = next;
1485         spin_unlock_irqrestore(&sc->alloc_lock, flags);
1486
1487         /* finish filling in the buffer outside the lock */
1488         pbuf->start = sc->base_addr + fill_wrap * PIO_BLOCK_SIZE;
1489         pbuf->end = sc->base_addr + sc->size;
1490         pbuf->qw_written = 0;
1491         pbuf->carry_bytes = 0;
1492         pbuf->carry.val64 = 0;
1493 done:
1494         return pbuf;
1495 }
1496
1497 /*
1498  * There are at least two entities that can turn on credit return
1499  * interrupts and they can overlap.  Avoid problems by implementing
1500  * a count scheme that is enforced by a lock.  The lock is needed because
1501  * the count and CSR write must be paired.
1502  */
1503
1504 /*
1505  * Start credit return interrupts.  This is managed by a count.  If already
1506  * on, just increment the count.
1507  */
1508 void sc_add_credit_return_intr(struct send_context *sc)
1509 {
1510         unsigned long flags;
1511
1512         /* lock must surround both the count change and the CSR update */
1513         spin_lock_irqsave(&sc->credit_ctrl_lock, flags);
1514         if (sc->credit_intr_count == 0) {
1515                 sc->credit_ctrl |= SC(CREDIT_CTRL_CREDIT_INTR_SMASK);
1516                 write_kctxt_csr(sc->dd, sc->hw_context,
1517                                 SC(CREDIT_CTRL), sc->credit_ctrl);
1518         }
1519         sc->credit_intr_count++;
1520         spin_unlock_irqrestore(&sc->credit_ctrl_lock, flags);
1521 }
1522
1523 /*
1524  * Stop credit return interrupts.  This is managed by a count.  Decrement the
1525  * count, if the last user, then turn the credit interrupts off.
1526  */
1527 void sc_del_credit_return_intr(struct send_context *sc)
1528 {
1529         unsigned long flags;
1530
1531         WARN_ON(sc->credit_intr_count == 0);
1532
1533         /* lock must surround both the count change and the CSR update */
1534         spin_lock_irqsave(&sc->credit_ctrl_lock, flags);
1535         sc->credit_intr_count--;
1536         if (sc->credit_intr_count == 0) {
1537                 sc->credit_ctrl &= ~SC(CREDIT_CTRL_CREDIT_INTR_SMASK);
1538                 write_kctxt_csr(sc->dd, sc->hw_context,
1539                                 SC(CREDIT_CTRL), sc->credit_ctrl);
1540         }
1541         spin_unlock_irqrestore(&sc->credit_ctrl_lock, flags);
1542 }
1543
1544 /*
1545  * The caller must be careful when calling this.  All needint calls
1546  * must be paired with !needint.
1547  */
1548 void hfi1_sc_wantpiobuf_intr(struct send_context *sc, u32 needint)
1549 {
1550         if (needint)
1551                 sc_add_credit_return_intr(sc);
1552         else
1553                 sc_del_credit_return_intr(sc);
1554         trace_hfi1_wantpiointr(sc, needint, sc->credit_ctrl);
1555         if (needint) {
1556                 mmiowb();
1557                 sc_return_credits(sc);
1558         }
1559 }
1560
1561 /**
1562  * sc_piobufavail - callback when a PIO buffer is available
1563  * @sc: the send context
1564  *
1565  * This is called from the interrupt handler when a PIO buffer is
1566  * available after hfi1_verbs_send() returned an error that no buffers were
1567  * available. Disable the interrupt if there are no more QPs waiting.
1568  */
1569 static void sc_piobufavail(struct send_context *sc)
1570 {
1571         struct hfi1_devdata *dd = sc->dd;
1572         struct hfi1_ibdev *dev = &dd->verbs_dev;
1573         struct list_head *list;
1574         struct rvt_qp *qps[PIO_WAIT_BATCH_SIZE];
1575         struct rvt_qp *qp;
1576         struct hfi1_qp_priv *priv;
1577         unsigned long flags;
1578         uint i, n = 0, max_idx = 0;
1579         u8 max_starved_cnt = 0;
1580
1581         if (dd->send_contexts[sc->sw_index].type != SC_KERNEL &&
1582             dd->send_contexts[sc->sw_index].type != SC_VL15)
1583                 return;
1584         list = &sc->piowait;
1585         /*
1586          * Note: checking that the piowait list is empty and clearing
1587          * the buffer available interrupt needs to be atomic or we
1588          * could end up with QPs on the wait list with the interrupt
1589          * disabled.
1590          */
1591         write_seqlock_irqsave(&dev->iowait_lock, flags);
1592         while (!list_empty(list)) {
1593                 struct iowait *wait;
1594
1595                 if (n == ARRAY_SIZE(qps))
1596                         break;
1597                 wait = list_first_entry(list, struct iowait, list);
1598                 qp = iowait_to_qp(wait);
1599                 priv = qp->priv;
1600                 list_del_init(&priv->s_iowait.list);
1601                 priv->s_iowait.lock = NULL;
1602                 iowait_starve_find_max(wait, &max_starved_cnt, n, &max_idx);
1603                 /* refcount held until actual wake up */
1604                 qps[n++] = qp;
1605         }
1606         /*
1607          * If there had been waiters and there are more
1608          * insure that we redo the force to avoid a potential hang.
1609          */
1610         if (n) {
1611                 hfi1_sc_wantpiobuf_intr(sc, 0);
1612                 if (!list_empty(list))
1613                         hfi1_sc_wantpiobuf_intr(sc, 1);
1614         }
1615         write_sequnlock_irqrestore(&dev->iowait_lock, flags);
1616
1617         /* Wake up the most starved one first */
1618         if (n)
1619                 hfi1_qp_wakeup(qps[max_idx],
1620                                RVT_S_WAIT_PIO | RVT_S_WAIT_PIO_DRAIN);
1621         for (i = 0; i < n; i++)
1622                 if (i != max_idx)
1623                         hfi1_qp_wakeup(qps[i],
1624                                        RVT_S_WAIT_PIO | RVT_S_WAIT_PIO_DRAIN);
1625 }
1626
1627 /* translate a send credit update to a bit code of reasons */
1628 static inline int fill_code(u64 hw_free)
1629 {
1630         int code = 0;
1631
1632         if (hw_free & CR_STATUS_SMASK)
1633                 code |= PRC_STATUS_ERR;
1634         if (hw_free & CR_CREDIT_RETURN_DUE_TO_PBC_SMASK)
1635                 code |= PRC_PBC;
1636         if (hw_free & CR_CREDIT_RETURN_DUE_TO_THRESHOLD_SMASK)
1637                 code |= PRC_THRESHOLD;
1638         if (hw_free & CR_CREDIT_RETURN_DUE_TO_ERR_SMASK)
1639                 code |= PRC_FILL_ERR;
1640         if (hw_free & CR_CREDIT_RETURN_DUE_TO_FORCE_SMASK)
1641                 code |= PRC_SC_DISABLE;
1642         return code;
1643 }
1644
1645 /* use the jiffies compare to get the wrap right */
1646 #define sent_before(a, b) time_before(a, b)     /* a < b */
1647
1648 /*
1649  * The send context buffer "releaser".
1650  */
1651 void sc_release_update(struct send_context *sc)
1652 {
1653         struct pio_buf *pbuf;
1654         u64 hw_free;
1655         u32 head, tail;
1656         unsigned long old_free;
1657         unsigned long free;
1658         unsigned long extra;
1659         unsigned long flags;
1660         int code;
1661
1662         if (!sc)
1663                 return;
1664
1665         spin_lock_irqsave(&sc->release_lock, flags);
1666         /* update free */
1667         hw_free = le64_to_cpu(*sc->hw_free);            /* volatile read */
1668         old_free = sc->free;
1669         extra = (((hw_free & CR_COUNTER_SMASK) >> CR_COUNTER_SHIFT)
1670                         - (old_free & CR_COUNTER_MASK))
1671                                 & CR_COUNTER_MASK;
1672         free = old_free + extra;
1673         trace_hfi1_piofree(sc, extra);
1674
1675         /* call sent buffer callbacks */
1676         code = -1;                              /* code not yet set */
1677         head = READ_ONCE(sc->sr_head);  /* snapshot the head */
1678         tail = sc->sr_tail;
1679         while (head != tail) {
1680                 pbuf = &sc->sr[tail].pbuf;
1681
1682                 if (sent_before(free, pbuf->sent_at)) {
1683                         /* not sent yet */
1684                         break;
1685                 }
1686                 if (pbuf->cb) {
1687                         if (code < 0) /* fill in code on first user */
1688                                 code = fill_code(hw_free);
1689                         (*pbuf->cb)(pbuf->arg, code);
1690                 }
1691
1692                 tail++;
1693                 if (tail >= sc->sr_size)
1694                         tail = 0;
1695         }
1696         sc->sr_tail = tail;
1697         /* make sure tail is updated before free */
1698         smp_wmb();
1699         sc->free = free;
1700         spin_unlock_irqrestore(&sc->release_lock, flags);
1701         sc_piobufavail(sc);
1702 }
1703
1704 /*
1705  * Send context group releaser.  Argument is the send context that caused
1706  * the interrupt.  Called from the send context interrupt handler.
1707  *
1708  * Call release on all contexts in the group.
1709  *
1710  * This routine takes the sc_lock without an irqsave because it is only
1711  * called from an interrupt handler.  Adjust if that changes.
1712  */
1713 void sc_group_release_update(struct hfi1_devdata *dd, u32 hw_context)
1714 {
1715         struct send_context *sc;
1716         u32 sw_index;
1717         u32 gc, gc_end;
1718
1719         spin_lock(&dd->sc_lock);
1720         sw_index = dd->hw_to_sw[hw_context];
1721         if (unlikely(sw_index >= dd->num_send_contexts)) {
1722                 dd_dev_err(dd, "%s: invalid hw (%u) to sw (%u) mapping\n",
1723                            __func__, hw_context, sw_index);
1724                 goto done;
1725         }
1726         sc = dd->send_contexts[sw_index].sc;
1727         if (unlikely(!sc))
1728                 goto done;
1729
1730         gc = group_context(hw_context, sc->group);
1731         gc_end = gc + group_size(sc->group);
1732         for (; gc < gc_end; gc++) {
1733                 sw_index = dd->hw_to_sw[gc];
1734                 if (unlikely(sw_index >= dd->num_send_contexts)) {
1735                         dd_dev_err(dd,
1736                                    "%s: invalid hw (%u) to sw (%u) mapping\n",
1737                                    __func__, hw_context, sw_index);
1738                         continue;
1739                 }
1740                 sc_release_update(dd->send_contexts[sw_index].sc);
1741         }
1742 done:
1743         spin_unlock(&dd->sc_lock);
1744 }
1745
1746 /*
1747  * pio_select_send_context_vl() - select send context
1748  * @dd: devdata
1749  * @selector: a spreading factor
1750  * @vl: this vl
1751  *
1752  * This function returns a send context based on the selector and a vl.
1753  * The mapping fields are protected by RCU
1754  */
1755 struct send_context *pio_select_send_context_vl(struct hfi1_devdata *dd,
1756                                                 u32 selector, u8 vl)
1757 {
1758         struct pio_vl_map *m;
1759         struct pio_map_elem *e;
1760         struct send_context *rval;
1761
1762         /*
1763          * NOTE This should only happen if SC->VL changed after the initial
1764          * checks on the QP/AH
1765          * Default will return VL0's send context below
1766          */
1767         if (unlikely(vl >= num_vls)) {
1768                 rval = NULL;
1769                 goto done;
1770         }
1771
1772         rcu_read_lock();
1773         m = rcu_dereference(dd->pio_map);
1774         if (unlikely(!m)) {
1775                 rcu_read_unlock();
1776                 return dd->vld[0].sc;
1777         }
1778         e = m->map[vl & m->mask];
1779         rval = e->ksc[selector & e->mask];
1780         rcu_read_unlock();
1781
1782 done:
1783         rval = !rval ? dd->vld[0].sc : rval;
1784         return rval;
1785 }
1786
1787 /*
1788  * pio_select_send_context_sc() - select send context
1789  * @dd: devdata
1790  * @selector: a spreading factor
1791  * @sc5: the 5 bit sc
1792  *
1793  * This function returns an send context based on the selector and an sc
1794  */
1795 struct send_context *pio_select_send_context_sc(struct hfi1_devdata *dd,
1796                                                 u32 selector, u8 sc5)
1797 {
1798         u8 vl = sc_to_vlt(dd, sc5);
1799
1800         return pio_select_send_context_vl(dd, selector, vl);
1801 }
1802
1803 /*
1804  * Free the indicated map struct
1805  */
1806 static void pio_map_free(struct pio_vl_map *m)
1807 {
1808         int i;
1809
1810         for (i = 0; m && i < m->actual_vls; i++)
1811                 kfree(m->map[i]);
1812         kfree(m);
1813 }
1814
1815 /*
1816  * Handle RCU callback
1817  */
1818 static void pio_map_rcu_callback(struct rcu_head *list)
1819 {
1820         struct pio_vl_map *m = container_of(list, struct pio_vl_map, list);
1821
1822         pio_map_free(m);
1823 }
1824
1825 /*
1826  * Set credit return threshold for the kernel send context
1827  */
1828 static void set_threshold(struct hfi1_devdata *dd, int scontext, int i)
1829 {
1830         u32 thres;
1831
1832         thres = min(sc_percent_to_threshold(dd->kernel_send_context[scontext],
1833                                             50),
1834                     sc_mtu_to_threshold(dd->kernel_send_context[scontext],
1835                                         dd->vld[i].mtu,
1836                                         dd->rcd[0]->rcvhdrqentsize));
1837         sc_set_cr_threshold(dd->kernel_send_context[scontext], thres);
1838 }
1839
1840 /*
1841  * pio_map_init - called when #vls change
1842  * @dd: hfi1_devdata
1843  * @port: port number
1844  * @num_vls: number of vls
1845  * @vl_scontexts: per vl send context mapping (optional)
1846  *
1847  * This routine changes the mapping based on the number of vls.
1848  *
1849  * vl_scontexts is used to specify a non-uniform vl/send context
1850  * loading. NULL implies auto computing the loading and giving each
1851  * VL an uniform distribution of send contexts per VL.
1852  *
1853  * The auto algorithm computers the sc_per_vl and the number of extra
1854  * send contexts. Any extra send contexts are added from the last VL
1855  * on down
1856  *
1857  * rcu locking is used here to control access to the mapping fields.
1858  *
1859  * If either the num_vls or num_send_contexts are non-power of 2, the
1860  * array sizes in the struct pio_vl_map and the struct pio_map_elem are
1861  * rounded up to the next highest power of 2 and the first entry is
1862  * reused in a round robin fashion.
1863  *
1864  * If an error occurs the map change is not done and the mapping is not
1865  * chaged.
1866  *
1867  */
1868 int pio_map_init(struct hfi1_devdata *dd, u8 port, u8 num_vls, u8 *vl_scontexts)
1869 {
1870         int i, j;
1871         int extra, sc_per_vl;
1872         int scontext = 1;
1873         int num_kernel_send_contexts = 0;
1874         u8 lvl_scontexts[OPA_MAX_VLS];
1875         struct pio_vl_map *oldmap, *newmap;
1876
1877         if (!vl_scontexts) {
1878                 for (i = 0; i < dd->num_send_contexts; i++)
1879                         if (dd->send_contexts[i].type == SC_KERNEL)
1880                                 num_kernel_send_contexts++;
1881                 /* truncate divide */
1882                 sc_per_vl = num_kernel_send_contexts / num_vls;
1883                 /* extras */
1884                 extra = num_kernel_send_contexts % num_vls;
1885                 vl_scontexts = lvl_scontexts;
1886                 /* add extras from last vl down */
1887                 for (i = num_vls - 1; i >= 0; i--, extra--)
1888                         vl_scontexts[i] = sc_per_vl + (extra > 0 ? 1 : 0);
1889         }
1890         /* build new map */
1891         newmap = kzalloc(sizeof(*newmap) +
1892                          roundup_pow_of_two(num_vls) *
1893                          sizeof(struct pio_map_elem *),
1894                          GFP_KERNEL);
1895         if (!newmap)
1896                 goto bail;
1897         newmap->actual_vls = num_vls;
1898         newmap->vls = roundup_pow_of_two(num_vls);
1899         newmap->mask = (1 << ilog2(newmap->vls)) - 1;
1900         for (i = 0; i < newmap->vls; i++) {
1901                 /* save for wrap around */
1902                 int first_scontext = scontext;
1903
1904                 if (i < newmap->actual_vls) {
1905                         int sz = roundup_pow_of_two(vl_scontexts[i]);
1906
1907                         /* only allocate once */
1908                         newmap->map[i] = kzalloc(sizeof(*newmap->map[i]) +
1909                                                  sz * sizeof(struct
1910                                                              send_context *),
1911                                                  GFP_KERNEL);
1912                         if (!newmap->map[i])
1913                                 goto bail;
1914                         newmap->map[i]->mask = (1 << ilog2(sz)) - 1;
1915                         /*
1916                          * assign send contexts and
1917                          * adjust credit return threshold
1918                          */
1919                         for (j = 0; j < sz; j++) {
1920                                 if (dd->kernel_send_context[scontext]) {
1921                                         newmap->map[i]->ksc[j] =
1922                                         dd->kernel_send_context[scontext];
1923                                         set_threshold(dd, scontext, i);
1924                                 }
1925                                 if (++scontext >= first_scontext +
1926                                                   vl_scontexts[i])
1927                                         /* wrap back to first send context */
1928                                         scontext = first_scontext;
1929                         }
1930                 } else {
1931                         /* just re-use entry without allocating */
1932                         newmap->map[i] = newmap->map[i % num_vls];
1933                 }
1934                 scontext = first_scontext + vl_scontexts[i];
1935         }
1936         /* newmap in hand, save old map */
1937         spin_lock_irq(&dd->pio_map_lock);
1938         oldmap = rcu_dereference_protected(dd->pio_map,
1939                                            lockdep_is_held(&dd->pio_map_lock));
1940
1941         /* publish newmap */
1942         rcu_assign_pointer(dd->pio_map, newmap);
1943
1944         spin_unlock_irq(&dd->pio_map_lock);
1945         /* success, free any old map after grace period */
1946         if (oldmap)
1947                 call_rcu(&oldmap->list, pio_map_rcu_callback);
1948         return 0;
1949 bail:
1950         /* free any partial allocation */
1951         pio_map_free(newmap);
1952         return -ENOMEM;
1953 }
1954
1955 void free_pio_map(struct hfi1_devdata *dd)
1956 {
1957         /* Free PIO map if allocated */
1958         if (rcu_access_pointer(dd->pio_map)) {
1959                 spin_lock_irq(&dd->pio_map_lock);
1960                 pio_map_free(rcu_access_pointer(dd->pio_map));
1961                 RCU_INIT_POINTER(dd->pio_map, NULL);
1962                 spin_unlock_irq(&dd->pio_map_lock);
1963                 synchronize_rcu();
1964         }
1965         kfree(dd->kernel_send_context);
1966         dd->kernel_send_context = NULL;
1967 }
1968
1969 int init_pervl_scs(struct hfi1_devdata *dd)
1970 {
1971         int i;
1972         u64 mask, all_vl_mask = (u64)0x80ff; /* VLs 0-7, 15 */
1973         u64 data_vls_mask = (u64)0x00ff; /* VLs 0-7 */
1974         u32 ctxt;
1975         struct hfi1_pportdata *ppd = dd->pport;
1976
1977         dd->vld[15].sc = sc_alloc(dd, SC_VL15,
1978                                   dd->rcd[0]->rcvhdrqentsize, dd->node);
1979         if (!dd->vld[15].sc)
1980                 return -ENOMEM;
1981
1982         hfi1_init_ctxt(dd->vld[15].sc);
1983         dd->vld[15].mtu = enum_to_mtu(OPA_MTU_2048);
1984
1985         dd->kernel_send_context = kzalloc_node(dd->num_send_contexts *
1986                                         sizeof(struct send_context *),
1987                                         GFP_KERNEL, dd->node);
1988         if (!dd->kernel_send_context)
1989                 goto freesc15;
1990
1991         dd->kernel_send_context[0] = dd->vld[15].sc;
1992
1993         for (i = 0; i < num_vls; i++) {
1994                 /*
1995                  * Since this function does not deal with a specific
1996                  * receive context but we need the RcvHdrQ entry size,
1997                  * use the size from rcd[0]. It is guaranteed to be
1998                  * valid at this point and will remain the same for all
1999                  * receive contexts.
2000                  */
2001                 dd->vld[i].sc = sc_alloc(dd, SC_KERNEL,
2002                                          dd->rcd[0]->rcvhdrqentsize, dd->node);
2003                 if (!dd->vld[i].sc)
2004                         goto nomem;
2005                 dd->kernel_send_context[i + 1] = dd->vld[i].sc;
2006                 hfi1_init_ctxt(dd->vld[i].sc);
2007                 /* non VL15 start with the max MTU */
2008                 dd->vld[i].mtu = hfi1_max_mtu;
2009         }
2010         for (i = num_vls; i < INIT_SC_PER_VL * num_vls; i++) {
2011                 dd->kernel_send_context[i + 1] =
2012                 sc_alloc(dd, SC_KERNEL, dd->rcd[0]->rcvhdrqentsize, dd->node);
2013                 if (!dd->kernel_send_context[i + 1])
2014                         goto nomem;
2015                 hfi1_init_ctxt(dd->kernel_send_context[i + 1]);
2016         }
2017
2018         sc_enable(dd->vld[15].sc);
2019         ctxt = dd->vld[15].sc->hw_context;
2020         mask = all_vl_mask & ~(1LL << 15);
2021         write_kctxt_csr(dd, ctxt, SC(CHECK_VL), mask);
2022         dd_dev_info(dd,
2023                     "Using send context %u(%u) for VL15\n",
2024                     dd->vld[15].sc->sw_index, ctxt);
2025
2026         for (i = 0; i < num_vls; i++) {
2027                 sc_enable(dd->vld[i].sc);
2028                 ctxt = dd->vld[i].sc->hw_context;
2029                 mask = all_vl_mask & ~(data_vls_mask);
2030                 write_kctxt_csr(dd, ctxt, SC(CHECK_VL), mask);
2031         }
2032         for (i = num_vls; i < INIT_SC_PER_VL * num_vls; i++) {
2033                 sc_enable(dd->kernel_send_context[i + 1]);
2034                 ctxt = dd->kernel_send_context[i + 1]->hw_context;
2035                 mask = all_vl_mask & ~(data_vls_mask);
2036                 write_kctxt_csr(dd, ctxt, SC(CHECK_VL), mask);
2037         }
2038
2039         if (pio_map_init(dd, ppd->port - 1, num_vls, NULL))
2040                 goto nomem;
2041         return 0;
2042
2043 nomem:
2044         for (i = 0; i < num_vls; i++) {
2045                 sc_free(dd->vld[i].sc);
2046                 dd->vld[i].sc = NULL;
2047         }
2048
2049         for (i = num_vls; i < INIT_SC_PER_VL * num_vls; i++)
2050                 sc_free(dd->kernel_send_context[i + 1]);
2051
2052         kfree(dd->kernel_send_context);
2053         dd->kernel_send_context = NULL;
2054
2055 freesc15:
2056         sc_free(dd->vld[15].sc);
2057         return -ENOMEM;
2058 }
2059
2060 int init_credit_return(struct hfi1_devdata *dd)
2061 {
2062         int ret;
2063         int i;
2064
2065         dd->cr_base = kcalloc(
2066                 node_affinity.num_possible_nodes,
2067                 sizeof(struct credit_return_base),
2068                 GFP_KERNEL);
2069         if (!dd->cr_base) {
2070                 ret = -ENOMEM;
2071                 goto done;
2072         }
2073         for_each_node_with_cpus(i) {
2074                 int bytes = TXE_NUM_CONTEXTS * sizeof(struct credit_return);
2075
2076                 set_dev_node(&dd->pcidev->dev, i);
2077                 dd->cr_base[i].va = dma_zalloc_coherent(
2078                                         &dd->pcidev->dev,
2079                                         bytes,
2080                                         &dd->cr_base[i].dma,
2081                                         GFP_KERNEL);
2082                 if (!dd->cr_base[i].va) {
2083                         set_dev_node(&dd->pcidev->dev, dd->node);
2084                         dd_dev_err(dd,
2085                                    "Unable to allocate credit return DMA range for NUMA %d\n",
2086                                    i);
2087                         ret = -ENOMEM;
2088                         goto done;
2089                 }
2090         }
2091         set_dev_node(&dd->pcidev->dev, dd->node);
2092
2093         ret = 0;
2094 done:
2095         return ret;
2096 }
2097
2098 void free_credit_return(struct hfi1_devdata *dd)
2099 {
2100         int i;
2101
2102         if (!dd->cr_base)
2103                 return;
2104         for (i = 0; i < node_affinity.num_possible_nodes; i++) {
2105                 if (dd->cr_base[i].va) {
2106                         dma_free_coherent(&dd->pcidev->dev,
2107                                           TXE_NUM_CONTEXTS *
2108                                           sizeof(struct credit_return),
2109                                           dd->cr_base[i].va,
2110                                           dd->cr_base[i].dma);
2111                 }
2112         }
2113         kfree(dd->cr_base);
2114         dd->cr_base = NULL;
2115 }