kernel/sched/membarrier.c ("sched: membarrier: cover kthread_use_mm (v4)")
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2010-2017 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
 *
 * membarrier system call
 */
#include "sched.h"

/*
 * Bitmask made from an OR of all commands within enum membarrier_cmd,
 * except MEMBARRIER_CMD_QUERY.
 */
#ifdef CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE
#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK			\
	(MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE			\
	| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE)
#else
#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK	0
#endif

#ifdef CONFIG_RSEQ
#define MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ_BITMASK			\
	(MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ				\
	| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ)
#else
#define MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ_BITMASK	0
#endif

#define MEMBARRIER_CMD_BITMASK						\
	(MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED	\
	| MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED			\
	| MEMBARRIER_CMD_PRIVATE_EXPEDITED				\
	| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED			\
	| MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK		\
	| MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ_BITMASK)

static void ipi_mb(void *info)
{
	smp_mb();	/* IPIs should be serializing, but be paranoid. */
}

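/*
 * IPI handler for MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ: flag an rseq
 * preemption event so the interrupted thread restarts its rseq critical
 * section (if any) before returning to userspace.
 */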
static void ipi_rseq(void *info)
{
	rseq_preempt(current);
}

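/*
 * IPI handler used when registering a process: copy the membarrier state of
 * the mm passed as @info into this CPU's runqueue, but only if this CPU is
 * currently running a thread of that mm.
 */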
static void ipi_sync_rq_state(void *info)
{
	struct mm_struct *mm = (struct mm_struct *) info;

	if (current->mm != mm)
		return;
	this_cpu_write(runqueues.membarrier_state,
		       atomic_read(&mm->membarrier_state));
	/*
	 * Issue a memory barrier after setting
	 * MEMBARRIER_STATE_GLOBAL_EXPEDITED in the current runqueue to
	 * guarantee that no memory access following registration is reordered
	 * before registration.
	 */
	smp_mb();
}

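/*
 * Called at exec time: membarrier registration does not carry over exec, so
 * clear the mm's membarrier state as well as the cached copy in the caller
 * CPU's runqueue.
 */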
void membarrier_exec_mmap(struct mm_struct *mm)
{
	/*
	 * Issue a memory barrier before clearing membarrier_state to
	 * guarantee that no memory access prior to exec is reordered after
	 * clearing this state.
	 */
	smp_mb();
	atomic_set(&mm->membarrier_state, 0);
	/*
	 * Keep the runqueue membarrier_state in sync with this mm
	 * membarrier_state.
	 */
	this_cpu_write(runqueues.membarrier_state, 0);
}

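/*
 * Keep the caller CPU's runqueue membarrier_state in sync when current's mm
 * changes outside of a regular context switch, e.g. from kthread_use_mm(),
 * kthread_unuse_mm() and exec.
 */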
void membarrier_update_current_mm(struct mm_struct *next_mm)
{
	struct rq *rq = this_rq();
	int membarrier_state = 0;

	if (next_mm)
		membarrier_state = atomic_read(&next_mm->membarrier_state);
	if (READ_ONCE(rq->membarrier_state) == membarrier_state)
		return;
	WRITE_ONCE(rq->membarrier_state, membarrier_state);
}

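/*
 * MEMBARRIER_CMD_GLOBAL_EXPEDITED: IPI every online CPU currently running a
 * userspace thread whose process registered for global expedited membarrier,
 * so that each such thread observes a full memory barrier before this system
 * call returns.
 */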
static int membarrier_global_expedited(void)
{
	int cpu;
	cpumask_var_t tmpmask;

	if (num_online_cpus() == 1)
		return 0;

	/*
	 * Matches memory barriers around rq->curr modification in
	 * scheduler.
	 */
	smp_mb();	/* system call entry is not a mb. */

	if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
		return -ENOMEM;

	cpus_read_lock();
	rcu_read_lock();
	for_each_online_cpu(cpu) {
		struct task_struct *p;

		/*
		 * Skipping the current CPU is OK even though we can be
		 * migrated at any point. The current CPU, at the point
		 * where we read raw_smp_processor_id(), is ensured to
		 * be in program order with respect to the caller
		 * thread. Therefore, we can skip this CPU from the
		 * iteration.
		 */
		if (cpu == raw_smp_processor_id())
			continue;

		if (!(READ_ONCE(cpu_rq(cpu)->membarrier_state) &
		    MEMBARRIER_STATE_GLOBAL_EXPEDITED))
			continue;

		/*
		 * Skip the CPU if it runs a kernel thread which is not using
		 * a task mm.
		 */
		p = rcu_dereference(cpu_rq(cpu)->curr);
		if (!p->mm)
			continue;

		__cpumask_set_cpu(cpu, tmpmask);
	}
	rcu_read_unlock();

	preempt_disable();
	smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
	preempt_enable();

	free_cpumask_var(tmpmask);
	cpus_read_unlock();

	/*
	 * Memory barrier on the caller thread _after_ we finished
	 * waiting for the last IPI. Matches memory barriers around
	 * rq->curr modification in scheduler.
	 */
	smp_mb();	/* exit from system call is not a mb */
	return 0;
}

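/*
 * MEMBARRIER_CMD_PRIVATE_EXPEDITED{,_SYNC_CORE,_RSEQ}: IPI the CPUs (or only
 * the CPU selected by @cpu_id) currently running threads of the caller's mm,
 * provided the process registered for the matching command beforehand.
 */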
static int membarrier_private_expedited(int flags, int cpu_id)
{
	cpumask_var_t tmpmask;
	struct mm_struct *mm = current->mm;
	smp_call_func_t ipi_func = ipi_mb;

	if (flags == MEMBARRIER_FLAG_SYNC_CORE) {
		if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
			return -EINVAL;
		if (!(atomic_read(&mm->membarrier_state) &
		      MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY))
			return -EPERM;
	} else if (flags == MEMBARRIER_FLAG_RSEQ) {
		if (!IS_ENABLED(CONFIG_RSEQ))
			return -EINVAL;
		if (!(atomic_read(&mm->membarrier_state) &
		      MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY))
			return -EPERM;
		ipi_func = ipi_rseq;
	} else {
		WARN_ON_ONCE(flags);
		if (!(atomic_read(&mm->membarrier_state) &
		      MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY))
			return -EPERM;
	}

	if (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1)
		return 0;

	/*
	 * Matches memory barriers around rq->curr modification in
	 * scheduler.
	 */
	smp_mb();	/* system call entry is not a mb. */

	if (cpu_id < 0 && !zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
		return -ENOMEM;

	cpus_read_lock();

	if (cpu_id >= 0) {
		struct task_struct *p;

		if (cpu_id >= nr_cpu_ids || !cpu_online(cpu_id))
			goto out;
		if (cpu_id == raw_smp_processor_id())
			goto out;
		rcu_read_lock();
		p = rcu_dereference(cpu_rq(cpu_id)->curr);
		if (!p || p->mm != mm) {
			rcu_read_unlock();
			goto out;
		}
		rcu_read_unlock();
	} else {
		int cpu;

		rcu_read_lock();
		for_each_online_cpu(cpu) {
			struct task_struct *p;

			/*
			 * Skipping the current CPU is OK even though we can be
			 * migrated at any point. The current CPU, at the point
			 * where we read raw_smp_processor_id(), is ensured to
			 * be in program order with respect to the caller
			 * thread. Therefore, we can skip this CPU from the
			 * iteration.
			 */
			if (cpu == raw_smp_processor_id())
				continue;
			p = rcu_dereference(cpu_rq(cpu)->curr);
			if (p && p->mm == mm)
				__cpumask_set_cpu(cpu, tmpmask);
		}
		rcu_read_unlock();
	}

	preempt_disable();
	if (cpu_id >= 0)
		smp_call_function_single(cpu_id, ipi_func, NULL, 1);
	else
		smp_call_function_many(tmpmask, ipi_func, NULL, 1);
	preempt_enable();

out:
	if (cpu_id < 0)
		free_cpumask_var(tmpmask);
	cpus_read_unlock();

	/*
	 * Memory barrier on the caller thread _after_ we finished
	 * waiting for the last IPI. Matches memory barriers around
	 * rq->curr modification in scheduler.
	 */
	smp_mb();	/* exit from system call is not a mb */

	return 0;
}

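/*
 * Propagate @mm's membarrier state to the runqueue of every CPU currently
 * running one of its threads, so that concurrently running threads of @mm
 * observe the newly registered state.
 */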
static int sync_runqueues_membarrier_state(struct mm_struct *mm)
{
	int membarrier_state = atomic_read(&mm->membarrier_state);
	cpumask_var_t tmpmask;
	int cpu;

	if (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1) {
		this_cpu_write(runqueues.membarrier_state, membarrier_state);

		/*
		 * For a single mm user, we can simply issue a memory barrier
		 * after setting MEMBARRIER_STATE_GLOBAL_EXPEDITED in the
		 * mm and in the current runqueue to guarantee that no memory
		 * access following registration is reordered before
		 * registration.
		 */
		smp_mb();
		return 0;
	}

	if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
		return -ENOMEM;

	/*
	 * For an mm with multiple users, we need to ensure all future
	 * scheduler executions will observe @mm's new membarrier
	 * state.
	 */
	synchronize_rcu();

	/*
	 * For each cpu runqueue, if the task's mm matches @mm, ensure that all
	 * of @mm's membarrier state set bits are also set in the runqueue's
	 * membarrier state. This ensures that a runqueue scheduling
	 * between threads which are users of @mm has its membarrier state
	 * updated.
	 */
	cpus_read_lock();
	rcu_read_lock();
	for_each_online_cpu(cpu) {
		struct rq *rq = cpu_rq(cpu);
		struct task_struct *p;

		p = rcu_dereference(rq->curr);
		if (p && p->mm == mm)
			__cpumask_set_cpu(cpu, tmpmask);
	}
	rcu_read_unlock();

	preempt_disable();
	smp_call_function_many(tmpmask, ipi_sync_rq_state, mm, 1);
	preempt_enable();

	free_cpumask_var(tmpmask);
	cpus_read_unlock();

	return 0;
}

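/*
 * MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED: register the current process so
 * that later MEMBARRIER_CMD_GLOBAL_EXPEDITED calls, issued by any process,
 * cover its running threads.
 */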
static int membarrier_register_global_expedited(void)
{
	struct task_struct *p = current;
	struct mm_struct *mm = p->mm;
	int ret;

	if (atomic_read(&mm->membarrier_state) &
	    MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY)
		return 0;
	atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED, &mm->membarrier_state);
	ret = sync_runqueues_membarrier_state(mm);
	if (ret)
		return ret;
	atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY,
		  &mm->membarrier_state);

	return 0;
}

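/*
 * MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED{,_SYNC_CORE,_RSEQ}: register the
 * current process (its mm, shared with all its threads and with CLONE_VM
 * users) for the corresponding private expedited command.
 */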
static int membarrier_register_private_expedited(int flags)
{
	struct task_struct *p = current;
	struct mm_struct *mm = p->mm;
	int ready_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY,
	    set_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED,
	    ret;

	if (flags == MEMBARRIER_FLAG_SYNC_CORE) {
		if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
			return -EINVAL;
		ready_state =
			MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY;
	} else if (flags == MEMBARRIER_FLAG_RSEQ) {
		if (!IS_ENABLED(CONFIG_RSEQ))
			return -EINVAL;
		ready_state =
			MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY;
	} else {
		WARN_ON_ONCE(flags);
	}

	/*
	 * We need to consider threads belonging to different thread
	 * groups, which use the same mm. (CLONE_VM but not
	 * CLONE_THREAD).
	 */
	if ((atomic_read(&mm->membarrier_state) & ready_state) == ready_state)
		return 0;
	if (flags & MEMBARRIER_FLAG_SYNC_CORE)
		set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE;
	if (flags & MEMBARRIER_FLAG_RSEQ)
		set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ;
	atomic_or(set_state, &mm->membarrier_state);
	ret = sync_runqueues_membarrier_state(mm);
	if (ret)
		return ret;
	atomic_or(ready_state, &mm->membarrier_state);

	return 0;
}

/**
 * sys_membarrier - issue memory barriers on a set of threads
 * @cmd:    Takes command values defined in enum membarrier_cmd.
 * @flags:  Currently needs to be 0 for all commands other than
 *          MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ: in the latter
 *          case it can be MEMBARRIER_CMD_FLAG_CPU, indicating that @cpu_id
 *          contains the CPU on which to interrupt (= restart)
 *          the RSEQ critical section.
 * @cpu_id: if @flags == MEMBARRIER_CMD_FLAG_CPU, indicates the CPU on which
 *          the RSEQ critical section should be interrupted (@cmd must be
 *          MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ).
 *
 * If this system call is not implemented, -ENOSYS is returned. If the
 * command specified does not exist, is not available on the running
 * kernel, or if the command argument is invalid, this system call
 * returns -EINVAL. For a given command, with the flags argument set to 0,
 * if this system call returns -ENOSYS or -EINVAL, it is guaranteed to
 * always return the same value until reboot. In addition, it can return
 * -ENOMEM if there is not enough memory available to perform the system
 * call.
 *
 * All memory accesses performed in program order from each targeted thread
 * are guaranteed to be ordered with respect to sys_membarrier(). If we use
 * the semantic "barrier()" to represent a compiler barrier forcing memory
 * accesses to be performed in program order across the barrier, and
 * smp_mb() to represent explicit memory barriers forcing full memory
 * ordering across the barrier, we have the following ordering table for
 * each pair of barrier(), sys_membarrier() and smp_mb():
 *
 * The pair ordering is detailed as (O: ordered, X: not ordered):
 *
 *                        barrier()   smp_mb() sys_membarrier()
 *        barrier()          X           X            O
 *        smp_mb()           X           O            O
 *        sys_membarrier()   O           O            O
 */
SYSCALL_DEFINE3(membarrier, int, cmd, unsigned int, flags, int, cpu_id)
{
	switch (cmd) {
	case MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ:
		if (unlikely(flags && flags != MEMBARRIER_CMD_FLAG_CPU))
			return -EINVAL;
		break;
	default:
		if (unlikely(flags))
			return -EINVAL;
	}

	if (!(flags & MEMBARRIER_CMD_FLAG_CPU))
		cpu_id = -1;

	switch (cmd) {
	case MEMBARRIER_CMD_QUERY:
	{
		int cmd_mask = MEMBARRIER_CMD_BITMASK;

		if (tick_nohz_full_enabled())
			cmd_mask &= ~MEMBARRIER_CMD_GLOBAL;
		return cmd_mask;
	}
	case MEMBARRIER_CMD_GLOBAL:
		/* MEMBARRIER_CMD_GLOBAL is not compatible with nohz_full. */
		if (tick_nohz_full_enabled())
			return -EINVAL;
		if (num_online_cpus() > 1)
			synchronize_rcu();
		return 0;
	case MEMBARRIER_CMD_GLOBAL_EXPEDITED:
		return membarrier_global_expedited();
	case MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED:
		return membarrier_register_global_expedited();
	case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
		return membarrier_private_expedited(0, cpu_id);
	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED:
		return membarrier_register_private_expedited(0);
	case MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE:
		return membarrier_private_expedited(MEMBARRIER_FLAG_SYNC_CORE, cpu_id);
	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE:
		return membarrier_register_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);
	case MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ:
		return membarrier_private_expedited(MEMBARRIER_FLAG_RSEQ, cpu_id);
	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ:
		return membarrier_register_private_expedited(MEMBARRIER_FLAG_RSEQ);
	default:
		return -EINVAL;
	}
}
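
/*
 * Typical userspace usage, as an illustrative sketch only (the wrapper and
 * function names below are not part of any API, and error handling is
 * omitted): register once for private expedited membarrier, then issue
 * expedited barriers on the fast path.
 *
 *	#include <linux/membarrier.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	static int membarrier_syscall(int cmd, unsigned int flags, int cpu_id)
 *	{
 *		return syscall(__NR_membarrier, cmd, flags, cpu_id);
 *	}
 *
 *	void init_once(void)
 *	{
 *		// Slow path, once per process: register the mm.
 *		membarrier_syscall(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, 0, 0);
 *	}
 *
 *	void fast_path_barrier(void)
 *	{
 *		// Full barrier on every CPU running a thread of this mm.
 *		membarrier_syscall(MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0, 0);
 *	}
 */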