/*
 * Copyright (c) 2014 Nicira, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* This header implements atomic operation primitives on 32-bit 586+ with
 * GCC. */
#ifndef IN_OVS_ATOMIC_H
#error "This header should only be included indirectly via ovs-atomic.h."
#endif

#define OVS_ATOMIC_I586_IMPL 1

/*
 * These assumptions have been adapted from the x86_64 memory model:
 *
 * - 1, 2, and 4 byte loads and stores are atomic on aligned memory.
 * - Loads are not reordered with other loads.
 * - Stores are not reordered with OLDER loads.
 * - Loads may be reordered with OLDER stores to a different memory location,
 *   but not with OLDER stores to the same memory location.
 * - Stores are not reordered with other stores, except maybe for special
 *   instructions not emitted by compilers, or by the stores performed by
 *   a single fast string operation (e.g., "stos").  As long as the atomic
 *   stores are not combined with any other stores, even the allowed
 *   reordering of the stores by a single fast string operation is not a
 *   problem.
 * - Neither loads nor stores are reordered with locked instructions.
 * - Stores by a single processor are observed in the same order by all
 *   processors.
 * - (Unlocked) Stores from different processors are NOT ordered.
 * - Memory ordering obeys causality (memory ordering respects transitive
 *   visibility).
 * - Any two stores are seen in a consistent order by processors other than
 *   those performing the stores.
 * - Locked instructions have total order.
 *
 * These rules imply that:
 *
 * - Locked instructions are not needed for aligned loads or stores to make
 *   them atomic for sizes up to 4 bytes.  8-byte objects need locked
 *   instructions.
 * - All stores have release semantics; none of the preceding stores or loads
 *   can be reordered with following stores.  Following loads could still be
 *   reordered to happen before the store, but that is not a violation of the
 *   release semantics.
 * - All loads from a given memory location have acquire semantics with
 *   respect to the stores on the same memory location; none of the following
 *   loads or stores can be reordered with the load.  Preceding stores to a
 *   different memory location MAY be reordered with the load, but that is not
 *   a violation of the acquire semantics (i.e., the loads and stores of two
 *   critical sections guarded by a different memory location can overlap).
 * - Locked instructions serve as CPU memory barriers by themselves.
 * - Locked stores implement the sequential consistency memory order.  Using
 *   locked instructions when seq_cst memory order is requested allows normal
 *   loads to observe the stores in the same (total) order without using a
 *   CPU memory barrier after the loads.
 *
 * NOTE: Some older AMD Opteron processors have a bug that violates the
 * acquire semantics described above.  The bug manifests as an unlocked
 * read-modify-write operation following a "semaphore operation" operating
 * on data that existed before entering the critical section; i.e., the
 * preceding "semaphore operation" fails to function as an acquire barrier.
 * The affected CPUs are AMD family 15, models 32 to 63.
 *
 * Ref. http://support.amd.com/TechDocs/25759.pdf errata #147.
 */
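
/* Illustration (a hedged sketch; 'data' and 'ready' are hypothetical
 * variables) of the message-passing pattern that the rules above make safe
 * without locked instructions:
 *
 *     CPU 0                          CPU 1
 *     data = 42;   (plain store)     while (ready == 0) { }  (plain load)
 *     ready = 1;   (plain store)     r = data;               (plain load)
 *
 * Stores are not reordered with other stores, and loads are not reordered
 * with other loads, so once CPU 1 observes ready == 1 it must also observe
 * r == 42. */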
78 | ||
79 | /* Barriers. */ | |
80 | ||
81 | #define compiler_barrier() asm volatile(" " : : : "memory") | |
82 | #define cpu_barrier() asm volatile("lock; addl $0,(%%esp)" ::: "memory", "cc") | |
83 | ||
84 | /* | |
85 | * The 'volatile' keyword prevents the compiler from keeping the atomic | |
86 | * value in a register, and generates a new memory access for each atomic | |
87 | * operation. This allows the implementations of memory_order_relaxed and | |
88 | * memory_order_consume to avoid issuing a compiler memory barrier, allowing | |
89 | * full optimization of all surrounding non-atomic variables. | |
90 | * | |
91 | * The placement of the 'volatile' keyword after the 'TYPE' below is highly | |
92 | * significant when the TYPE is a pointer type. In that case we want the | |
93 | * pointer to be declared volatile, not the data type that is being pointed | |
94 | * at! | |
55eebc01 JR |
95 | * |
96 | * Attribute aligned is used to tell the compiler to align 64-bit data | |
97 | * on a 8-byte boundary. This allows more efficient atomic access, as the | |
98 | * the CPU guarantees such memory accesses to be atomic. */ | |
99 | #define ATOMIC(TYPE) TYPE volatile __attribute__((aligned(sizeof(TYPE)))) | |
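
/* Usage sketch for ATOMIC(TYPE); the names are hypothetical:
 *
 *     static ATOMIC(uint64_t) counter;   - 64-bit, 8-byte aligned.
 *     static ATOMIC(struct node *) head; - the pointer itself is volatile,
 *                                          not the struct it points to.
 *
 * Because 'volatile' follows TYPE in the expansion, the second declaration
 * becomes "struct node * volatile", i.e., a volatile pointer. */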

/* Memory ordering.  Must be passed in as a constant. */
typedef enum {
    memory_order_relaxed,
    memory_order_consume,
    memory_order_acquire,
    memory_order_release,
    memory_order_acq_rel,
    memory_order_seq_cst
} memory_order;
\f
#define ATOMIC_BOOL_LOCK_FREE 2
#define ATOMIC_CHAR_LOCK_FREE 2
#define ATOMIC_SHORT_LOCK_FREE 2
#define ATOMIC_INT_LOCK_FREE 2
#define ATOMIC_LONG_LOCK_FREE 2
#define ATOMIC_LLONG_LOCK_FREE 2
#define ATOMIC_POINTER_LOCK_FREE 2

#define IS_LOCKLESS_ATOMIC(OBJECT) \
    (sizeof(OBJECT) <= 8 && IS_POW2(sizeof(OBJECT)))
\f
#define ATOMIC_VAR_INIT(VALUE) VALUE
#define atomic_init(OBJECT, VALUE) (*(OBJECT) = (VALUE), (void) 0)
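
/* A minimal initialization sketch (hypothetical helper; <stdint.h> is
 * assumed to have been included by ovs-atomic.h).  Static initialization
 * would instead use, e.g., "static ATOMIC(uint32_t) flag =
 * ATOMIC_VAR_INIT(0);". */
static inline void
example_atomic_init__(ATOMIC(uint32_t) *flag)
{
    atomic_init(flag, 0);    /* Plain store; implies no memory ordering. */
}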
124 | ||
125 | /* | |
126 | * The memory_model_relaxed does not need a compiler barrier, if the | |
127 | * atomic operation can otherwise be guaranteed to not be moved with | |
128 | * respect to other atomic operations on the same memory location. Using | |
129 | * the 'volatile' keyword in the definition of the atomic types | |
130 | * accomplishes this, as memory accesses to volatile data may not be | |
131 | * optimized away, or be reordered with other volatile accesses. | |
132 | * | |
133 | * On x86 also memory_order_consume is automatic, and data dependency on a | |
134 | * volatile atomic variable means that the compiler optimizations should not | |
135 | * cause problems. That is, the compiler should not speculate the value of | |
136 | * the atomic_read, as it is going to read it from the memory anyway. | |
137 | * This allows omiting the compiler memory barrier on atomic_reads with | |
138 | * memory_order_consume. This matches the definition of | |
139 | * smp_read_barrier_depends() in Linux kernel as a nop for x86, and its usage | |
140 | * in rcu_dereference(). | |
141 | * | |
142 | * We use this same logic below to choose inline assembly statements with or | |
143 | * without a compiler memory barrier. | |
144 | */ | |
145 | static inline void | |
146 | atomic_compiler_barrier(memory_order order) | |
147 | { | |
148 | if (order > memory_order_consume) { | |
149 | compiler_barrier(); | |
150 | } | |
151 | } | |
152 | ||
153 | static inline void | |
154 | atomic_thread_fence(memory_order order) | |
155 | { | |
156 | if (order == memory_order_seq_cst) { | |
157 | cpu_barrier(); | |
158 | } else { | |
159 | atomic_compiler_barrier(order); | |
160 | } | |
161 | } | |
162 | ||
163 | static inline void | |
164 | atomic_signal_fence(memory_order order) | |
165 | { | |
166 | atomic_compiler_barrier(order); | |
167 | } | |
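
/* Hedged usage sketch for the fences (hypothetical Dekker-style protocol;
 * 'me' and 'other' are this thread's and the peer's intent flags).  The
 * seq_cst fence keeps the load of '*other' from being reordered with the
 * OLDER store to '*me', which plain x86 stores and loads would allow: */
static inline bool
example_dekker_enter__(ATOMIC(int) *me, ATOMIC(int) *other)
{
    *me = 1;                                   /* Atomic aligned store. */
    atomic_thread_fence(memory_order_seq_cst); /* Locked insn: full barrier. */
    return *other == 0;                        /* Atomic aligned load. */
}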
168 | ||
169 | #define atomic_is_lock_free(OBJ) \ | |
170 | ((void) *(OBJ), \ | |
171 | IS_LOCKLESS_ATOMIC(*(OBJ)) ? 2 : 0) | |
172 | ||
173 | /* The 8-byte atomic exchange uses cmpxchg8b with the SRC (ax:dx) as | |
174 | * the expected value (bx:cx), which will get replaced by the current | |
175 | * value in the likely case it did not match, after which we keep | |
176 | * trying until the swap succeeds. */ | |
177 | ||
178 | #if defined(__PIC__) | |
179 | /* ebx may not be clobbered when compiled with -fPIC, must save and | |
180 | * restore it. Furthermore, 'DST' may be addressed via ebx, so the | |
181 | * address must be passed via a register so that it remains valid also | |
182 | * after changing ebx. */ | |
183 | #define atomic_exchange_8__(DST, SRC, CLOB) \ | |
184 | uint32_t temp____; \ | |
185 | \ | |
186 | asm volatile(" movl %%ebx,%2 ; " \ | |
187 | " movl %%eax,%%ebx ; " \ | |
188 | " movl %%edx,%%ecx ; " \ | |
189 | "1: " \ | |
190 | "lock; cmpxchg8b (%0); " \ | |
191 | " jne 1b ; " \ | |
192 | " movl %2,%%ebx ; " \ | |
193 | " # atomic_exchange_8__ " \ | |
194 | : "+r" (DST), /* 0 */ \ | |
195 | "+A" (SRC), /* 1 */ \ | |
196 | "=mr" (temp____) /* 2 */ \ | |
197 | :: "ecx", CLOB, "cc") | |
198 | ||
199 | #else | |
200 | #define atomic_exchange_8__(DST, SRC, CLOB) \ | |
201 | asm volatile(" movl %%eax,%%ebx ; " \ | |
202 | " movl %%edx,%%ecx ; " \ | |
203 | "1: " \ | |
204 | "lock; cmpxchg8b %0 ; " \ | |
205 | " jne 1b ; " \ | |
206 | " # atomic_exchange_8__ " \ | |
207 | : "+m" (*DST), /* 0 */ \ | |
208 | "+A" (SRC) /* 1 */ \ | |
209 | :: "ebx", "ecx", CLOB, "cc") | |
210 | #endif | |
211 | ||
212 | #define atomic_exchange__(DST, SRC, ORDER) \ | |
213 | ({ \ | |
214 | typeof(DST) dst___ = (DST); \ | |
0b83904f | 215 | typeof(*(DST)) src___ = (SRC); \ |
105a9298 JR |
216 | \ |
217 | if ((ORDER) > memory_order_consume) { \ | |
0b83904f | 218 | if (sizeof(*(DST)) == 8) { \ |
105a9298 JR |
219 | atomic_exchange_8__(dst___, src___, "memory"); \ |
220 | } else { \ | |
221 | asm volatile("xchg %1,%0 ; " \ | |
222 | "# atomic_exchange__" \ | |
223 | : "+r" (src___), /* 0 */ \ | |
224 | "+m" (*dst___) /* 1 */ \ | |
225 | :: "memory"); \ | |
226 | } \ | |
227 | } else { \ | |
0b83904f | 228 | if (sizeof(*(DST)) == 8) { \ |
105a9298 JR |
229 | atomic_exchange_8__(dst___, src___, "cc"); \ |
230 | } else { \ | |
231 | asm volatile("xchg %1,%0 ; " \ | |
232 | "# atomic_exchange__" \ | |
233 | : "+r" (src___), /* 0 */ \ | |
234 | "+m" (*dst___)); /* 1 */ \ | |
235 | } \ | |
236 | } \ | |
237 | src___; \ | |
238 | }) | |
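
/* Hedged usage sketch: atomic_exchange__ as a 64-bit swap (hypothetical
 * helper; the 8-byte case exercises the cmpxchg8b loop above): */
static inline uint64_t
example_swap_u64__(ATOMIC(uint64_t) *dst, uint64_t new_value)
{
    /* seq_cst: the locked instruction is a full barrier by itself. */
    return atomic_exchange__(dst, new_value, memory_order_seq_cst);
}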
239 | ||
55eebc01 JR |
240 | #if defined(__SSE__) |
241 | /* SSE registers are 128-bit wide, and moving the lowest 64-bits of an SSE | |
242 | * register to proerly aligned memory is atomic. See ATOMIC(TYPE) above. */ | |
243 | #define atomic_store_8__(DST, SRC) \ | |
244 | asm volatile("movq %1,%0 ; # atomic_store_8__" \ | |
245 | : "=m" (*DST) /* 0 */ \ | |
246 | : "x" (SRC)) /* 1, SSE */ | |
247 | #else | |
248 | /* Locked 64-bit exchange is available on all i586 CPUs. */ | |
249 | #define atomic_store_8__(DST, SRC) \ | |
250 | atomic_exchange_8__(DST, SRC, "cc") | |
251 | #endif | |
252 | ||
105a9298 JR |
253 | #define atomic_store_explicit(DST, SRC, ORDER) \ |
254 | ({ \ | |
255 | typeof(DST) dst__ = (DST); \ | |
0b83904f | 256 | typeof(*(DST)) src__ = (SRC); \ |
105a9298 | 257 | \ |
55eebc01 | 258 | if ((ORDER) != memory_order_seq_cst) { \ |
105a9298 | 259 | atomic_compiler_barrier(ORDER); \ |
55eebc01 JR |
260 | if (sizeof(*(DST)) == 8) { \ |
261 | atomic_store_8__(dst__, src__); \ | |
262 | } else { \ | |
263 | *dst__ = src__; \ | |
264 | } \ | |
105a9298 JR |
265 | } else { \ |
266 | atomic_exchange__(dst__, src__, ORDER); \ | |
267 | } \ | |
268 | (void) 0; \ | |
269 | }) | |
55eebc01 | 270 | #define atomic_store(DST, SRC) \ |
105a9298 JR |
271 | atomic_store_explicit(DST, SRC, memory_order_seq_cst) |
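
/* Hedged usage sketch: publishing data with a release store (hypothetical
 * names; pairs with the acquire read sketched after atomic_read below).
 * On x86 every store already has release semantics, so this emits only a
 * compiler barrier plus a plain store: */
static inline void
example_publish__(uint32_t *data, ATOMIC(uint32_t) *ready)
{
    *data = 42;    /* Plain write to the payload... */
    /* ...made visible before the flag by the release store. */
    atomic_store_explicit(ready, 1, memory_order_release);
}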
272 | ||
55eebc01 JR |
273 | #if defined(__SSE__) |
274 | /* SSE registers are 128-bit wide, and moving 64-bits from properly aligned | |
275 | * memory to an SSE register is atomic. See ATOMIC(TYPE) above. */ | |
276 | #define atomic_read_8__(SRC, DST) \ | |
277 | ({ \ | |
278 | typeof(*(DST)) res__; \ | |
279 | \ | |
280 | asm ("movq %1,%0 ; # atomic_read_8__" \ | |
281 | : "=x" (res__) /* 0, SSE. */ \ | |
282 | : "m" (*SRC)); /* 1 */ \ | |
283 | *(DST) = res__; \ | |
284 | }) | |
285 | #else | |
286 | /* Must use locked cmpxchg8b (available on all i586 CPUs) if compiled w/o sse | |
287 | * support. Compare '*DST' to a random value in bx:cx and returns the actual | |
288 | * value in ax:dx. The registers bx and cx are only read, so they are not | |
289 | * clobbered. */ | |
290 | #define atomic_read_8__(SRC, DST) \ | |
291 | ({ \ | |
292 | typeof(*(DST)) res__; \ | |
293 | \ | |
294 | asm (" movl %%ebx,%%eax ; " \ | |
295 | " movl %%ecx,%%edx ; " \ | |
296 | "lock; cmpxchg8b %1 ; " \ | |
297 | "# atomic_read_8__ " \ | |
298 | : "=&A" (res__), /* 0 */ \ | |
299 | "+m" (*SRC) /* 1 */ \ | |
300 | : : "cc"); \ | |
301 | *(DST) = res__; \ | |
302 | }) | |
303 | #endif | |
304 | ||
305 | #define atomic_read_explicit(SRC, DST, ORDER) \ | |
306 | ({ \ | |
307 | typeof(DST) dst__ = (DST); \ | |
308 | typeof(SRC) src__ = (SRC); \ | |
309 | \ | |
310 | if (sizeof(*(DST)) <= 4) { \ | |
311 | *dst__ = *src__; \ | |
312 | } else { \ | |
313 | atomic_read_8__(SRC, DST); \ | |
314 | } \ | |
315 | atomic_compiler_barrier(ORDER); \ | |
316 | (void) 0; \ | |
105a9298 | 317 | }) |
55eebc01 | 318 | #define atomic_read(SRC, DST) \ |
105a9298 JR |
319 | atomic_read_explicit(SRC, DST, memory_order_seq_cst) |
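
/* Hedged usage sketch: the consuming side of the publish example above
 * (hypothetical names).  On x86 every load already has acquire semantics,
 * so the acquire read costs only a compiler barrier: */
static inline bool
example_consume__(const uint32_t *data, ATOMIC(uint32_t) *ready,
                  uint32_t *out)
{
    uint32_t is_ready;

    atomic_read_explicit(ready, &is_ready, memory_order_acquire);
    if (!is_ready) {
        return false;
    }
    *out = *data;    /* Sees the value written before 'ready' was set. */
    return true;
}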
320 | ||
321 | #if defined(__PIC__) | |
322 | /* ebx may not be used as an input when compiled with -fPIC, must save | |
323 | * and restore it. Furthermore, 'DST' may be addressed via ebx, so | |
324 | * the address must be passed via a register so that it remains valid | |
325 | * also after changing ebx. */ | |
326 | #define atomic_compare_exchange_8__(DST, EXP, SRC, RES, CLOB) \ | |
327 | asm volatile(" xchgl %%ebx,%3 ; " \ | |
328 | "lock; cmpxchg8b (%1) ; " \ | |
329 | " xchgl %3,%%ebx ; " \ | |
330 | " sete %0 " \ | |
331 | "# atomic_compare_exchange_8__" \ | |
332 | : "=q" (RES), /* 0 */ \ | |
333 | "+r" (DST), /* 1 */ \ | |
334 | "+A" (EXP) /* 2 */ \ | |
335 | : "r" ((uint32_t)SRC), /* 3 */ \ | |
336 | "c" ((uint32_t)((uint64_t)SRC >> 32)) /* 4 */ \ | |
337 | : CLOB, "cc") | |
338 | #else | |
339 | #define atomic_compare_exchange_8__(DST, EXP, SRC, RES, CLOB) \ | |
340 | asm volatile("lock; cmpxchg8b %1 ; " \ | |
341 | " sete %0 " \ | |
342 | "# atomic_compare_exchange_8__" \ | |
343 | : "=q" (RES), /* 0 */ \ | |
344 | "+m" (*DST), /* 1 */ \ | |
345 | "+A" (EXP) /* 2 */ \ | |
346 | : "b" ((uint32_t)SRC), /* 3 */ \ | |
347 | "c" ((uint32_t)((uint64_t)SRC >> 32)) /* 4 */ \ | |
348 | : CLOB, "cc") | |
349 | #endif | |
350 | ||
351 | #define atomic_compare_exchange__(DST, EXP, SRC, RES, CLOB) \ | |
352 | asm volatile("lock; cmpxchg %3,%1 ; " \ | |
353 | " sete %0 " \ | |
354 | "# atomic_compare_exchange__" \ | |
355 | : "=q" (RES), /* 0 */ \ | |
356 | "+m" (*DST), /* 1 */ \ | |
357 | "+a" (EXP) /* 2 */ \ | |
358 | : "r" (SRC) /* 3 */ \ | |
359 | : CLOB, "cc") | |
360 | ||
361 | /* ORD_FAIL is ignored, as atomic_compare_exchange__ already implements | |
362 | * at least as strong a barrier as allowed for ORD_FAIL in all cases. */ | |
363 | #define atomic_compare_exchange_strong_explicit(DST, EXP, SRC, ORDER, ORD_FAIL) \ | |
364 | ({ \ | |
365 | typeof(DST) dst__ = (DST); \ | |
366 | typeof(DST) expp__ = (EXP); \ | |
0b83904f JR |
367 | typeof(*(DST)) src__ = (SRC); \ |
368 | typeof(*(DST)) exp__ = *expp__; \ | |
105a9298 JR |
369 | uint8_t res__; \ |
370 | (void)ORD_FAIL; \ | |
371 | \ | |
372 | if ((ORDER) > memory_order_consume) { \ | |
0b83904f | 373 | if (sizeof(*(DST)) <= 4) { \ |
105a9298 JR |
374 | atomic_compare_exchange__(dst__, exp__, src__, res__, \ |
375 | "memory"); \ | |
376 | } else { \ | |
377 | atomic_compare_exchange_8__(dst__, exp__, src__, res__, \ | |
378 | "memory"); \ | |
379 | } \ | |
380 | } else { \ | |
0b83904f | 381 | if (sizeof(*(DST)) <= 4) { \ |
105a9298 JR |
382 | atomic_compare_exchange__(dst__, exp__, src__, res__, \ |
383 | "cc"); \ | |
384 | } else { \ | |
385 | atomic_compare_exchange_8__(dst__, exp__, src__, res__, \ | |
386 | "cc"); \ | |
387 | } \ | |
388 | } \ | |
389 | if (!res__) { \ | |
390 | *expp__ = exp__; \ | |
391 | } \ | |
392 | (bool)res__; \ | |
393 | }) | |
394 | #define atomic_compare_exchange_strong(DST, EXP, SRC) \ | |
395 | atomic_compare_exchange_strong_explicit(DST, EXP, SRC, \ | |
396 | memory_order_seq_cst, \ | |
397 | memory_order_seq_cst) | |
398 | #define atomic_compare_exchange_weak \ | |
399 | atomic_compare_exchange_strong | |
400 | #define atomic_compare_exchange_weak_explicit \ | |
401 | atomic_compare_exchange_strong_explicit | |
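
/* Hedged usage sketch: a CAS retry loop (hypothetical bounded increment).
 * On this implementation "weak" is the same as "strong", so spurious
 * failures cannot occur, but the loop is written in the portable style.
 * On failure the macro reloads 'old' with the current value: */
static inline uint32_t
example_bounded_inc__(ATOMIC(uint32_t) *x, uint32_t max)
{
    uint32_t old;

    atomic_read_explicit(x, &old, memory_order_relaxed);
    while (old < max
           && !atomic_compare_exchange_weak_explicit(x, &old, old + 1,
                                                     memory_order_acq_rel,
                                                     memory_order_relaxed)) {
        /* 'old' now holds the current value; try again. */
    }
    return old;    /* Value observed before a successful increment. */
}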
402 | ||
403 | #define atomic_add__(RMW, ARG, CLOB) \ | |
404 | asm volatile("lock; xadd %0,%1 ; " \ | |
405 | "# atomic_add__ " \ | |
406 | : "+r" (ARG), /* 0 */ \ | |
407 | "+m" (*RMW) /* 1 */ \ | |
408 | :: CLOB, "cc") | |
409 | ||
410 | #define atomic_add_32__(RMW, ARG, ORIG, ORDER) \ | |
411 | ({ \ | |
412 | typeof(RMW) rmw__ = (RMW); \ | |
0b83904f | 413 | typeof(*(RMW)) arg__ = (ARG); \ |
105a9298 JR |
414 | \ |
415 | if ((ORDER) > memory_order_consume) { \ | |
416 | atomic_add__(rmw__, arg__, "memory"); \ | |
417 | } else { \ | |
418 | atomic_add__(rmw__, arg__, "cc"); \ | |
419 | } \ | |
420 | *(ORIG) = arg__; \ | |
421 | }) | |
422 | ||
423 | /* We could use simple locked instructions if the original value was not | |
424 | * needed. */ | |
425 | #define atomic_op__(RMW, OP, ARG, ORIG, ORDER) \ | |
426 | ({ \ | |
427 | typeof(RMW) rmw__ = (RMW); \ | |
428 | typeof(ARG) arg__ = (ARG); \ | |
429 | \ | |
0b83904f | 430 | typeof(*(RMW)) val__; \ |
105a9298 JR |
431 | \ |
432 | atomic_read_explicit(rmw__, &val__, memory_order_relaxed); \ | |
433 | do { \ | |
434 | } while (!atomic_compare_exchange_weak_explicit(rmw__, &val__, \ | |
435 | val__ OP arg__, \ | |
436 | ORDER, \ | |
437 | memory_order_relaxed)); \ | |
438 | *(ORIG) = val__; \ | |
439 | }) | |
440 | ||
441 | #define atomic_add_explicit(RMW, ARG, ORIG, ORDER) \ | |
0b83904f | 442 | (sizeof(*(RMW)) <= 4 \ |
105a9298 JR |
443 | ? atomic_add_32__(RMW, ARG, ORIG, ORDER) \ |
444 | : atomic_op__(RMW, +, ARG, ORIG, ORDER)) | |
445 | #define atomic_add(RMW, ARG, ORIG) \ | |
446 | atomic_add_explicit(RMW, ARG, ORIG, memory_order_seq_cst) | |
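
/* Hedged usage sketch: a relaxed statistics counter (hypothetical helper).
 * A 32-bit operand takes the single "lock; xadd" path above: */
static inline uint32_t
example_count_event__(ATOMIC(uint32_t) *counter)
{
    uint32_t orig;

    atomic_add_explicit(counter, 1, &orig, memory_order_relaxed);
    return orig;    /* Value before the increment. */
}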
447 | ||
448 | #define atomic_sub_explicit(RMW, ARG, ORIG, ORDER) \ | |
0b83904f | 449 | (sizeof(*(RMW)) <= 4 \ |
105a9298 JR |
450 | ? atomic_add_32__(RMW, -(ARG), ORIG, ORDER) \ |
451 | : atomic_op__(RMW, -, ARG, ORIG, ORDER)) | |
452 | #define atomic_sub(RMW, ARG, ORIG) \ | |
453 | atomic_sub_explicit(RMW, ARG, ORIG, memory_order_seq_cst) | |
454 | ||
455 | #define atomic_or_explicit(RMW, ARG, ORIG, ORDER) \ | |
456 | atomic_op__(RMW, |, ARG, ORIG, ORDER) | |
0b83904f | 457 | #define atomic_or(RMW, ARG, ORIG) \ |
105a9298 JR |
458 | atomic_or_explicit(RMW, ARG, ORIG, memory_order_seq_cst) |
459 | ||
460 | #define atomic_xor_explicit(RMW, ARG, ORIG, ORDER) \ | |
461 | atomic_op__(RMW, ^, ARG, ORIG, ORDER) | |
462 | #define atomic_xor(RMW, ARG, ORIG) \ | |
463 | atomic_xor_explicit(RMW, ARG, ORIG, memory_order_seq_cst) | |
464 | ||
465 | #define atomic_and_explicit(RMW, ARG, ORIG, ORDER) \ | |
466 | atomic_op__(RMW, &, ARG, ORIG, ORDER) | |
467 | #define atomic_and(RMW, ARG, ORIG) \ | |
468 | atomic_and_explicit(RMW, ARG, ORIG, memory_order_seq_cst) | |
469 | ||
470 | \f | |
471 | /* atomic_flag */ | |
472 | ||
473 | typedef ATOMIC(int) atomic_flag; | |
474 | #define ATOMIC_FLAG_INIT { false } | |
475 | ||
476 | #define atomic_flag_test_and_set_explicit(FLAG, ORDER) \ | |
477 | ((bool)atomic_exchange__(FLAG, 1, ORDER)) | |
478 | #define atomic_flag_test_and_set(FLAG) \ | |
479 | atomic_flag_test_and_set_explicit(FLAG, memory_order_seq_cst) | |
480 | ||
481 | #define atomic_flag_clear_explicit(FLAG, ORDER) \ | |
482 | atomic_store_explicit(FLAG, 0, ORDER) | |
483 | #define atomic_flag_clear(FLAG) \ | |
484 | atomic_flag_clear_explicit(FLAG, memory_order_seq_cst) |
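
/* Hedged usage sketch: atomic_flag as a minimal test-and-set spinlock
 * (hypothetical helpers): */
static inline void
example_spin_lock__(atomic_flag *lock)
{
    /* Acquire: later loads and stores may not move before taking the lock. */
    while (atomic_flag_test_and_set_explicit(lock, memory_order_acquire)) {
        /* Spin until the previous holder clears the flag. */
    }
}

static inline void
example_spin_unlock__(atomic_flag *lock)
{
    /* Release: earlier loads and stores may not move past the unlock. */
    atomic_flag_clear_explicit(lock, memory_order_release);
}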