/*
 * Copyright (c) 2014 Nicira, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* This header implements atomic operation primitives on x86_64 with GCC. */
#ifndef IN_OVS_ATOMIC_H
#error "This header should only be included indirectly via ovs-atomic.h."
#endif

#define OVS_ATOMIC_X86_64_IMPL 1

/*
 * x86_64 Memory model (http://www.intel.com/content/www/us/en/architecture-and-technology/64-ia-32-architectures-software-developer-vol-3a-part-1-manual.html):
 *
 * - 1, 2, 4, and 8 byte loads and stores are atomic on aligned memory.
 * - Loads are not reordered with other loads.
 * - Stores are not reordered with OLDER loads.
 * - Loads may be reordered with OLDER stores to a different memory location,
 *   but not with OLDER stores to the same memory location.
 * - Stores are not reordered with other stores, except for special
 *   instructions (CLFLUSH, streaming stores, fast string operations).
 *   Most of these are not emitted by compilers, and as long as the
 *   atomic stores are not combined with any other stores, even the allowed
 *   reordering of the stores by a single fast string operation (e.g., "stos")
 *   is not a problem.
 * - Neither loads nor stores are reordered with locked instructions.
 * - Loads cannot pass earlier LFENCE or MFENCE instructions.
 * - Stores cannot pass earlier LFENCE, SFENCE, or MFENCE instructions.
 * - LFENCE instruction cannot pass earlier loads.
 * - SFENCE instruction cannot pass earlier stores.
 * - MFENCE instruction cannot pass earlier loads or stores.
 * - Stores by a single processor are observed in the same order by all
 *   processors.
 * - (Unlocked) Stores from different processors are NOT ordered.
 * - Memory ordering obeys causality (memory ordering respects transitive
 *   visibility).
 * - Any two stores are seen in a consistent order by processors other than
 *   those performing the stores.
 * - Locked instructions have total order.
 *
 * These rules imply that:
 *
 * - Locked instructions are not needed for aligned loads or stores to make
 *   them atomic.
 * - All stores have release semantics; none of the preceding stores or loads
 *   can be reordered with following stores. Following loads could still be
 *   reordered to happen before the store, but that is not a violation of the
 *   release semantics.
 * - All loads from a given memory location have acquire semantics with
 *   respect to the stores on the same memory location; none of the following
 *   loads or stores can be reordered with the load. Preceding stores to a
 *   different memory location MAY be reordered with the load, but that is not
 *   a violation of the acquire semantics (i.e., the loads and stores of two
 *   critical sections guarded by a different memory location can overlap).
 * - Locked instructions serve as CPU memory barriers by themselves.
 * - Locked stores implement the sequential consistency memory order. Using
 *   locked instructions when seq_cst memory order is requested allows normal
 *   loads to observe the stores in the same (total) order without using CPU
 *   memory barrier after the loads.
 *
 * NOTE: Some older AMD Opteron processors have a bug that violates the
 * acquire semantics described above. The bug manifests as an unlocked
 * read-modify-write operation following a "semaphore operation" operating
 * on data that existed before entering the critical section; i.e., the
 * preceding "semaphore operation" fails to function as an acquire barrier.
 * The affected CPUs are AMD family 15, models 32 to 63.
 *
 * Ref. http://support.amd.com/TechDocs/25759.pdf errata #147.
 */
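
/* As an illustration of the rules above (a sketch added here, with
 * hypothetical variables 'data' and 'flag'), the classic message-passing
 * pattern is correct on x86_64 with plain MOV loads and stores:
 *
 *     CPU 0                           CPU 1
 *     data = 42;    (plain store)     while (!flag) { ; }  (plain load)
 *     flag = 1;     (plain store)     use(data);           (sees 42)
 *
 * The store to 'flag' cannot be reordered with the older store to 'data',
 * and the load of 'data' cannot be reordered with the older load of 'flag',
 * so CPU 1 observes 'data == 42' once it sees 'flag == 1'. */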

/* Barriers. */

#define compiler_barrier() asm volatile(" " : : : "memory")
#define cpu_barrier() asm volatile("mfence;" : : : "memory")

/*
 * The 'volatile' keyword prevents the compiler from keeping the atomic
 * value in a register, and generates a new memory access for each atomic
 * operation. This allows the implementations of memory_order_relaxed and
 * memory_order_consume to avoid issuing a compiler memory barrier, allowing
 * full optimization of all surrounding non-atomic variables.
 *
 * The placement of the 'volatile' keyword after the 'TYPE' below is highly
 * significant when the TYPE is a pointer type. In that case we want the
 * pointer to be declared volatile, not the data type that is being pointed
 * at!
 */
#define ATOMIC(TYPE) TYPE volatile
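
/* For instance (an illustrative note added here, not from the original
 * source), with a pointer type the volatile qualifier binds to the pointer
 * itself:
 *
 *     ATOMIC(int *) p;    expands to    int * volatile p;
 *
 * that is, a volatile pointer to a plain int, rather than a pointer to a
 * volatile int. */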

/* Memory ordering. Must be passed in as a constant. */
typedef enum {
    memory_order_relaxed,
    memory_order_consume,
    memory_order_acquire,
    memory_order_release,
    memory_order_acq_rel,
    memory_order_seq_cst
} memory_order;
\f
#define ATOMIC_BOOL_LOCK_FREE 2
#define ATOMIC_CHAR_LOCK_FREE 2
#define ATOMIC_SHORT_LOCK_FREE 2
#define ATOMIC_INT_LOCK_FREE 2
#define ATOMIC_LONG_LOCK_FREE 2
#define ATOMIC_LLONG_LOCK_FREE 2
#define ATOMIC_POINTER_LOCK_FREE 2

#define IS_LOCKLESS_ATOMIC(OBJECT) \
    (sizeof(OBJECT) <= 8 && IS_POW2(sizeof(OBJECT)))
\f
#define ATOMIC_VAR_INIT(VALUE) VALUE
#define atomic_init(OBJECT, VALUE) (*(OBJECT) = (VALUE), (void) 0)
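
/* Usage sketch (added for illustration; 'counter' is a hypothetical
 * variable). Neither form implies any memory ordering, so they are only
 * safe to use before the variable is shared between threads:
 *
 *     static ATOMIC(uint64_t) counter = ATOMIC_VAR_INIT(0);
 *     ...
 *     atomic_init(&counter, 42);
 */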

/*
 * The memory_order_relaxed does not need a compiler barrier if the atomic
 * operation can otherwise be guaranteed not to be moved with respect to
 * other atomic operations on the same memory location. Using the 'volatile'
 * keyword in the definition of the atomic types accomplishes this, as
 * memory accesses to volatile data may not be optimized away, or be
 * reordered with other volatile accesses.
 *
 * On x86, memory_order_consume is also automatic, and data dependency on a
 * volatile atomic variable means that the compiler optimizations should not
 * cause problems. That is, the compiler should not speculate the value of
 * the atomic_read, as it is going to read it from the memory anyway.
 * This allows omitting the compiler memory barrier on atomic_reads with
 * memory_order_consume. This matches the definition of
 * smp_read_barrier_depends() in the Linux kernel as a no-op for x86, and its
 * usage in rcu_dereference().
 *
 * We use this same logic below to choose inline assembly statements with or
 * without a compiler memory barrier.
 */
static inline void
atomic_compiler_barrier(memory_order order)
{
    if (order > memory_order_consume) {
        compiler_barrier();
    }
}

static inline void
atomic_thread_fence(memory_order order)
{
    if (order == memory_order_seq_cst) {
        cpu_barrier();
    } else {
        atomic_compiler_barrier(order);
    }
}

static inline void
atomic_signal_fence(memory_order order)
{
    atomic_compiler_barrier(order);
}

#define atomic_is_lock_free(OBJ)                \
    ((void) *(OBJ),                             \
     IS_LOCKLESS_ATOMIC(*(OBJ)) ? 2 : 0)

#define atomic_exchange__(DST, SRC, ORDER)        \
    ({                                            \
        typeof(DST) dst___ = (DST);               \
        typeof(*(DST)) src___ = (SRC);            \
                                                  \
        if ((ORDER) > memory_order_consume) {     \
            asm volatile("xchg %1,%0 ; "          \
                         "# atomic_exchange__"    \
                         : "+r" (src___), /* 0 */ \
                           "+m" (*dst___) /* 1 */ \
                         :: "memory");            \
        } else {                                  \
            asm volatile("xchg %1,%0 ; "          \
                         "# atomic_exchange__"    \
                         : "+r" (src___), /* 0 */ \
                           "+m" (*dst___)); /* 1 */ \
        }                                         \
        src___;                                   \
    })

/* Atomic store: Valid memory models are:
 *
 * memory_order_relaxed, memory_order_release, and
 * memory_order_seq_cst. */
#define atomic_store_explicit(DST, SRC, ORDER)      \
    ({                                              \
        typeof(DST) dst__ = (DST);                  \
        typeof(*(DST)) src__ = (SRC);               \
                                                    \
        if ((ORDER) != memory_order_seq_cst) {      \
            atomic_compiler_barrier(ORDER);         \
            *dst__ = src__;                         \
        } else {                                    \
            atomic_exchange__(dst__, src__, ORDER); \
        }                                           \
        (void) 0;                                   \
    })
#define atomic_store(DST, SRC) \
    atomic_store_explicit(DST, SRC, memory_order_seq_cst)
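
/* Illustrative sketch (added here, not part of the original header): a
 * release store of a hypothetical 'ready' flag. Since the order is not
 * seq_cst, this compiles to a plain MOV preceded by a compiler barrier;
 * only seq_cst stores take the XCHG path above. */
static inline void
atomic_store_release_example(ATOMIC(int) *ready, int value)
{
    atomic_store_explicit(ready, value, memory_order_release);
}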

/* Atomic read: Valid memory models are:
 *
 * memory_order_relaxed, memory_order_consume, memory_order_acquire,
 * and memory_order_seq_cst. */
#define atomic_read_explicit(SRC, DST, ORDER)       \
    ({                                              \
        typeof(DST) dst__ = (DST);                  \
        typeof(SRC) src__ = (SRC);                  \
                                                    \
        *dst__ = *src__;                            \
        atomic_compiler_barrier(ORDER);             \
        (void) 0;                                   \
    })
#define atomic_read(SRC, DST) \
    atomic_read_explicit(SRC, DST, memory_order_seq_cst)
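
/* Illustrative sketch (added here; pairs with the release-store example
 * above). An acquire load on x86_64 is a plain MOV followed by a compiler
 * barrier; no fence instruction is needed even for seq_cst reads, because
 * seq_cst stores are locked. */
static inline int
atomic_read_acquire_example(ATOMIC(int) *ready)
{
    int value;

    atomic_read_explicit(ready, &value, memory_order_acquire);
    return value;
}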

#define atomic_compare_exchange__(DST, EXP, SRC, RES, CLOB) \
    asm volatile("lock; cmpxchg %3,%1 ; "                   \
                 " sete %0 "                                \
                 "# atomic_compare_exchange__"              \
                 : "=q" (RES),  /* 0 */                     \
                   "+m" (*DST), /* 1 */                     \
                   "+a" (EXP)   /* 2 */                     \
                 : "r" (SRC)    /* 3 */                     \
                 : CLOB, "cc")

/* All memory models are valid for read-modify-write operations.
 *
 * Valid memory models for the read operation of the current value in
 * the failure case are the same as for atomic read, but cannot be
 * stronger than the success memory model.
 * ORD_FAIL is ignored, as atomic_compare_exchange__ already implements
 * at least as strong a barrier as allowed for ORD_FAIL in all cases. */
#define atomic_compare_exchange_strong_explicit(DST, EXP, SRC, ORDER, ORD_FAIL) \
    ({                                                            \
        typeof(DST) dst__ = (DST);                                \
        typeof(DST) expp__ = (EXP);                               \
        typeof(*(DST)) src__ = (SRC);                             \
        typeof(*(DST)) exp__ = *expp__;                           \
        uint8_t res__;                                            \
        (void) ORD_FAIL;                                          \
                                                                  \
        if ((ORDER) > memory_order_consume) {                     \
            atomic_compare_exchange__(dst__, exp__, src__, res__, \
                                      "memory");                  \
        } else {                                                  \
            atomic_compare_exchange__(dst__, exp__, src__, res__, \
                                      "cc");                      \
        }                                                         \
        if (!res__) {                                             \
            *expp__ = exp__;                                      \
        }                                                         \
        (bool) res__;                                             \
    })
#define atomic_compare_exchange_strong(DST, EXP, SRC) \
    atomic_compare_exchange_strong_explicit(DST, EXP, SRC, \
                                            memory_order_seq_cst, \
                                            memory_order_seq_cst)
#define atomic_compare_exchange_weak \
    atomic_compare_exchange_strong
#define atomic_compare_exchange_weak_explicit \
    atomic_compare_exchange_strong_explicit
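
/* Illustrative sketch (added here; 'counter' is a hypothetical variable): a
 * typical compare-and-swap loop that atomically doubles a value. On
 * failure the current value is written back into 'old' by the macro, so the
 * loop does not need to re-read it. */
static inline uint32_t
atomic_double_example(ATOMIC(uint32_t) *counter)
{
    uint32_t old;

    atomic_read_explicit(counter, &old, memory_order_relaxed);
    while (!atomic_compare_exchange_weak_explicit(counter, &old, old * 2,
                                                  memory_order_acq_rel,
                                                  memory_order_relaxed)) {
        continue;
    }
    return old;
}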

#define atomic_add__(RMW, ARG, CLOB)        \
    asm volatile("lock; xadd %0,%1 ; "      \
                 "# atomic_add__ "          \
                 : "+r" (ARG), /* 0 */      \
                   "+m" (*RMW) /* 1 */      \
                 :: CLOB, "cc")

#define atomic_add_explicit(RMW, ARG, ORIG, ORDER) \
    ({                                             \
        typeof(RMW) rmw__ = (RMW);                 \
        typeof(*(RMW)) arg__ = (ARG);              \
                                                   \
        if ((ORDER) > memory_order_consume) {      \
            atomic_add__(rmw__, arg__, "memory");  \
        } else {                                   \
            atomic_add__(rmw__, arg__, "cc");      \
        }                                          \
        *(ORIG) = arg__;                           \
    })
#define atomic_add(RMW, ARG, ORIG) \
    atomic_add_explicit(RMW, ARG, ORIG, memory_order_seq_cst)

#define atomic_sub_explicit(RMW, ARG, ORIG, ORDER) \
    atomic_add_explicit(RMW, -(ARG), ORIG, ORDER)
#define atomic_sub(RMW, ARG, ORIG) \
    atomic_sub_explicit(RMW, ARG, ORIG, memory_order_seq_cst)
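
/* Illustrative sketch (added here; 'refcount' is a hypothetical variable):
 * a relaxed reference-count increment. XADD leaves the pre-increment value
 * in the register operand, which atomic_add_explicit stores through ORIG. */
static inline uint32_t
atomic_refcount_inc_example(ATOMIC(uint32_t) *refcount)
{
    uint32_t orig;

    atomic_add_explicit(refcount, 1, &orig, memory_order_relaxed);
    return orig;
}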

/* We could use simple locked instructions if the original value was not
 * needed. */
#define atomic_op__(RMW, OP, ARG, ORIG, ORDER)                         \
    ({                                                                 \
        typeof(RMW) rmw__ = (RMW);                                     \
        typeof(ARG) arg__ = (ARG);                                     \
                                                                       \
        typeof(*(RMW)) val__;                                          \
                                                                       \
        atomic_read_explicit(rmw__, &val__, memory_order_relaxed);     \
        do {                                                           \
        } while (!atomic_compare_exchange_weak_explicit(rmw__, &val__, \
                                                        val__ OP arg__, \
                                                        ORDER,         \
                                                        memory_order_relaxed)); \
        *(ORIG) = val__;                                               \
    })

#define atomic_or_explicit(RMW, ARG, ORIG, ORDER) \
    atomic_op__(RMW, |, ARG, ORIG, ORDER)
#define atomic_or(RMW, ARG, ORIG) \
    atomic_or_explicit(RMW, ARG, ORIG, memory_order_seq_cst)

#define atomic_xor_explicit(RMW, ARG, ORIG, ORDER) \
    atomic_op__(RMW, ^, ARG, ORIG, ORDER)
#define atomic_xor(RMW, ARG, ORIG) \
    atomic_xor_explicit(RMW, ARG, ORIG, memory_order_seq_cst)

#define atomic_and_explicit(RMW, ARG, ORIG, ORDER) \
    atomic_op__(RMW, &, ARG, ORIG, ORDER)
#define atomic_and(RMW, ARG, ORIG) \
    atomic_and_explicit(RMW, ARG, ORIG, memory_order_seq_cst)
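
/* Illustrative sketch (added here; 'flags' and 'bit' are hypothetical):
 * atomically setting one bit through the CAS loop behind
 * atomic_or_explicit, returning the previous mask. */
static inline uint32_t
atomic_set_bit_example(ATOMIC(uint32_t) *flags, unsigned int bit)
{
    uint32_t orig;

    atomic_or_explicit(flags, UINT32_C(1) << bit, &orig,
                       memory_order_acq_rel);
    return orig;
}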

\f
/* atomic_flag */

typedef ATOMIC(int) atomic_flag;
#define ATOMIC_FLAG_INIT { false }

#define atomic_flag_test_and_set_explicit(FLAG, ORDER) \
    ((bool) atomic_exchange__(FLAG, 1, ORDER))
#define atomic_flag_test_and_set(FLAG) \
    atomic_flag_test_and_set_explicit(FLAG, memory_order_seq_cst)

#define atomic_flag_clear_explicit(FLAG, ORDER) \
    atomic_store_explicit(FLAG, 0, ORDER)
#define atomic_flag_clear(FLAG) \
    atomic_flag_clear_explicit(FLAG, memory_order_seq_cst)
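
/* Illustrative sketch (added here, not part of the original header): a
 * minimal test-and-set spinlock built on atomic_flag. The acquire order on
 * the XCHG and the release order on the clearing store bracket the critical
 * section. */
static inline void
atomic_flag_spin_lock_example(atomic_flag *lock)
{
    while (atomic_flag_test_and_set_explicit(lock, memory_order_acquire)) {
        continue;               /* Spin until the holder clears the flag. */
    }
}

static inline void
atomic_flag_spin_unlock_example(atomic_flag *lock)
{
    atomic_flag_clear_explicit(lock, memory_order_release);
}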