[ceph.git] / ceph / src / spdk / dpdk / lib / librte_distributor / distributor_private.h

/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2017 Intel Corporation
 */

#ifndef _DIST_PRIV_H_
#define _DIST_PRIV_H_

/**
 * @file
 * RTE distributor
 *
 * The distributor is a component which is designed to pass packets
 * one-at-a-time to workers, with dynamic load balancing.
 */

#ifdef __cplusplus
extern "C" {
#endif

/* Flags value with no option bits set. */
#define NO_FLAGS 0
/*
 * Short name prefix used by the distributor.
 * NOTE(review): presumably prepended to the instance name when naming the
 * backing memory object -- confirm against the .c users.
 */
#define RTE_DISTRIB_PREFIX "DT_"

/*
 * The low RTE_DISTRIB_FLAG_BITS bits of each pointer word carry flags. To
 * make room, the pointer is shifted up and its top four bits are dropped
 * (a 64-bit pointer actually only uses 48 bits). An arithmetic right shift
 * then restores the original pointer value, with proper sign extension into
 * the top bits.
 *
 * The mask is written in terms of the bit count so the two always agree.
 */
#define RTE_DISTRIB_FLAG_BITS 4
#define RTE_DISTRIB_FLAGS_MASK ((1 << RTE_DISTRIB_FLAG_BITS) - 1)

/** Empty flags: no buffer requested. */
#define RTE_DISTRIB_NO_BUF 0
/** Worker requests a buffer and returns the old one. */
#define RTE_DISTRIB_GET_BUF (1)
/** Worker returns a buffer without requesting a new one. */
#define RTE_DISTRIB_RETURN_BUF (2)
/** Set if bufptr contains a pointer. */
#define RTE_DISTRIB_VALID_BUF (4)

/*
 * Backlog depth and the matching index mask. The mask is SIZE - 1, so it is
 * only a valid wrap-around mask while the size stays a power of two.
 */
#define RTE_DISTRIB_BACKLOG_SIZE 8
#define RTE_DISTRIB_BACKLOG_MASK (RTE_DISTRIB_BACKLOG_SIZE - 1)

/*
 * Capacity of the returned-packets array (see struct
 * rte_distributor_returned_pkts) and the matching index mask. As above, the
 * capacity must stay a power of two for the mask to be valid.
 */
#define RTE_DISTRIB_MAX_RETURNS 128
#define RTE_DISTRIB_RETURNS_MASK (RTE_DISTRIB_MAX_RETURNS - 1)

/**
 * Maximum number of workers allowed.
 * Be careful when increasing this limit: it is bounded by how in-flight
 * tags are tracked (in_flight_bitmask is a single 64-bit word). See
 * in_flight_bitmask and rte_distributor_process.
 */
#define RTE_DISTRIB_MAX_WORKERS 64

#define RTE_DISTRIBUTOR_NAMESIZE 32 /**< Length of name for instance */

/**
 * Buffer used to pass the pointer data between cores; one such buffer exists
 * per worker in struct rte_distributor_single. It is cache-line aligned, and
 * to improve performance and prevent adjacent cache-line prefetches from
 * touching the buffers of other workers (e.g. when worker 1's buffer is on
 * the cache line after worker 0's), it is padded out to three cache lines.
 * Only 64 bits of the memory are actually used.
 */
union rte_distributor_buffer_single {
	/* The one live 64-bit word; volatile because it is shared between cores. */
	volatile int64_t bufptr64;
	/* Sizing member: makes the union three cache lines long. */
	char pad[RTE_CACHE_LINE_SIZE*3];
} __rte_cache_aligned;

/*
 * Number of mbufs transferred to/from a worker at a time (up to 8). The
 * flow matching algorithm is optimized for the same number of flow IDs
 * at a time.
 */
#define RTE_DIST_BURST_SIZE 8

/**
 * Backlog of packet words kept for one worker (both distributor structs in
 * this file hold an array of these, one entry per possible worker).
 *
 * NOTE(review): start and count look like the read position and fill level
 * of a small ring over pkts[] -- confirm against the .c users.
 */
struct rte_distributor_backlog {
	unsigned int start;
	unsigned int count;
	/* RTE_DIST_BURST_SIZE 64-bit packet words, cache-aligned within the struct. */
	int64_t pkts[RTE_DIST_BURST_SIZE] __rte_cache_aligned;
	uint16_t *tags; /* will point to second cacheline of inflights */
} __rte_cache_aligned;


/**
 * Fixed-capacity store of returned mbuf pointers (the "returns" member of
 * both distributor structs in this file).
 *
 * NOTE(review): start and count presumably index mbufs[] as a circular
 * buffer wrapped with RTE_DISTRIB_RETURNS_MASK -- confirm against the .c
 * users.
 */
struct rte_distributor_returned_pkts {
	unsigned int start;
	unsigned int count;
	/* Holds up to RTE_DISTRIB_MAX_RETURNS mbuf pointers. */
	struct rte_mbuf *mbufs[RTE_DISTRIB_MAX_RETURNS];
};

/**
 * State of one "single" distributor instance: per-worker arrays are sized
 * for RTE_DISTRIB_MAX_WORKERS, and each worker exchanges one 64-bit word at
 * a time through its rte_distributor_buffer_single.
 */
struct rte_distributor_single {
	TAILQ_ENTRY(rte_distributor_single) next;    /**< Next in list. */

	char name[RTE_DISTRIBUTOR_NAMESIZE];  /**< Name of this instance. */
	unsigned int num_workers;             /**< Number of workers polling */

	uint32_t in_flight_tags[RTE_DISTRIB_MAX_WORKERS];
		/**< Tracks the tag being processed per core */
	uint64_t in_flight_bitmask;
		/**< on/off bits for in-flight tags.
		 * Note that if RTE_DISTRIB_MAX_WORKERS is larger than 64 then
		 * the bitmask has to expand.
		 */

	struct rte_distributor_backlog backlog[RTE_DISTRIB_MAX_WORKERS];
		/**< One backlog per possible worker. */

	union rte_distributor_buffer_single bufs[RTE_DISTRIB_MAX_WORKERS];
		/**< One padded exchange buffer per possible worker. */

	struct rte_distributor_returned_pkts returns;
		/**< Store of returned mbufs. */
};

/*
 * Identifiers for the available signature compare functions.
 * NOTE(review): the first two presumably select find_match_scalar() and
 * find_match_vec() respectively -- confirm against the .c users.
 */
enum rte_distributor_match_function {
	RTE_DIST_MATCH_SCALAR = 0, /* scalar compare */
	RTE_DIST_MATCH_VECTOR = 1, /* vector compare */
	RTE_DIST_NUM_MATCH_FNS     /* count of the entries above; keep last */
};

/**
 * Buffer structure used to pass the pointer data between cores in the burst
 * API. Every member is declared __rte_cache_aligned; the pad members sit
 * between the pointer arrays to improve performance and prevent adjacent
 * cache-line prefetches of data belonging to other workers.
 * We can pass up to 8 mbufs at a time in one cacheline (8 x 64-bit words).
 * There is a separate cacheline for returns in the burst API.
 *
 * NOTE(review): the earlier wording said this is padded out to "two cache
 * lines"; with five cache-aligned members the structure presumably spans at
 * least five -- confirm the intended layout.
 */
struct rte_distributor_buffer {
	volatile int64_t bufptr64[RTE_DIST_BURST_SIZE]
		__rte_cache_aligned; /* <= outgoing to worker */

	int64_t pad1 __rte_cache_aligned;    /* <= one cache line  */

	volatile int64_t retptr64[RTE_DIST_BURST_SIZE]
		__rte_cache_aligned; /* <= incoming from worker */

	int64_t pad2 __rte_cache_aligned;    /* <= one cache line  */

	int count __rte_cache_aligned;       /* <= number of current mbufs */
};

/**
 * State of one burst-mode distributor instance. Per-worker arrays are sized
 * for RTE_DISTRIB_MAX_WORKERS.
 */
struct rte_distributor {
	TAILQ_ENTRY(rte_distributor) next;    /**< Next in list. */

	char name[RTE_DISTRIBUTOR_NAMESIZE];  /**< Name of this instance. */
	unsigned int num_workers;             /**< Number of workers polling */
	unsigned int alg_type;
		/**< Algorithm type.
		 * NOTE(review): the earlier comment read "Number of alg
		 * types"; the field name suggests the selected type -- confirm.
		 */

	/**
	 * One row of 2 * RTE_DIST_BURST_SIZE tags per worker. The first
	 * part of each row holds the tags in flight on the worker core;
	 * the second part holds the tags of the backlog that is going to
	 * go to the worker core (rte_distributor_backlog::tags is
	 * documented as pointing at that second part).
	 */
	uint16_t in_flight_tags[RTE_DISTRIB_MAX_WORKERS][RTE_DIST_BURST_SIZE*2]
			__rte_cache_aligned;

	/* One backlog per possible worker. */
	struct rte_distributor_backlog backlog[RTE_DISTRIB_MAX_WORKERS]
			__rte_cache_aligned;

	/* One exchange buffer (outgoing + incoming areas) per possible worker. */
	struct rte_distributor_buffer bufs[RTE_DISTRIB_MAX_WORKERS];

	/* Store of returned mbufs. */
	struct rte_distributor_returned_pkts returns;

	/* Which signature compare function this instance uses. */
	enum rte_distributor_match_function dist_match_fn;

	/*
	 * Companion "single" distributor state.
	 * NOTE(review): presumably only set when the instance runs in
	 * single-packet mode -- confirm against the .c users.
	 */
	struct rte_distributor_single *d_single;
};

/**
 * Scalar signature compare function.
 *
 * NOTE(review): the parameter roles below are inferred from the names only;
 * confirm against the implementation.
 *
 * @param d
 *   Distributor instance to match against.
 * @param data_ptr
 *   Presumably the tags to look up.
 * @param output_ptr
 *   Presumably receives one match result per looked-up tag.
 */
void find_match_scalar(struct rte_distributor *d, uint16_t *data_ptr,
		       uint16_t *output_ptr);

/**
 * Vector signature compare function.
 *
 * NOTE(review): the parameter roles below are inferred from the names only;
 * confirm against the implementation.
 *
 * @param d
 *   Distributor instance to match against.
 * @param data_ptr
 *   Presumably the tags to look up.
 * @param output_ptr
 *   Presumably receives one match result per looked-up tag.
 */
void find_match_vec(struct rte_distributor *d, uint16_t *data_ptr,
		    uint16_t *output_ptr);

#ifdef __cplusplus
}
#endif

#endif /* _DIST_PRIV_H_ */
Commit	Line	Data
11fdf7f2 TL	1	/* SPDX-License-Identifier: BSD-3-Clause
	2	* Copyright(c) 2017 Intel Corporation
	3	*/
	4
f67539c2 TL	5	#ifndef _DIST_PRIV_H_
f67539c2 TL	6	#define _DIST_PRIV_H_
11fdf7f2 TL	7
	8	/**
	9	* @file
	10	* RTE distributor
	11	*
	12	* The distributor is a component which is designed to pass packets
	13	* one-at-a-time to workers, with dynamic load balancing.
	14	*/
	15
	16	#ifdef __cplusplus
	17	extern "C" {
	18	#endif
	19
	20	#define NO_FLAGS 0
	21	#define RTE_DISTRIB_PREFIX "DT_"
	22
	23	/*
	24	* We will use the bottom four bits of pointer for flags, shifting out
	25	* the top four bits to make room (since a 64-bit pointer actually only uses
	26	* 48 bits). An arithmetic-right-shift will then appropriately restore the
	27	* original pointer value with proper sign extension into the top bits.
	28	*/
	29	#define RTE_DISTRIB_FLAG_BITS 4
	30	#define RTE_DISTRIB_FLAGS_MASK (0x0F)
	31	#define RTE_DISTRIB_NO_BUF 0 /**< empty flags: no buffer requested */
	32	#define RTE_DISTRIB_GET_BUF (1) /**< worker requests a buffer, returns old */
	33	#define RTE_DISTRIB_RETURN_BUF (2) /**< worker returns a buffer, no request */
	34	#define RTE_DISTRIB_VALID_BUF (4) /**< set if bufptr contains ptr */
	35
	36	#define RTE_DISTRIB_BACKLOG_SIZE 8
	37	#define RTE_DISTRIB_BACKLOG_MASK (RTE_DISTRIB_BACKLOG_SIZE - 1)
	38
	39	#define RTE_DISTRIB_MAX_RETURNS 128
	40	#define RTE_DISTRIB_RETURNS_MASK (RTE_DISTRIB_MAX_RETURNS - 1)
	41
	42	/**
	43	* Maximum number of workers allowed.
9f95a23c	44	* Be aware of increasing the limit, because it is limited by how we track
11fdf7f2 TL	45	* in-flight tags. See in_flight_bitmask and rte_distributor_process
	46	*/
	47	#define RTE_DISTRIB_MAX_WORKERS 64
	48
	49	#define RTE_DISTRIBUTOR_NAMESIZE 32 /**< Length of name for instance */
	50
	51	/**
	52	* Buffer structure used to pass the pointer data between cores. This is cache
	53	* line aligned, but to improve performance and prevent adjacent cache-line
	54	* prefetches of buffers for other workers, e.g. when worker 1's buffer is on
	55	* the next cache line to worker 0, we pad this out to three cache lines.
	56	* Only 64-bits of the memory is actually used though.
	57	*/
f67539c2	58	union rte_distributor_buffer_single {
11fdf7f2 TL	59	volatile int64_t bufptr64;
	60	char pad[RTE_CACHE_LINE_SIZE*3];
	61	} __rte_cache_aligned;
	62
	63	/*
	64	* Transfer up to 8 mbufs at a time to/from workers, and
	65	* flow matching algorithm optimized for 8 flow IDs at a time
	66	*/
	67	#define RTE_DIST_BURST_SIZE 8
	68
	69	struct rte_distributor_backlog {
	70	unsigned int start;
	71	unsigned int count;
	72	int64_t pkts[RTE_DIST_BURST_SIZE] __rte_cache_aligned;
	73	uint16_t *tags; /* will point to second cacheline of inflights */
	74	} __rte_cache_aligned;
	75
	76
	77	struct rte_distributor_returned_pkts {
	78	unsigned int start;
	79	unsigned int count;
	80	struct rte_mbuf *mbufs[RTE_DISTRIB_MAX_RETURNS];
	81	};
	82
f67539c2 TL	83	struct rte_distributor_single {
f67539c2 TL	84	TAILQ_ENTRY(rte_distributor_single) next; /**< Next in list. */
11fdf7f2 TL	85
	86	char name[RTE_DISTRIBUTOR_NAMESIZE]; /**< Name of the ring. */
	87	unsigned int num_workers; /**< Number of workers polling */
	88
	89	uint32_t in_flight_tags[RTE_DISTRIB_MAX_WORKERS];
	90	/**< Tracks the tag being processed per core */
	91	uint64_t in_flight_bitmask;
	92	/**< on/off bits for in-flight tags.
	93	* Note that if RTE_DISTRIB_MAX_WORKERS is larger than 64 then
	94	* the bitmask has to expand.
	95	*/
	96
	97	struct rte_distributor_backlog backlog[RTE_DISTRIB_MAX_WORKERS];
	98
f67539c2	99	union rte_distributor_buffer_single bufs[RTE_DISTRIB_MAX_WORKERS];
11fdf7f2 TL	100
	101	struct rte_distributor_returned_pkts returns;
	102	};
	103
	104	/* All different signature compare functions */
	105	enum rte_distributor_match_function {
	106	RTE_DIST_MATCH_SCALAR = 0,
	107	RTE_DIST_MATCH_VECTOR,
	108	RTE_DIST_NUM_MATCH_FNS
	109	};
	110
	111	/**
	112	* Buffer structure used to pass the pointer data between cores. This is cache
	113	* line aligned, but to improve performance and prevent adjacent cache-line
	114	* prefetches of buffers for other workers, e.g. when worker 1's buffer is on
	115	* the next cache line to worker 0, we pad this out to two cache lines.
	116	* We can pass up to 8 mbufs at a time in one cacheline.
	117	* There is a separate cacheline for returns in the burst API.
	118	*/
	119	struct rte_distributor_buffer {
	120	volatile int64_t bufptr64[RTE_DIST_BURST_SIZE]
	121	__rte_cache_aligned; /* <= outgoing to worker */
	122
	123	int64_t pad1 __rte_cache_aligned; /* <= one cache line */
	124
	125	volatile int64_t retptr64[RTE_DIST_BURST_SIZE]
	126	__rte_cache_aligned; /* <= incoming from worker */
	127
	128	int64_t pad2 __rte_cache_aligned; /* <= one cache line */
	129
	130	int count __rte_cache_aligned; /* <= number of current mbufs */
	131	};
	132
	133	struct rte_distributor {
	134	TAILQ_ENTRY(rte_distributor) next; /**< Next in list. */
	135
	136	char name[RTE_DISTRIBUTOR_NAMESIZE]; /**< Name of the ring. */
	137	unsigned int num_workers; /**< Number of workers polling */
	138	unsigned int alg_type; /**< Number of alg types */
	139
	140	/**>
	141	* First cache line in the this array are the tags inflight
	142	* on the worker core. Second cache line are the backlog
	143	* that are going to go to the worker core.
	144	*/
	145	uint16_t in_flight_tags[RTE_DISTRIB_MAX_WORKERS][RTE_DIST_BURST_SIZE*2]
	146	__rte_cache_aligned;
	147
	148	struct rte_distributor_backlog backlog[RTE_DISTRIB_MAX_WORKERS]
	149	__rte_cache_aligned;
	150
	151	struct rte_distributor_buffer bufs[RTE_DISTRIB_MAX_WORKERS];
	152
	153	struct rte_distributor_returned_pkts returns;
	154
	155	enum rte_distributor_match_function dist_match_fn;
	156
f67539c2	157	struct rte_distributor_single *d_single;
11fdf7f2 TL	158	};
	159
	160	void
	161	find_match_scalar(struct rte_distributor *d,
	162	uint16_t *data_ptr,
	163	uint16_t *output_ptr);
	164
	165	void
	166	find_match_vec(struct rte_distributor *d,
	167	uint16_t *data_ptr,
	168	uint16_t *output_ptr);
	169
	170	#ifdef __cplusplus
	171	}
	172	#endif
	173
f67539c2	174	#endif /* _DIST_PRIV_H_ */