See the kernels selftest `Documentation/dev-tools/kselftest.rst`_
document for further documentation.
+To maximize the number of tests passing, the .config of the kernel
+under test should match the config file fragment in
+tools/testing/selftests/bpf as closely as possible.
+
+Finally to ensure support for latest BPF Type Format features -
+discussed in `Documentation/bpf/btf.rst`_ - pahole version 1.16
+is required for kernels built with CONFIG_DEBUG_INFO_BTF=y.
+pahole is delivered in the dwarves package or can be built
+from source at
+
+https://github.com/acmel/dwarves
+
+Some distros have pahole version 1.16 packaged already, e.g.
+Fedora, Gentoo.
+
Q: Which BPF kernel selftests version should I run my kernel against?
---------------------------------------------------------------------
A: If you run a kernel ``xyz``, then always run the BPF kernel selftests
+++ /dev/null
-Mediatek pericfg controller
-===========================
-
-The Mediatek pericfg controller provides various clocks and reset
-outputs to the system.
-
-Required Properties:
-
-- compatible: Should be one of:
- - "mediatek,mt2701-pericfg", "syscon"
- - "mediatek,mt2712-pericfg", "syscon"
- - "mediatek,mt7622-pericfg", "syscon"
- - "mediatek,mt7623-pericfg", "mediatek,mt2701-pericfg", "syscon"
- - "mediatek,mt7629-pericfg", "syscon"
- - "mediatek,mt8135-pericfg", "syscon"
- - "mediatek,mt8173-pericfg", "syscon"
- - "mediatek,mt8183-pericfg", "syscon"
-- #clock-cells: Must be 1
-- #reset-cells: Must be 1
-
-The pericfg controller uses the common clk binding from
-Documentation/devicetree/bindings/clock/clock-bindings.txt
-The available clocks are defined in dt-bindings/clock/mt*-clk.h.
-Also it uses the common reset controller binding from
-Documentation/devicetree/bindings/reset/reset.txt.
-The available reset outputs are defined in
-dt-bindings/reset/mt*-resets.h
-
-Example:
-
-pericfg: power-controller@10003000 {
- compatible = "mediatek,mt8173-pericfg", "syscon";
- reg = <0 0x10003000 0 0x1000>;
- #clock-cells = <1>;
- #reset-cells = <1>;
-};
--- /dev/null
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: "http://devicetree.org/schemas/arm/mediatek/mediatek,pericfg.yaml#"
+$schema: "http://devicetree.org/meta-schemas/core.yaml#"
+
+title: MediaTek Peripheral Configuration Controller
+
+maintainers:
+ - Bartosz Golaszewski <bgolaszewski@baylibre.com>
+
+description:
+ The Mediatek pericfg controller provides various clocks and reset outputs
+ to the system.
+
+properties:
+ compatible:
+ oneOf:
+ - items:
+ - enum:
+ - mediatek,mt2701-pericfg
+ - mediatek,mt2712-pericfg
+ - mediatek,mt7622-pericfg
+ - mediatek,mt7629-pericfg
+ - mediatek,mt8135-pericfg
+ - mediatek,mt8173-pericfg
+ - mediatek,mt8183-pericfg
+ - mediatek,mt8516-pericfg
+ - const: syscon
+ - items:
+ # Special case for mt7623 for backward compatibility
+ - const: mediatek,mt7623-pericfg
+ - const: mediatek,mt2701-pericfg
+ - const: syscon
+
+ reg:
+ maxItems: 1
+
+ '#clock-cells':
+ const: 1
+
+ '#reset-cells':
+ const: 1
+
+required:
+ - compatible
+ - reg
+
+examples:
+ - |
+ pericfg@10003000 {
+ compatible = "mediatek,mt8173-pericfg", "syscon";
+ reg = <0x10003000 0x1000>;
+ #clock-cells = <1>;
+ #reset-cells = <1>;
+ };
+
+ - |
+ pericfg@10003000 {
+ compatible = "mediatek,mt7623-pericfg", "mediatek,mt2701-pericfg", "syscon";
+ reg = <0x10003000 0x1000>;
+ #clock-cells = <1>;
+ #reset-cells = <1>;
+ };
--- /dev/null
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/net/mediatek,eth-mac.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: MediaTek STAR Ethernet MAC Controller
+
+maintainers:
+ - Bartosz Golaszewski <bgolaszewski@baylibre.com>
+
+description:
+ This Ethernet MAC is used on the MT8* family of SoCs from MediaTek.
+ It's compliant with 802.3 standards and supports half- and full-duplex
+ modes with flow-control as well as CRC offloading and VLAN tags.
+
+allOf:
+ - $ref: "ethernet-controller.yaml#"
+
+properties:
+ compatible:
+ enum:
+ - mediatek,mt8516-eth
+ - mediatek,mt8518-eth
+ - mediatek,mt8175-eth
+
+ reg:
+ maxItems: 1
+
+ interrupts:
+ maxItems: 1
+
+ clocks:
+ minItems: 3
+ maxItems: 3
+
+ clock-names:
+ additionalItems: false
+ items:
+ - const: core
+ - const: reg
+ - const: trans
+
+ mediatek,pericfg:
+ $ref: /schemas/types.yaml#definitions/phandle
+ description:
+ Phandle to the device containing the PERICFG register range. This is used
+ to control the MII mode.
+
+ mdio:
+ type: object
+ description:
+ Creates and registers an MDIO bus.
+
+required:
+ - compatible
+ - reg
+ - interrupts
+ - clocks
+ - clock-names
+ - mediatek,pericfg
+ - phy-handle
+
+examples:
+ - |
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+ #include <dt-bindings/clock/mt8516-clk.h>
+
+ ethernet: ethernet@11180000 {
+ compatible = "mediatek,mt8516-eth";
+ reg = <0x11180000 0x1000>;
+ mediatek,pericfg = <&pericfg>;
+ interrupts = <GIC_SPI 111 IRQ_TYPE_LEVEL_LOW>;
+ clocks = <&topckgen CLK_TOP_RG_ETH>,
+ <&topckgen CLK_TOP_66M_ETH>,
+ <&topckgen CLK_TOP_133M_ETH>;
+ clock-names = "core", "reg", "trans";
+ phy-handle = <ð_phy>;
+ phy-mode = "rmii";
+
+ mdio {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ eth_phy: ethernet-phy@0 {
+ reg = <0>;
+ };
+ };
+ };
L: netdev@vger.kernel.org
L: bpf@vger.kernel.org
S: Maintained
-F: kernel/bpf/xskmap.c
+F: include/net/xdp_sock*
+F: include/net/xsk_buffer_pool.h
+F: include/uapi/linux/if_xdp.h
F: net/xdp/
+F: samples/bpf/xdpsock*
+F: tools/lib/bpf/xsk*
XEN BLOCK SUBSYSTEM
M: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
#clock-cells = <1>;
};
+ pericfg: pericfg@10003050 {
+ compatible = "mediatek,mt8516-pericfg", "syscon";
+ reg = <0 0x10003050 0 0x1000>;
+ };
+
apmixedsys: apmixedsys@10018000 {
compatible = "mediatek,mt8516-apmixedsys", "syscon";
reg = <0 0x10018000 0 0x710>;
status = "disabled";
};
+ ethernet: ethernet@11180000 {
+ compatible = "mediatek,mt8516-eth";
+ reg = <0 0x11180000 0 0x1000>;
+ mediatek,pericfg = <&pericfg>;
+ interrupts = <GIC_SPI 111 IRQ_TYPE_LEVEL_LOW>;
+ clocks = <&topckgen CLK_TOP_RG_ETH>,
+ <&topckgen CLK_TOP_66M_ETH>,
+ <&topckgen CLK_TOP_133M_ETH>;
+ clock-names = "core", "reg", "trans";
+ status = "disabled";
+ };
+
rng: rng@1020c000 {
compatible = "mediatek,mt8516-rng",
"mediatek,mt7623-rng";
/ {
aliases {
serial0 = &uart0;
+ ethernet0 = ðernet;
};
chosen {
status = "okay";
};
+ðernet {
+ pinctrl-names = "default";
+ pinctrl-0 = <ðernet_pins_default>;
+ phy-handle = <ð_phy>;
+ phy-mode = "rmii";
+ mac-address = [00 00 00 00 00 00];
+ status = "okay";
+
+ mdio {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ eth_phy: ethernet-phy@0 {
+ reg = <0>;
+ };
+ };
+};
+
&usb0 {
status = "okay";
dr_mode = "peripheral";
bias-pull-up;
};
};
+
+ ethernet_pins_default: ethernet {
+ pins_ethernet {
+ pinmux = <MT8516_PIN_0_EINT0__FUNC_EXT_TXD0>,
+ <MT8516_PIN_1_EINT1__FUNC_EXT_TXD1>,
+ <MT8516_PIN_5_EINT5__FUNC_EXT_RXER>,
+ <MT8516_PIN_6_EINT6__FUNC_EXT_RXC>,
+ <MT8516_PIN_7_EINT7__FUNC_EXT_RXDV>,
+ <MT8516_PIN_8_EINT8__FUNC_EXT_RXD0>,
+ <MT8516_PIN_9_EINT9__FUNC_EXT_RXD1>,
+ <MT8516_PIN_12_EINT12__FUNC_EXT_TXEN>,
+ <MT8516_PIN_38_MRG_DI__FUNC_EXT_MDIO>,
+ <MT8516_PIN_39_MRG_DO__FUNC_EXT_MDC>;
+ };
+ };
};
ENA_ADMIN_OS_DPDK = 3,
ENA_ADMIN_OS_FREEBSD = 4,
ENA_ADMIN_OS_IPXE = 5,
- ENA_ADMIN_OS_ESXI = 6,
- ENA_ADMIN_OS_GROUPS_NUM = 6,
+ ENA_ADMIN_OS_ESXI = 6,
+ ENA_ADMIN_OS_GROUPS_NUM = 6,
};
struct ena_admin_host_info {
u16 reserved;
- /* 1 :0 : reserved
+ /* 0 : reserved
+ * 1 : rx_offset
* 2 : interrupt_moderation
* 31:3 : reserved
*/
#define ENA_ADMIN_HOST_INFO_DEVICE_MASK GENMASK(7, 3)
#define ENA_ADMIN_HOST_INFO_BUS_SHIFT 8
#define ENA_ADMIN_HOST_INFO_BUS_MASK GENMASK(15, 8)
+#define ENA_ADMIN_HOST_INFO_RX_OFFSET_SHIFT 1
+#define ENA_ADMIN_HOST_INFO_RX_OFFSET_MASK BIT(1)
#define ENA_ADMIN_HOST_INFO_INTERRUPT_MODERATION_SHIFT 2
#define ENA_ADMIN_HOST_INFO_INTERRUPT_MODERATION_MASK BIT(2)
/* aenq_link_change_desc */
#define ENA_ADMIN_AENQ_LINK_CHANGE_DESC_LINK_STATUS_MASK BIT(0)
-#endif /*_ENA_ADMIN_H_ */
+#endif /* _ENA_ADMIN_H_ */
#define ENA_REGS_ADMIN_INTR_MASK 1
-#define ENA_POLL_MS 5
+#define ENA_MIN_ADMIN_POLL_US 100
+
+#define ENA_MAX_ADMIN_POLL_US 5000
/*****************************************************************************/
/*****************************************************************************/
static struct ena_comp_ctx *get_comp_ctxt(struct ena_com_admin_queue *queue,
u16 command_id, bool capture)
{
- if (unlikely(!queue->comp_ctx)) {
- pr_err("Completion context is NULL\n");
- return NULL;
- }
-
if (unlikely(command_id >= queue->q_depth)) {
pr_err("command id is larger than the queue size. cmd_id: %u queue size %d\n",
command_id, queue->q_depth);
return NULL;
}
+ if (unlikely(!queue->comp_ctx)) {
+ pr_err("Completion context is NULL\n");
+ return NULL;
+ }
+
if (unlikely(queue->comp_ctx[command_id].occupied && capture)) {
pr_err("Completion context is occupied\n");
return NULL;
io_sq->bounce_buf_ctrl.next_to_use = 0;
size = io_sq->bounce_buf_ctrl.buffer_size *
- io_sq->bounce_buf_ctrl.buffers_num;
+ io_sq->bounce_buf_ctrl.buffers_num;
dev_node = dev_to_node(ena_dev->dmadev);
set_dev_node(ena_dev->dmadev, ctx->numa_node);
if (unlikely(comp_status != 0))
pr_err("admin command failed[%u]\n", comp_status);
- if (unlikely(comp_status > ENA_ADMIN_UNKNOWN_ERROR))
- return -EINVAL;
-
switch (comp_status) {
case ENA_ADMIN_SUCCESS:
return 0;
return -EINVAL;
}
- return 0;
+ return -EINVAL;
+}
+
+static void ena_delay_exponential_backoff_us(u32 exp, u32 delay_us)
+{
+ delay_us = max_t(u32, ENA_MIN_ADMIN_POLL_US, delay_us);
+ delay_us = min_t(u32, delay_us * (1U << exp), ENA_MAX_ADMIN_POLL_US);
+ usleep_range(delay_us, 2 * delay_us);
}
static int ena_com_wait_and_process_admin_cq_polling(struct ena_comp_ctx *comp_ctx,
unsigned long flags = 0;
unsigned long timeout;
int ret;
+ u32 exp = 0;
timeout = jiffies + usecs_to_jiffies(admin_queue->completion_timeout);
goto err;
}
- msleep(ENA_POLL_MS);
+ ena_delay_exponential_backoff_us(exp++,
+ admin_queue->ena_dev->ena_min_poll_delay_us);
}
if (unlikely(comp_ctx->status == ENA_CMD_ABORTED)) {
/* The desc list entry size should be whole multiply of 8
* This requirement comes from __iowrite64_copy()
*/
- pr_err("illegal entry size %d\n",
- llq_info->desc_list_entry_size);
+ pr_err("illegal entry size %d\n", llq_info->desc_list_entry_size);
return -EINVAL;
}
if (admin_queue->auto_polling)
admin_queue->polling = true;
} else {
- pr_err("The ena device doesn't send a completion for the admin cmd %d status %d\n",
+ pr_err("The ena device didn't send a completion for the admin cmd %d status %d\n",
comp_ctx->cmd_opcode, comp_ctx->status);
}
/* Check if shifted to polling mode.
static int wait_for_reset_state(struct ena_com_dev *ena_dev, u32 timeout,
u16 exp_state)
{
- u32 val, i;
+ u32 val, exp = 0;
+ unsigned long timeout_stamp;
- /* Convert timeout from resolution of 100ms to ENA_POLL_MS */
- timeout = (timeout * 100) / ENA_POLL_MS;
+ /* Convert timeout from resolution of 100ms to us resolution. */
+ timeout_stamp = jiffies + usecs_to_jiffies(100 * 1000 * timeout);
- for (i = 0; i < timeout; i++) {
+ while (1) {
val = ena_com_reg_bar_read32(ena_dev, ENA_REGS_DEV_STS_OFF);
if (unlikely(val == ENA_MMIO_READ_TIMEOUT)) {
exp_state)
return 0;
- msleep(ENA_POLL_MS);
- }
+ if (time_is_before_jiffies(timeout_stamp))
+ return -ETIME;
- return -ETIME;
+ ena_delay_exponential_backoff_us(exp++, ena_dev->ena_min_poll_delay_us);
+ }
}
static bool ena_com_check_supported_feature_id(struct ena_com_dev *ena_dev,
static void ena_com_update_intr_delay_resolution(struct ena_com_dev *ena_dev,
u16 intr_delay_resolution)
{
- /* Initial value of intr_delay_resolution might be 0 */
- u16 prev_intr_delay_resolution =
- ena_dev->intr_delay_resolution ?
- ena_dev->intr_delay_resolution :
- ENA_DEFAULT_INTR_DELAY_RESOLUTION;
+ u16 prev_intr_delay_resolution = ena_dev->intr_delay_resolution;
- if (!intr_delay_resolution) {
+ if (unlikely(!intr_delay_resolution)) {
pr_err("Illegal intr_delay_resolution provided. Going to use default 1 usec resolution\n");
intr_delay_resolution = ENA_DEFAULT_INTR_DELAY_RESOLUTION;
}
{
struct ena_com_admin_queue *admin_queue = &ena_dev->admin_queue;
unsigned long flags = 0;
+ u32 exp = 0;
spin_lock_irqsave(&admin_queue->q_lock, flags);
while (atomic_read(&admin_queue->outstanding_cmds) != 0) {
spin_unlock_irqrestore(&admin_queue->q_lock, flags);
- msleep(ENA_POLL_MS);
+ ena_delay_exponential_backoff_us(exp++,
+ ena_dev->ena_min_poll_delay_us);
spin_lock_irqsave(&admin_queue->q_lock, flags);
}
spin_unlock_irqrestore(&admin_queue->q_lock, flags);
if (ret)
goto error;
+ admin_queue->ena_dev = ena_dev;
admin_queue->running_state = true;
return 0;
struct ena_admin_aenq_entry *aenq_e;
struct ena_admin_aenq_common_desc *aenq_common;
struct ena_com_aenq *aenq = &dev->aenq;
- unsigned long long timestamp;
+ u64 timestamp;
ena_aenq_handler handler_cb;
u16 masked_head, processed = 0;
u8 phase;
*/
dma_rmb();
- timestamp =
- (unsigned long long)aenq_common->timestamp_low |
- ((unsigned long long)aenq_common->timestamp_high << 32);
+ timestamp = (u64)aenq_common->timestamp_low |
+ ((u64)aenq_common->timestamp_high << 32);
pr_debug("AENQ! Group[%x] Syndrom[%x] timestamp: [%llus]\n",
aenq_common->group, aenq_common->syndrom, timestamp);
/* write the aenq doorbell after all AENQ descriptors were read */
mb();
- writel_relaxed((u32)aenq->head,
- dev->reg_bar + ENA_REGS_AENQ_HEAD_DB_OFF);
+ writel_relaxed((u32)aenq->head, dev->reg_bar + ENA_REGS_AENQ_HEAD_DB_OFF);
}
int ena_com_dev_reset(struct ena_com_dev *ena_dev,
enum ena_admin_hash_functions func,
const u8 *key, u16 key_len, u32 init_val)
{
- struct ena_rss *rss = &ena_dev->rss;
+ struct ena_admin_feature_rss_flow_hash_control *hash_key;
struct ena_admin_get_feat_resp get_resp;
- struct ena_admin_feature_rss_flow_hash_control *hash_key =
- rss->hash_key;
enum ena_admin_hash_functions old_func;
+ struct ena_rss *rss = &ena_dev->rss;
int rc;
+ hash_key = rss->hash_key;
+
/* Make sure size is a mult of DWs */
if (unlikely(key_len & 0x3))
return -EINVAL;
if (unlikely(rc))
return rc;
- if (!((1 << func) & get_resp.u.flow_hash_func.supported_func)) {
+ if (!(BIT(func) & get_resp.u.flow_hash_func.supported_func)) {
pr_err("Flow hash function %d isn't supported\n", func);
return -EOPNOTSUPP;
}
#define ENA_INTR_INITIAL_RX_INTERVAL_USECS 0
#define ENA_DEFAULT_INTR_DELAY_RESOLUTION 1
+#define ENA_HASH_KEY_SIZE 40
+
#define ENA_HW_HINTS_NO_TIMEOUT 0xFFFF
#define ENA_FEATURE_MAX_QUEUE_EXT_VER 1
struct ena_com_admin_queue {
void *q_dmadev;
+ struct ena_com_dev *ena_dev;
spinlock_t q_lock; /* spinlock for the admin queue */
struct ena_comp_ctx *comp_ctx;
struct ena_intr_moder_entry *intr_moder_tbl;
struct ena_com_llq_info llq_info;
+
+ u32 ena_min_poll_delay_us;
};
struct ena_com_dev_get_features_ctx {
*/
int ena_com_mmio_reg_read_request_init(struct ena_com_dev *ena_dev);
-/* ena_com_set_mmio_read_mode - Enable/disable the mmio reg read mechanism
+/* ena_com_set_mmio_read_mode - Enable/disable the indirect mmio reg read mechanism
* @ena_dev: ENA communication layer struct
* @readless_supported: readless mode (enable/disable)
*/
/* ena_com_admin_q_comp_intr_handler - admin queue interrupt handler
* @ena_dev: ENA communication layer struct
*
- * This method go over the admin completion queue and wake up all the pending
+ * This method goes over the admin completion queue and wakes up all the pending
* threads that wait on the commands wait event.
*
* @note: Should be called after MSI-X interrupt.
/* ena_com_aenq_intr_handler - AENQ interrupt handler
* @ena_dev: ENA communication layer struct
*
- * This method go over the async event notification queue and call the proper
+ * This method goes over the async event notification queue and calls the proper
* aenq handler.
*/
void ena_com_aenq_intr_handler(struct ena_com_dev *dev, void *data);
/* ena_com_wait_for_abort_completion - Wait for admin commands abort.
* @ena_dev: ENA communication layer struct
*
- * This method wait until all the outstanding admin commands will be completed.
+ * This method waits until all the outstanding admin commands are completed.
*/
void ena_com_wait_for_abort_completion(struct ena_com_dev *ena_dev);
/* ena_com_validate_version - Validate the device parameters
* @ena_dev: ENA communication layer struct
*
- * This method validate the device parameters are the same as the saved
+ * This method verifies the device parameters are the same as the saved
* parameters in ena_dev.
* This method is useful after device reset, to validate the device mac address
* and the device offloads are the same as before the reset.
*
* Retrieve the hash function from the device.
*
- * @note: If the caller called ena_com_fill_hash_function but didn't flash
+ * @note: If the caller called ena_com_fill_hash_function but didn't flush
* it to the device, the new configuration will be lost.
*
* @return: 0 on Success and negative value otherwise.
*
* Retrieve the hash key.
*
- * @note: If the caller called ena_com_fill_hash_key but didn't flash
+ * @note: If the caller called ena_com_fill_hash_key but didn't flush
* it to the device, the new configuration will be lost.
*
* @return: 0 on Success and negative value otherwise.
*
* Retrieve the hash control from the device.
*
- * @note, If the caller called ena_com_fill_hash_ctrl but didn't flash
+ * @note: If the caller called ena_com_fill_hash_ctrl but didn't flush
* it to the device, the new configuration will be lost.
*
* @return: 0 on Success and negative value otherwise.
*
* Retrieve the RSS indirection table from the device.
*
- * @note: If the caller called ena_com_indirect_table_fill_entry but didn't flash
+ * @note: If the caller called ena_com_indirect_table_fill_entry but didn't flush
* it to the device, the new configuration will be lost.
*
* @return: 0 on Success and negative value otherwise.
/* ena_com_delete_debug_area - Free the debug area resources.
* @ena_dev: ENA communication layer struct
*
- * Free the allocate debug area.
+ * Free the allocated debug area.
*/
void ena_com_delete_debug_area(struct ena_com_dev *ena_dev);
/* ena_com_delete_host_info - Free the host info resources.
* @ena_dev: ENA communication layer struct
*
- * Free the allocate host info.
+ * Free the allocated host info.
*/
void ena_com_delete_host_info(struct ena_com_dev *ena_dev);
* @cmd_completion: command completion return value.
* @cmd_comp_size: command completion size.
- * Submit an admin command and then wait until the device will return a
+ * Submit an admin command and then wait until the device returns a
* completion.
- * The completion will be copyed into cmd_comp.
+ * The completion will be copied into cmd_comp.
*
* @return - 0 on success, negative value on failure.
*/
/* ena_com_config_dev_mode - Configure the placement policy of the device.
* @ena_dev: ENA communication layer struct
* @llq_features: LLQ feature descriptor, retrieve via
- * ena_com_get_dev_attr_feat.
+ * ena_com_get_dev_attr_feat.
* @ena_llq_config: The default driver LLQ parameters configurations
*/
int ena_com_config_dev_mode(struct ena_com_dev *ena_dev,
* @intr_reg: interrupt register to update.
* @rx_delay_interval: Rx interval in usecs
* @tx_delay_interval: Tx interval in usecs
- * @unmask: unask enable/disable
+ * @unmask: unmask enable/disable
*
* Prepare interrupt update register with the supplied parameters.
*/
u16 reserved16;
};
-#endif /*_ENA_COMMON_H_ */
+#endif /* _ENA_COMMON_H_ */
struct ena_eth_io_rx_cdesc_base *cdesc = NULL;
u16 cdesc_idx = 0;
u16 nb_hw_desc;
- u16 i;
+ u16 i = 0;
WARN(io_cq->direction != ENA_COM_IO_QUEUE_DIRECTION_RX, "wrong Q type");
return -ENOSPC;
}
- for (i = 0; i < nb_hw_desc; i++) {
+ cdesc = ena_com_rx_cdesc_idx_to_ptr(io_cq, cdesc_idx);
+ ena_rx_ctx->pkt_offset = cdesc->offset;
+
+ do {
+ ena_buf[i].len = cdesc->length;
+ ena_buf[i].req_id = cdesc->req_id;
+
+ if (++i >= nb_hw_desc)
+ break;
+
cdesc = ena_com_rx_cdesc_idx_to_ptr(io_cq, cdesc_idx + i);
- ena_buf->len = cdesc->length;
- ena_buf->req_id = cdesc->req_id;
- ena_buf++;
- }
+ } while (1);
/* Update SQ head ptr */
io_sq->next_to_comp += nb_hw_desc;
desc->length = ena_buf->len;
- desc->ctrl = ENA_ETH_IO_RX_DESC_FIRST_MASK;
- desc->ctrl |= ENA_ETH_IO_RX_DESC_LAST_MASK;
- desc->ctrl |= io_sq->phase & ENA_ETH_IO_RX_DESC_PHASE_MASK;
- desc->ctrl |= ENA_ETH_IO_RX_DESC_COMP_REQ_MASK;
+ desc->ctrl = ENA_ETH_IO_RX_DESC_FIRST_MASK |
+ ENA_ETH_IO_RX_DESC_LAST_MASK |
+ (io_sq->phase & ENA_ETH_IO_RX_DESC_PHASE_MASK) |
+ ENA_ETH_IO_RX_DESC_COMP_REQ_MASK;
desc->req_id = req_id;
u32 hash;
u16 descs;
int max_bufs;
+ u8 pkt_offset;
};
int ena_com_prepare_tx(struct ena_com_io_sq *io_sq,
writel(intr_reg->intr_control, io_cq->unmask_reg);
}
-static inline int ena_com_free_desc(struct ena_com_io_sq *io_sq)
+static inline int ena_com_free_q_entries(struct ena_com_io_sq *io_sq)
{
u16 tail, next_to_comp, cnt;
int temp;
if (io_sq->mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_HOST)
- return ena_com_free_desc(io_sq) >= required_buffers;
+ return ena_com_free_q_entries(io_sq) >= required_buffers;
/* This calculation doesn't need to be 100% accurate. So to reduce
* the calculation overhead just Subtract 2 lines from the free descs
*/
temp = required_buffers / io_sq->llq_info.descs_per_entry + 2;
- return ena_com_free_desc(io_sq) > temp;
+ return ena_com_free_q_entries(io_sq) > temp;
}
static inline bool ena_com_meta_desc_changed(struct ena_com_io_sq *io_sq,
u16 sub_qid;
- u16 reserved;
+ u8 offset;
+
+ u8 reserved;
};
/* 8-word format */
#define ENA_ETH_IO_NUMA_NODE_CFG_REG_ENABLED_SHIFT 31
#define ENA_ETH_IO_NUMA_NODE_CFG_REG_ENABLED_MASK BIT(31)
-#endif /*_ENA_ETH_IO_H_ */
+#endif /* _ENA_ETH_IO_H_ */
if (sset != ETH_SS_STATS)
return -EOPNOTSUPP;
- return adapter->num_io_queues * (ENA_STATS_ARRAY_TX + ENA_STATS_ARRAY_RX)
+ return adapter->num_io_queues * (ENA_STATS_ARRAY_TX + ENA_STATS_ARRAY_RX)
+ ENA_STATS_ARRAY_GLOBAL + ENA_STATS_ARRAY_ENA_COM;
}
for (i = 0; i < ENA_STATS_ARRAY_GLOBAL; i++) {
ena_stats = &ena_stats_global_strings[i];
-
memcpy(data, ena_stats->name, ETH_GSTRING_LEN);
data += ETH_GSTRING_LEN;
}
struct ena_adapter *adapter = netdev_priv(net_dev);
struct ena_com_dev *ena_dev = adapter->ena_dev;
- if (!ena_com_interrupt_moderation_supported(ena_dev)) {
- /* the devie doesn't support interrupt moderation */
+ if (!ena_com_interrupt_moderation_supported(ena_dev))
return -EOPNOTSUPP;
- }
coalesce->tx_coalesce_usecs =
ena_com_get_nonadaptive_moderation_interval_tx(ena_dev) *
return 0;
}
-static void ena_update_tx_rings_intr_moderation(struct ena_adapter *adapter)
+static void ena_update_tx_rings_nonadaptive_intr_moderation(struct ena_adapter *adapter)
{
unsigned int val;
int i;
adapter->tx_ring[i].smoothed_interval = val;
}
-static void ena_update_rx_rings_intr_moderation(struct ena_adapter *adapter)
+static void ena_update_rx_rings_nonadaptive_intr_moderation(struct ena_adapter *adapter)
{
unsigned int val;
int i;
struct ena_com_dev *ena_dev = adapter->ena_dev;
int rc;
- if (!ena_com_interrupt_moderation_supported(ena_dev)) {
- /* the devie doesn't support interrupt moderation */
+ if (!ena_com_interrupt_moderation_supported(ena_dev))
return -EOPNOTSUPP;
- }
rc = ena_com_update_nonadaptive_moderation_interval_tx(ena_dev,
coalesce->tx_coalesce_usecs);
if (rc)
return rc;
- ena_update_tx_rings_intr_moderation(adapter);
+ ena_update_tx_rings_nonadaptive_intr_moderation(adapter);
rc = ena_com_update_nonadaptive_moderation_interval_rx(ena_dev,
coalesce->rx_coalesce_usecs);
if (rc)
return rc;
- ena_update_rx_rings_intr_moderation(adapter);
+ ena_update_rx_rings_nonadaptive_intr_moderation(adapter);
if (coalesce->use_adaptive_rx_coalesce &&
!ena_com_get_adaptive_moderation_enabled(ena_dev))
skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, rx_info->page,
rx_info->page_offset, len, ENA_PAGE_SIZE);
+ /* The offset is non zero only for the first buffer */
+ rx_info->page_offset = 0;
netif_dbg(rx_ring->adapter, rx_status, rx_ring->netdev,
"rx skb updated. len %d. data_len %d\n",
{
u16 next_to_clean = rx_ring->next_to_clean;
struct ena_com_rx_ctx ena_rx_ctx;
+ struct ena_rx_buffer *rx_info;
struct ena_adapter *adapter;
u32 res_budget, work_done;
int rx_copybreak_pkt = 0;
ena_rx_ctx.ena_bufs = rx_ring->ena_bufs;
ena_rx_ctx.max_bufs = rx_ring->sgl_size;
ena_rx_ctx.descs = 0;
+ ena_rx_ctx.pkt_offset = 0;
rc = ena_com_rx_pkt(rx_ring->ena_com_io_cq,
rx_ring->ena_com_io_sq,
&ena_rx_ctx);
if (unlikely(ena_rx_ctx.descs == 0))
break;
+ rx_info = &rx_ring->rx_buffer_info[rx_ring->ena_bufs[0].req_id];
+ rx_info->page_offset = ena_rx_ctx.pkt_offset;
+
netif_dbg(rx_ring->adapter, rx_status, rx_ring->netdev,
"rx_poll: q %d got packet from ena. descs #: %d l3 proto %d l4 proto %d hash: %x\n",
rx_ring->qid, ena_rx_ctx.descs, ena_rx_ctx.l3_proto,
rx_ring->next_to_clean = next_to_clean;
- refill_required = ena_com_free_desc(rx_ring->ena_com_io_sq);
+ refill_required = ena_com_free_q_entries(rx_ring->ena_com_io_sq);
refill_threshold =
min_t(int, rx_ring->ring_size / ENA_RX_REFILL_THRESH_DIVIDER,
ENA_RX_REFILL_THRESH_PACKET);
rc = ena_rss_init_default(adapter);
if (rc && (rc != -EOPNOTSUPP)) {
netif_err(adapter, ifup, adapter->netdev,
- "Failed to init RSS rc: %d\n", rc);
+ "Failed to init RSS rc: %d\n", rc);
return rc;
}
}
if (rc) {
netif_err(adapter, ifup, adapter->netdev,
"Failed to create I/O TX queue num %d rc: %d\n",
- qid, rc);
+ qid, rc);
return rc;
}
* ones due to past queue allocation failures.
*/
set_io_rings_size(adapter, adapter->requested_tx_ring_size,
- adapter->requested_rx_ring_size);
+ adapter->requested_rx_ring_size);
while (1) {
if (ena_xdp_present(adapter)) {
if (rc != -ENOMEM) {
netif_err(adapter, ifup, adapter->netdev,
"Queue creation failed with error code %d\n",
- rc);
+ rc);
return rc;
}
new_rx_ring_size = cur_rx_ring_size / 2;
if (new_tx_ring_size < ENA_MIN_RING_SIZE ||
- new_rx_ring_size < ENA_MIN_RING_SIZE) {
+ new_rx_ring_size < ENA_MIN_RING_SIZE) {
netif_err(adapter, ifup, adapter->netdev,
"Queue creation failed with the smallest possible queue size of %d for both queues. Not retrying with smaller queues\n",
ENA_MIN_RING_SIZE);
return qid;
}
-static void ena_config_host_info(struct ena_com_dev *ena_dev,
- struct pci_dev *pdev)
+static void ena_config_host_info(struct ena_com_dev *ena_dev, struct pci_dev *pdev)
{
struct ena_admin_host_info *host_info;
int rc;
host_info->num_cpus = num_online_cpus();
host_info->driver_supported_features =
+ ENA_ADMIN_HOST_INFO_RX_OFFSET_MASK |
ENA_ADMIN_HOST_INFO_INTERRUPT_MODERATION_MASK;
rc = ena_com_set_host_attributes(ena_dev);
for (i = 0; i < adapter->num_io_queues; i++) {
rx_ring = &adapter->rx_ring[i];
- refill_required =
- ena_com_free_desc(rx_ring->ena_com_io_sq);
+ refill_required = ena_com_free_q_entries(rx_ring->ena_com_io_sq);
if (unlikely(refill_required == (rx_ring->ring_size - 1))) {
rx_ring->empty_rx_queue++;
mod_timer(&adapter->timer_service, round_jiffies(jiffies + HZ));
}
-static int ena_calc_max_io_queue_num(struct pci_dev *pdev,
+static u32 ena_calc_max_io_queue_num(struct pci_dev *pdev,
struct ena_com_dev *ena_dev,
struct ena_com_dev_get_features_ctx *get_feat_ctx)
{
- int io_tx_sq_num, io_tx_cq_num, io_rx_num, max_num_io_queues;
+ u32 io_tx_sq_num, io_tx_cq_num, io_rx_num, max_num_io_queues;
if (ena_dev->supported_features & BIT(ENA_ADMIN_MAX_QUEUES_EXT)) {
struct ena_admin_queue_ext_feature_fields *max_queue_ext =
*/
static int ena_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
{
- struct ena_com_dev_get_features_ctx get_feat_ctx;
struct ena_calc_queue_size_ctx calc_queue_ctx = { 0 };
+ struct ena_com_dev_get_features_ctx get_feat_ctx;
struct ena_llq_configurations llq_config;
struct ena_com_dev *ena_dev = NULL;
struct ena_adapter *adapter;
goto err_free_region;
}
+ ena_dev->ena_min_poll_delay_us = ENA_ADMIN_POLL_DELAY_US;
+
ena_dev->dmadev = &pdev->dev;
rc = ena_device_init(ena_dev, pdev, &get_feat_ctx, &wd_state);
calc_queue_ctx.get_feat_ctx = &get_feat_ctx;
calc_queue_ctx.pdev = pdev;
- /* Initial Tx and RX interrupt delay. Assumes 1 usec granularity.
+ /* Initial TX and RX interrupt delay. Assumes 1 usec granularity.
* Updated during device initialization with the real granularity
*/
ena_dev->intr_moder_tx_interval = ENA_INTR_INITIAL_TX_INTERVAL_USECS;
adapter->num_io_queues = max_num_io_queues;
adapter->max_num_io_queues = max_num_io_queues;
+ adapter->last_monitored_tx_qid = 0;
adapter->xdp_first_ring = 0;
adapter->xdp_num_queues = 0;
- adapter->last_monitored_tx_qid = 0;
-
adapter->rx_copybreak = ENA_DEFAULT_RX_COPYBREAK;
adapter->wd_state = wd_state;
#define DRV_MODULE_GEN_SUBMINOR 0
#define DRV_MODULE_NAME "ena"
-#ifndef DRV_MODULE_GENERATION
-#define DRV_MODULE_GENERATION \
- __stringify(DRV_MODULE_GEN_MAJOR) "." \
- __stringify(DRV_MODULE_GEN_MINOR) "." \
- __stringify(DRV_MODULE_GEN_SUBMINOR) "K"
-#endif
#define DEVICE_NAME "Elastic Network Adapter (ENA)"
#define ENA_RX_RSS_TABLE_LOG_SIZE 7
#define ENA_RX_RSS_TABLE_SIZE (1 << ENA_RX_RSS_TABLE_LOG_SIZE)
-#define ENA_HASH_KEY_SIZE 40
-
/* The number of tx packet completions that will be handled each NAPI poll
* cycle is ring_size / ENA_TX_POLL_BUDGET_DIVIDER.
*/
#define ENA_IO_IRQ_FIRST_IDX 1
#define ENA_IO_IRQ_IDX(q) (ENA_IO_IRQ_FIRST_IDX + (q))
+#define ENA_ADMIN_POLL_DELAY_US 100
+
/* ENA device should send keep alive msg every 1 sec.
* We wait for 6 sec just to be on the safe side.
*/
#define ENA_REGS_RSS_IND_ENTRY_UPDATE_CQ_IDX_SHIFT 16
#define ENA_REGS_RSS_IND_ENTRY_UPDATE_CQ_IDX_MASK 0xffff0000
-#endif /*_ENA_REGS_H_ */
+#endif /* _ENA_REGS_H_ */
"InDroppedDma",
};
-static const char aq_ethtool_queue_stat_names[][ETH_GSTRING_LEN] = {
- "Queue[%d] InPackets",
- "Queue[%d] OutPackets",
- "Queue[%d] Restarts",
- "Queue[%d] InJumboPackets",
- "Queue[%d] InLroPackets",
- "Queue[%d] InErrors",
+static const char * const aq_ethtool_queue_stat_names[] = {
+ "%sQueue[%d] InPackets",
+ "%sQueue[%d] OutPackets",
+ "%sQueue[%d] Restarts",
+ "%sQueue[%d] InJumboPackets",
+ "%sQueue[%d] InLroPackets",
+ "%sQueue[%d] InErrors",
};
#if IS_ENABLED(CONFIG_MACSEC)
struct aq_nic_s *nic = netdev_priv(ndev);
struct aq_nic_cfg_s *cfg = aq_nic_get_cfg(nic);
u32 n_stats = ARRAY_SIZE(aq_ethtool_stat_names) +
- ARRAY_SIZE(aq_ethtool_queue_stat_names) * cfg->vecs;
+ ARRAY_SIZE(aq_ethtool_queue_stat_names) * cfg->vecs *
+ cfg->tcs;
#if IS_ENABLED(CONFIG_MACSEC)
if (nic->macsec_cfg) {
static void aq_ethtool_get_strings(struct net_device *ndev,
u32 stringset, u8 *data)
{
- struct aq_nic_s *aq_nic = netdev_priv(ndev);
+ struct aq_nic_s *nic = netdev_priv(ndev);
struct aq_nic_cfg_s *cfg;
u8 *p = data;
int i, si;
int sa;
#endif
- cfg = aq_nic_get_cfg(aq_nic);
+ cfg = aq_nic_get_cfg(nic);
switch (stringset) {
- case ETH_SS_STATS:
+ case ETH_SS_STATS: {
+ const int stat_cnt = ARRAY_SIZE(aq_ethtool_queue_stat_names);
+ char tc_string[8];
+ int tc;
+
+ memset(tc_string, 0, sizeof(tc_string));
memcpy(p, aq_ethtool_stat_names,
sizeof(aq_ethtool_stat_names));
p = p + sizeof(aq_ethtool_stat_names);
- for (i = 0; i < cfg->vecs; i++) {
- for (si = 0;
- si < ARRAY_SIZE(aq_ethtool_queue_stat_names);
- si++) {
- snprintf(p, ETH_GSTRING_LEN,
- aq_ethtool_queue_stat_names[si], i);
- p += ETH_GSTRING_LEN;
+
+ for (tc = 0; tc < cfg->tcs; tc++) {
+ if (cfg->is_qos)
+ snprintf(tc_string, 8, "TC%d ", tc);
+
+ for (i = 0; i < cfg->vecs; i++) {
+ for (si = 0; si < stat_cnt; si++) {
+ snprintf(p, ETH_GSTRING_LEN,
+ aq_ethtool_queue_stat_names[si],
+ tc_string,
+ AQ_NIC_CFG_TCVEC2RING(cfg, tc, i));
+ p += ETH_GSTRING_LEN;
+ }
}
}
#if IS_ENABLED(CONFIG_MACSEC)
- if (!aq_nic->macsec_cfg)
+ if (!nic->macsec_cfg)
break;
memcpy(p, aq_macsec_stat_names, sizeof(aq_macsec_stat_names));
for (i = 0; i < AQ_MACSEC_MAX_SC; i++) {
struct aq_macsec_txsc *aq_txsc;
- if (!(test_bit(i, &aq_nic->macsec_cfg->txsc_idx_busy)))
+ if (!(test_bit(i, &nic->macsec_cfg->txsc_idx_busy)))
continue;
for (si = 0;
aq_macsec_txsc_stat_names[si], i);
p += ETH_GSTRING_LEN;
}
- aq_txsc = &aq_nic->macsec_cfg->aq_txsc[i];
+ aq_txsc = &nic->macsec_cfg->aq_txsc[i];
for (sa = 0; sa < MACSEC_NUM_AN; sa++) {
if (!(test_bit(sa, &aq_txsc->tx_sa_idx_busy)))
continue;
for (i = 0; i < AQ_MACSEC_MAX_SC; i++) {
struct aq_macsec_rxsc *aq_rxsc;
- if (!(test_bit(i, &aq_nic->macsec_cfg->rxsc_idx_busy)))
+ if (!(test_bit(i, &nic->macsec_cfg->rxsc_idx_busy)))
continue;
- aq_rxsc = &aq_nic->macsec_cfg->aq_rxsc[i];
+ aq_rxsc = &nic->macsec_cfg->aq_rxsc[i];
for (sa = 0; sa < MACSEC_NUM_AN; sa++) {
if (!(test_bit(sa, &aq_rxsc->rx_sa_idx_busy)))
continue;
}
#endif
break;
+ }
case ETH_SS_PRIV_FLAGS:
memcpy(p, aq_ethtool_priv_flag_names,
sizeof(aq_ethtool_priv_flag_names));
dev_close(ndev);
}
- aq_nic_free_vectors(aq_nic);
-
cfg->rxds = max(ring->rx_pending, hw_caps->rxds_min);
cfg->rxds = min(cfg->rxds, hw_caps->rxds_max);
cfg->rxds = ALIGN(cfg->rxds, AQ_HW_RXD_MULTIPLE);
cfg->txds = min(cfg->txds, hw_caps->txds_max);
cfg->txds = ALIGN(cfg->txds, AQ_HW_TXD_MULTIPLE);
- for (aq_nic->aq_vecs = 0; aq_nic->aq_vecs < cfg->vecs;
- aq_nic->aq_vecs++) {
- aq_nic->aq_vec[aq_nic->aq_vecs] =
- aq_vec_alloc(aq_nic, aq_nic->aq_vecs, cfg);
- if (unlikely(!aq_nic->aq_vec[aq_nic->aq_vecs])) {
- err = -ENOMEM;
- goto err_exit;
- }
- }
+ err = aq_nic_realloc_vectors(aq_nic);
+ if (err)
+ goto err_exit;
+
if (ndev_running)
err = dev_open(ndev, NULL);
struct aq_hw_rx_fltrs_s *rx_fltrs,
struct ethtool_rx_flow_spec *fsp)
{
+ struct aq_nic_cfg_s *cfg = &aq_nic->aq_nic_cfg;
+
if (fsp->location < AQ_RX_FIRST_LOC_FVLANID ||
fsp->location > AQ_RX_LAST_LOC_FVLANID) {
netdev_err(aq_nic->ndev,
return -EINVAL;
}
- if (fsp->ring_cookie > aq_nic->aq_nic_cfg.num_rss_queues) {
+ if (fsp->ring_cookie > cfg->num_rss_queues * cfg->tcs) {
netdev_err(aq_nic->ndev,
"ethtool: queue number must be in range [0, %d]",
- aq_nic->aq_nic_cfg.num_rss_queues - 1);
+ cfg->num_rss_queues * cfg->tcs - 1);
return -EINVAL;
}
return 0;
aq_rule_is_not_correct(struct aq_nic_s *aq_nic,
struct ethtool_rx_flow_spec *fsp)
{
+ struct aq_nic_cfg_s *cfg = &aq_nic->aq_nic_cfg;
bool rule_is_not_correct = false;
if (!aq_nic) {
} else if (aq_check_filter(aq_nic, fsp)) {
rule_is_not_correct = true;
} else if (fsp->ring_cookie != RX_CLS_FLOW_DISC) {
- if (fsp->ring_cookie >= aq_nic->aq_nic_cfg.num_rss_queues) {
+ if (fsp->ring_cookie >= cfg->num_rss_queues * cfg->tcs) {
netdev_err(aq_nic->ndev,
"ethtool: The specified action is invalid.\n"
"Maximum allowable value action is %u.\n",
- aq_nic->aq_nic_cfg.num_rss_queues - 1);
+ cfg->num_rss_queues * cfg->tcs - 1);
rule_is_not_correct = true;
}
}
#define AQ_HW_MAC_COUNTER_HZ 312500000ll
#define AQ_HW_PHY_COUNTER_HZ 160000000ll
+enum aq_tc_mode {
+ AQ_TC_MODE_INVALID = -1,
+ AQ_TC_MODE_8TCS,
+ AQ_TC_MODE_4TCS,
+};
+
#define AQ_RX_FIRST_LOC_FVLANID 0U
#define AQ_RX_LAST_LOC_FVLANID 15U
#define AQ_RX_FIRST_LOC_FETHERT 16U
(AQ_RX_LAST_LOC_FVLANID - AQ_RX_FIRST_LOC_FVLANID + 1U)
#define AQ_RX_QUEUE_NOT_ASSIGNED 0xFFU
+/* Used for rate to Mbps conversion */
+#define AQ_MBPS_DIVISOR 125000 /* 1000000 / 8 */
+
/* NIC H/W capabilities */
struct aq_hw_caps_s {
u64 hw_features;
u32 mac_regs_count;
u32 hw_alive_check_addr;
u8 msix_irqs;
- u8 tcs;
+ u8 tcs_max;
u8 rxd_alignment;
u8 rxd_size;
u8 txd_alignment;
#define AQ_HW_TXD_MULTIPLE 8U
#define AQ_HW_RXD_MULTIPLE 8U
+#define AQ_HW_QUEUES_MAX 32U
#define AQ_HW_MULTICAST_ADDRESS_MAX 32U
+#define AQ_HW_PTP_TC 2U
+
#define AQ_HW_LED_BLINK 0x2U
#define AQ_HW_LED_DEFAULT 0x0U
int (*hw_rss_hash_set)(struct aq_hw_s *self,
struct aq_rss_parameters *rss_params);
+ int (*hw_tc_rate_limit_set)(struct aq_hw_s *self);
+
int (*hw_get_regs)(struct aq_hw_s *self,
const struct aq_hw_caps_s *aq_hw_caps,
u32 *regs_buff);
int (*hw_set_offload)(struct aq_hw_s *self,
struct aq_nic_cfg_s *aq_nic_cfg);
- int (*hw_tx_tc_mode_get)(struct aq_hw_s *self, u32 *tc_mode);
-
- int (*hw_rx_tc_mode_get)(struct aq_hw_s *self, u32 *tc_mode);
-
int (*hw_ring_hwts_rx_fill)(struct aq_hw_s *self,
struct aq_ring_s *aq_ring);
err_exit:
return err;
}
+
+int aq_hw_num_tcs(struct aq_hw_s *hw)
+{
+ switch (hw->aq_nic_cfg->tc_mode) {
+ case AQ_TC_MODE_8TCS:
+ return 8;
+ case AQ_TC_MODE_4TCS:
+ return 4;
+ default:
+ break;
+ }
+
+ return 1;
+}
+
+int aq_hw_q_per_tc(struct aq_hw_s *hw)
+{
+ switch (hw->aq_nic_cfg->tc_mode) {
+ case AQ_TC_MODE_8TCS:
+ return 4;
+ case AQ_TC_MODE_4TCS:
+ return 8;
+ default:
+ return 4;
+ }
+}
void aq_hw_write_reg(struct aq_hw_s *hw, u32 reg, u32 value);
u64 aq_hw_read_reg64(struct aq_hw_s *hw, u32 reg);
int aq_hw_err_from_flags(struct aq_hw_s *hw);
+int aq_hw_num_tcs(struct aq_hw_s *hw);
+int aq_hw_q_per_tc(struct aq_hw_s *hw);
#endif /* AQ_HW_UTILS_H */
set_bit(txsc_idx, &cfg->txsc_idx_busy);
- return 0;
+ return ret;
}
static int aq_mdo_upd_secy(struct macsec_context *ctx)
#include "aq_ethtool.h"
#include "aq_ptp.h"
#include "aq_filters.h"
+#include "aq_hw_utils.h"
#include <linux/netdevice.h>
#include <linux/module.h>
#include <linux/ip.h>
#include <linux/udp.h>
+#include <net/pkt_cls.h>
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(AQ_CFG_DRV_AUTHOR);
struct net_device *ndev = NULL;
struct aq_nic_s *aq_nic = NULL;
- ndev = alloc_etherdev_mq(sizeof(struct aq_nic_s), AQ_CFG_VECS_MAX);
+ ndev = alloc_etherdev_mq(sizeof(struct aq_nic_s), AQ_HW_QUEUES_MAX);
if (!ndev)
return NULL;
return 0;
}
+static int aq_validate_mqprio_opt(struct aq_nic_s *self,
+ struct tc_mqprio_qopt_offload *mqprio,
+ const unsigned int num_tc)
+{
+ const bool has_min_rate = !!(mqprio->flags & TC_MQPRIO_F_MIN_RATE);
+ struct aq_nic_cfg_s *aq_nic_cfg = aq_nic_get_cfg(self);
+ const unsigned int tcs_max = min_t(u8, aq_nic_cfg->aq_hw_caps->tcs_max,
+ AQ_CFG_TCS_MAX);
+
+ if (num_tc > tcs_max) {
+ netdev_err(self->ndev, "Too many TCs requested\n");
+ return -EOPNOTSUPP;
+ }
+
+ if (num_tc != 0 && !is_power_of_2(num_tc)) {
+ netdev_err(self->ndev, "TC count should be power of 2\n");
+ return -EOPNOTSUPP;
+ }
+
+ if (has_min_rate && !ATL_HW_IS_CHIP_FEATURE(self->aq_hw, ANTIGUA)) {
+ netdev_err(self->ndev, "Min tx rate is not supported\n");
+ return -EOPNOTSUPP;
+ }
+
+ return 0;
+}
+
+static int aq_ndo_setup_tc(struct net_device *dev, enum tc_setup_type type,
+ void *type_data)
+{
+ struct tc_mqprio_qopt_offload *mqprio = type_data;
+ struct aq_nic_s *aq_nic = netdev_priv(dev);
+ bool has_min_rate;
+ bool has_max_rate;
+ int err;
+ int i;
+
+ if (type != TC_SETUP_QDISC_MQPRIO)
+ return -EOPNOTSUPP;
+
+ has_min_rate = !!(mqprio->flags & TC_MQPRIO_F_MIN_RATE);
+ has_max_rate = !!(mqprio->flags & TC_MQPRIO_F_MAX_RATE);
+
+ err = aq_validate_mqprio_opt(aq_nic, mqprio, mqprio->qopt.num_tc);
+ if (err)
+ return err;
+
+ for (i = 0; i < mqprio->qopt.num_tc; i++) {
+ if (has_max_rate) {
+ u64 max_rate = mqprio->max_rate[i];
+
+ do_div(max_rate, AQ_MBPS_DIVISOR);
+ aq_nic_setup_tc_max_rate(aq_nic, i, (u32)max_rate);
+ }
+
+ if (has_min_rate) {
+ u64 min_rate = mqprio->min_rate[i];
+
+ do_div(min_rate, AQ_MBPS_DIVISOR);
+ aq_nic_setup_tc_min_rate(aq_nic, i, (u32)min_rate);
+ }
+ }
+
+ return aq_nic_setup_tc_mqprio(aq_nic, mqprio->qopt.num_tc,
+ mqprio->qopt.prio_tc_map);
+}
+
static const struct net_device_ops aq_ndev_ops = {
.ndo_open = aq_ndev_open,
.ndo_stop = aq_ndev_close,
.ndo_do_ioctl = aq_ndev_ioctl,
.ndo_vlan_rx_add_vid = aq_ndo_vlan_rx_add_vid,
.ndo_vlan_rx_kill_vid = aq_ndo_vlan_rx_kill_vid,
+ .ndo_setup_tc = aq_ndo_setup_tc,
};
static int __init aq_ndev_init_module(void)
#include <linux/ip.h>
#include <linux/tcp.h>
#include <net/ip.h>
+#include <net/pkt_cls.h>
static unsigned int aq_itr = AQ_CFG_INTERRUPT_MODERATION_AUTO;
module_param_named(aq_itr, aq_itr, uint, 0644);
rss_params->indirection_table[i] = i & (num_rss_queues - 1);
}
+/* Recalculate the number of vectors */
+static void aq_nic_cfg_update_num_vecs(struct aq_nic_s *self)
+{
+ struct aq_nic_cfg_s *cfg = &self->aq_nic_cfg;
+
+ cfg->vecs = min(cfg->aq_hw_caps->vecs, AQ_CFG_VECS_DEF);
+ cfg->vecs = min(cfg->vecs, num_online_cpus());
+ if (self->irqvecs > AQ_HW_SERVICE_IRQS)
+ cfg->vecs = min(cfg->vecs, self->irqvecs - AQ_HW_SERVICE_IRQS);
+ /* cfg->vecs should be power of 2 for RSS */
+ cfg->vecs = rounddown_pow_of_two(cfg->vecs);
+
+ if (ATL_HW_IS_CHIP_FEATURE(self->aq_hw, ANTIGUA)) {
+ if (cfg->tcs > 2)
+ cfg->vecs = min(cfg->vecs, 4U);
+ }
+
+ if (cfg->vecs <= 4)
+ cfg->tc_mode = AQ_TC_MODE_8TCS;
+ else
+ cfg->tc_mode = AQ_TC_MODE_4TCS;
+
+ /*rss rings */
+ cfg->num_rss_queues = min(cfg->vecs, AQ_CFG_NUM_RSS_QUEUES_DEF);
+ aq_nic_rss_init(self, cfg->num_rss_queues);
+}
+
/* Checks hw_caps and 'corrects' aq_nic_cfg in runtime */
void aq_nic_cfg_start(struct aq_nic_s *self)
{
struct aq_nic_cfg_s *cfg = &self->aq_nic_cfg;
+ int i;
cfg->tcs = AQ_CFG_TCS_DEF;
cfg->rxpageorder = AQ_CFG_RX_PAGEORDER;
cfg->is_rss = AQ_CFG_IS_RSS_DEF;
- cfg->num_rss_queues = AQ_CFG_NUM_RSS_QUEUES_DEF;
cfg->aq_rss.base_cpu_number = AQ_CFG_RSS_BASE_CPU_NUM_DEF;
cfg->fc.req = AQ_CFG_FC_MODE;
cfg->wol = AQ_CFG_WOL_MODES;
cfg->is_autoneg = AQ_CFG_IS_AUTONEG_DEF;
cfg->is_lro = AQ_CFG_IS_LRO_DEF;
+ cfg->is_ptp = true;
/*descriptors */
cfg->rxds = min(cfg->aq_hw_caps->rxds_max, AQ_CFG_RXDS_DEF);
cfg->txds = min(cfg->aq_hw_caps->txds_max, AQ_CFG_TXDS_DEF);
- /*rss rings */
- cfg->vecs = min(cfg->aq_hw_caps->vecs, AQ_CFG_VECS_DEF);
- cfg->vecs = min(cfg->vecs, num_online_cpus());
- if (self->irqvecs > AQ_HW_SERVICE_IRQS)
- cfg->vecs = min(cfg->vecs, self->irqvecs - AQ_HW_SERVICE_IRQS);
- /* cfg->vecs should be power of 2 for RSS */
- if (cfg->vecs >= 8U)
- cfg->vecs = 8U;
- else if (cfg->vecs >= 4U)
- cfg->vecs = 4U;
- else if (cfg->vecs >= 2U)
- cfg->vecs = 2U;
- else
- cfg->vecs = 1U;
-
- cfg->num_rss_queues = min(cfg->vecs, AQ_CFG_NUM_RSS_QUEUES_DEF);
-
- aq_nic_rss_init(self, cfg->num_rss_queues);
+ aq_nic_cfg_update_num_vecs(self);
cfg->irq_type = aq_pci_func_get_irq_type(self);
cfg->is_vlan_rx_strip = !!(cfg->features & NETIF_F_HW_VLAN_CTAG_RX);
cfg->is_vlan_tx_insert = !!(cfg->features & NETIF_F_HW_VLAN_CTAG_TX);
cfg->is_vlan_force_promisc = true;
+
+ for (i = 0; i < sizeof(cfg->prio_tc_map); i++)
+ cfg->prio_tc_map[i] = cfg->tcs * i / 8;
}
static int aq_nic_update_link_status(struct aq_nic_s *self)
#if IS_ENABLED(CONFIG_MACSEC)
aq_macsec_enable(self);
#endif
+ if (self->aq_hw_ops->hw_tc_rate_limit_set)
+ self->aq_hw_ops->hw_tc_rate_limit_set(self->aq_hw);
+
netif_tx_wake_all_queues(self->ndev);
}
if (netif_carrier_ok(self->ndev) && !self->link_status.mbps) {
err = aq_phy_init(self->aq_hw);
}
- for (i = 0U, aq_vec = self->aq_vec[0];
- self->aq_vecs > i; ++i, aq_vec = self->aq_vec[i])
+ for (i = 0U; i < self->aq_vecs; i++) {
+ aq_vec = self->aq_vec[i];
+ err = aq_vec_ring_alloc(aq_vec, self, i,
+ aq_nic_get_cfg(self));
+ if (err)
+ goto err_exit;
+
aq_vec_init(aq_vec, self->aq_hw_ops, self->aq_hw);
+ }
- err = aq_ptp_init(self, self->irqvecs - 1);
- if (err < 0)
- goto err_exit;
+ if (aq_nic_get_cfg(self)->is_ptp) {
+ err = aq_ptp_init(self, self->irqvecs - 1);
+ if (err < 0)
+ goto err_exit;
- err = aq_ptp_ring_alloc(self);
- if (err < 0)
- goto err_exit;
+ err = aq_ptp_ring_alloc(self);
+ if (err < 0)
+ goto err_exit;
- err = aq_ptp_ring_init(self);
- if (err < 0)
- goto err_exit;
+ err = aq_ptp_ring_init(self);
+ if (err < 0)
+ goto err_exit;
+ }
netif_carrier_off(self->ndev);
int aq_nic_start(struct aq_nic_s *self)
{
struct aq_vec_s *aq_vec = NULL;
+ struct aq_nic_cfg_s *cfg;
unsigned int i = 0U;
int err = 0;
+ cfg = aq_nic_get_cfg(self);
+
err = self->aq_hw_ops->hw_multicast_list_set(self->aq_hw,
self->mc_list.ar,
self->mc_list.count);
timer_setup(&self->service_timer, aq_nic_service_timer_cb, 0);
aq_nic_service_timer_cb(&self->service_timer);
- if (self->aq_nic_cfg.is_polling) {
+ if (cfg->is_polling) {
timer_setup(&self->polling_timer, aq_nic_polling_timer_cb, 0);
mod_timer(&self->polling_timer, jiffies +
AQ_CFG_POLLING_TIMER_INTERVAL);
if (err < 0)
goto err_exit;
- if (self->aq_nic_cfg.link_irq_vec) {
+ if (cfg->link_irq_vec) {
int irqvec = pci_irq_vector(self->pdev,
- self->aq_nic_cfg.link_irq_vec);
+ cfg->link_irq_vec);
err = request_threaded_irq(irqvec, NULL,
aq_linkstate_threaded_isr,
IRQF_SHARED | IRQF_ONESHOT,
self->ndev->name, self);
if (err < 0)
goto err_exit;
- self->msix_entry_mask |= (1 << self->aq_nic_cfg.link_irq_vec);
+ self->msix_entry_mask |= (1 << cfg->link_irq_vec);
}
err = self->aq_hw_ops->hw_irq_enable(self->aq_hw,
goto err_exit;
}
- err = netif_set_real_num_tx_queues(self->ndev, self->aq_vecs);
+ err = netif_set_real_num_tx_queues(self->ndev,
+ self->aq_vecs * cfg->tcs);
if (err < 0)
goto err_exit;
- err = netif_set_real_num_rx_queues(self->ndev, self->aq_vecs);
+ err = netif_set_real_num_rx_queues(self->ndev,
+ self->aq_vecs * cfg->tcs);
if (err < 0)
goto err_exit;
+ for (i = 0; i < cfg->tcs; i++) {
+ u16 offset = self->aq_vecs * i;
+
+ netdev_set_tc_queue(self->ndev, i, self->aq_vecs, offset);
+ }
netif_tx_start_all_queues(self->ndev);
err_exit:
struct aq_ring_s *ring)
{
unsigned int nr_frags = skb_shinfo(skb)->nr_frags;
+ struct aq_nic_cfg_s *cfg = aq_nic_get_cfg(self);
+ struct device *dev = aq_nic_get_dev(self);
struct aq_ring_buff_s *first = NULL;
u8 ipver = ip_hdr(skb)->version;
struct aq_ring_buff_s *dx_buff;
need_context_tag = true;
}
- if (self->aq_nic_cfg.is_vlan_tx_insert && skb_vlan_tag_present(skb)) {
+ if (cfg->is_vlan_tx_insert && skb_vlan_tag_present(skb)) {
dx_buff->vlan_tx_tag = skb_vlan_tag_get(skb);
dx_buff->len_pkt = skb->len;
dx_buff->is_vlan = 1U;
}
dx_buff->len = skb_headlen(skb);
- dx_buff->pa = dma_map_single(aq_nic_get_dev(self),
+ dx_buff->pa = dma_map_single(dev,
skb->data,
dx_buff->len,
DMA_TO_DEVICE);
- if (unlikely(dma_mapping_error(aq_nic_get_dev(self), dx_buff->pa))) {
+ if (unlikely(dma_mapping_error(dev, dx_buff->pa))) {
ret = 0;
goto exit;
}
else
buff_size = frag_len;
- frag_pa = skb_frag_dma_map(aq_nic_get_dev(self),
+ frag_pa = skb_frag_dma_map(dev,
frag,
buff_offset,
buff_size,
DMA_TO_DEVICE);
- if (unlikely(dma_mapping_error(aq_nic_get_dev(self),
+ if (unlikely(dma_mapping_error(dev,
frag_pa)))
goto mapping_error;
if (!(dx_buff->is_gso_tcp || dx_buff->is_gso_udp) &&
!dx_buff->is_vlan && dx_buff->pa) {
if (unlikely(dx_buff->is_sop)) {
- dma_unmap_single(aq_nic_get_dev(self),
+ dma_unmap_single(dev,
dx_buff->pa,
dx_buff->len,
DMA_TO_DEVICE);
} else {
- dma_unmap_page(aq_nic_get_dev(self),
+ dma_unmap_page(dev,
dx_buff->pa,
dx_buff->len,
DMA_TO_DEVICE);
int aq_nic_xmit(struct aq_nic_s *self, struct sk_buff *skb)
{
- unsigned int vec = skb->queue_mapping % self->aq_nic_cfg.vecs;
+ struct aq_nic_cfg_s *cfg = aq_nic_get_cfg(self);
+ unsigned int vec = skb->queue_mapping % cfg->vecs;
+ unsigned int tc = skb->queue_mapping / cfg->vecs;
struct aq_ring_s *ring = NULL;
unsigned int frags = 0U;
int err = NETDEV_TX_OK;
- unsigned int tc = 0U;
frags = skb_shinfo(skb)->nr_frags + 1;
- ring = self->aq_ring_tx[AQ_NIC_TCVEC2RING(self, tc, vec)];
+ ring = self->aq_ring_tx[AQ_NIC_CFG_TCVEC2RING(cfg, tc, vec)];
if (frags > AQ_CFG_SKB_FRAGS_MAX) {
dev_kfree_skb_any(skb);
aq_ring_update_queue_state(ring);
- if (self->aq_nic_cfg.priv_flags & BIT(AQ_HW_LOOPBACK_DMA_NET)) {
+ if (cfg->priv_flags & BIT(AQ_HW_LOOPBACK_DMA_NET)) {
err = NETDEV_TX_BUSY;
goto err_exit;
}
/* Above status update may stop the queue. Check this. */
- if (__netif_subqueue_stopped(self->ndev, ring->idx)) {
+ if (__netif_subqueue_stopped(self->ndev,
+ AQ_NIC_RING2QMAP(self, ring->idx))) {
err = NETDEV_TX_BUSY;
goto err_exit;
}
struct aq_stats_s *stats;
unsigned int count = 0U;
unsigned int i = 0U;
+ unsigned int tc;
if (self->aq_fw_ops->update_stats) {
mutex_lock(&self->fwreq_mutex);
data += i;
- for (i = 0U, aq_vec = self->aq_vec[0];
- aq_vec && self->aq_vecs > i; ++i, aq_vec = self->aq_vec[i]) {
- data += count;
- aq_vec_get_sw_stats(aq_vec, data, &count);
+ for (tc = 0U; tc < self->aq_nic_cfg.tcs; tc++) {
+ for (i = 0U, aq_vec = self->aq_vec[0];
+ aq_vec && self->aq_vecs > i;
+ ++i, aq_vec = self->aq_vec[i]) {
+ data += count;
+ aq_vec_get_sw_stats(aq_vec, tc, data, &count);
+ }
}
data += count;
if (!self)
goto err_exit;
- for (i = 0U, aq_vec = self->aq_vec[0];
- self->aq_vecs > i; ++i, aq_vec = self->aq_vec[i])
+ for (i = 0U; i < self->aq_vecs; i++) {
+ aq_vec = self->aq_vec[i];
aq_vec_deinit(aq_vec);
+ aq_vec_ring_free(aq_vec);
+ }
aq_ptp_unregister(self);
aq_ptp_ring_deinit(self);
err_exit:;
}
+int aq_nic_realloc_vectors(struct aq_nic_s *self)
+{
+ struct aq_nic_cfg_s *cfg = aq_nic_get_cfg(self);
+
+ aq_nic_free_vectors(self);
+
+ for (self->aq_vecs = 0; self->aq_vecs < cfg->vecs; self->aq_vecs++) {
+ self->aq_vec[self->aq_vecs] = aq_vec_alloc(self, self->aq_vecs,
+ cfg);
+ if (unlikely(!self->aq_vec[self->aq_vecs]))
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
void aq_nic_shutdown(struct aq_nic_s *self)
{
int err = 0;
break;
}
}
+
+int aq_nic_setup_tc_mqprio(struct aq_nic_s *self, u32 tcs, u8 *prio_tc_map)
+{
+ struct aq_nic_cfg_s *cfg = &self->aq_nic_cfg;
+ const unsigned int prev_vecs = cfg->vecs;
+ bool ndev_running;
+ int err = 0;
+ int i;
+
+ /* if already the same configuration or
+ * disable request (tcs is 0) and we already is disabled
+ */
+ if (tcs == cfg->tcs || (tcs == 0 && !cfg->is_qos))
+ return 0;
+
+ ndev_running = netif_running(self->ndev);
+ if (ndev_running)
+ dev_close(self->ndev);
+
+ cfg->tcs = tcs;
+ if (cfg->tcs == 0)
+ cfg->tcs = 1;
+ if (prio_tc_map)
+ memcpy(cfg->prio_tc_map, prio_tc_map, sizeof(cfg->prio_tc_map));
+ else
+ for (i = 0; i < sizeof(cfg->prio_tc_map); i++)
+ cfg->prio_tc_map[i] = cfg->tcs * i / 8;
+
+ cfg->is_qos = (tcs != 0 ? true : false);
+ cfg->is_ptp = (cfg->tcs <= AQ_HW_PTP_TC);
+ if (!cfg->is_ptp)
+ netdev_warn(self->ndev, "%s\n",
+ "PTP is auto disabled due to requested TC count.");
+
+ netdev_set_num_tc(self->ndev, cfg->tcs);
+
+ /* Changing the number of TCs might change the number of vectors */
+ aq_nic_cfg_update_num_vecs(self);
+ if (prev_vecs != cfg->vecs) {
+ err = aq_nic_realloc_vectors(self);
+ if (err)
+ goto err_exit;
+ }
+
+ if (ndev_running)
+ err = dev_open(self->ndev, NULL);
+
+err_exit:
+ return err;
+}
+
+int aq_nic_setup_tc_max_rate(struct aq_nic_s *self, const unsigned int tc,
+ const u32 max_rate)
+{
+ struct aq_nic_cfg_s *cfg = &self->aq_nic_cfg;
+
+ if (tc >= AQ_CFG_TCS_MAX)
+ return -EINVAL;
+
+ if (max_rate && max_rate < 10) {
+ netdev_warn(self->ndev,
+ "Setting %s to the minimum usable value of %dMbps.\n",
+ "max rate", 10);
+ cfg->tc_max_rate[tc] = 10;
+ } else {
+ cfg->tc_max_rate[tc] = max_rate;
+ }
+
+ return 0;
+}
+
+int aq_nic_setup_tc_min_rate(struct aq_nic_s *self, const unsigned int tc,
+ const u32 min_rate)
+{
+ struct aq_nic_cfg_s *cfg = &self->aq_nic_cfg;
+
+ if (tc >= AQ_CFG_TCS_MAX)
+ return -EINVAL;
+
+ if (min_rate)
+ set_bit(tc, &cfg->tc_min_rate_msk);
+ else
+ clear_bit(tc, &cfg->tc_min_rate_msk);
+
+ if (min_rate && min_rate < 20) {
+ netdev_warn(self->ndev,
+ "Setting %s to the minimum usable value of %dMbps.\n",
+ "min rate", 20);
+ cfg->tc_min_rate[tc] = 20;
+ } else {
+ cfg->tc_min_rate[tc] = min_rate;
+ }
+
+ return 0;
+}
bool is_polling;
bool is_rss;
bool is_lro;
+ bool is_qos;
+ bool is_ptp;
+ enum aq_tc_mode tc_mode;
u32 priv_flags;
u8 tcs;
+ u8 prio_tc_map[8];
+ u32 tc_max_rate[AQ_CFG_TCS_MAX];
+ unsigned long tc_min_rate_msk;
+ u32 tc_min_rate[AQ_CFG_TCS_MAX];
struct aq_rss_parameters aq_rss;
u32 eee_speeds;
};
#define AQ_NIC_WOL_MODES (WAKE_MAGIC |\
WAKE_PHY)
-#define AQ_NIC_TCVEC2RING(_NIC_, _TC_, _VEC_) \
- ((_TC_) * AQ_CFG_TCS_MAX + (_VEC_))
+#define AQ_NIC_CFG_RING_PER_TC(_NIC_CFG_) \
+ (((_NIC_CFG_)->tc_mode == AQ_TC_MODE_4TCS) ? 8 : 4)
+
+#define AQ_NIC_CFG_TCVEC2RING(_NIC_CFG_, _TC_, _VEC_) \
+ ((_TC_) * AQ_NIC_CFG_RING_PER_TC(_NIC_CFG_) + (_VEC_))
+
+#define AQ_NIC_RING2QMAP(_NIC_, _ID_) \
+ ((_ID_) / AQ_NIC_CFG_RING_PER_TC(&(_NIC_)->aq_nic_cfg) * \
+ (_NIC_)->aq_vecs + \
+ ((_ID_) % AQ_NIC_CFG_RING_PER_TC(&(_NIC_)->aq_nic_cfg)))
struct aq_hw_rx_fl2 {
struct aq_rx_filter_vlan aq_vlans[AQ_VLAN_MAX_FILTERS];
atomic_t flags;
u32 msg_enable;
struct aq_vec_s *aq_vec[AQ_CFG_VECS_MAX];
- struct aq_ring_s *aq_ring_tx[AQ_CFG_VECS_MAX * AQ_CFG_TCS_MAX];
+ struct aq_ring_s *aq_ring_tx[AQ_HW_QUEUES_MAX];
struct aq_hw_s *aq_hw;
struct net_device *ndev;
unsigned int aq_vecs;
void aq_nic_set_power(struct aq_nic_s *self);
void aq_nic_free_hot_resources(struct aq_nic_s *self);
void aq_nic_free_vectors(struct aq_nic_s *self);
+int aq_nic_realloc_vectors(struct aq_nic_s *self);
int aq_nic_set_mtu(struct aq_nic_s *self, int new_mtu);
int aq_nic_set_mac(struct aq_nic_s *self, struct net_device *ndev);
int aq_nic_set_packet_filter(struct aq_nic_s *self, unsigned int flags);
u8 aq_nic_reserve_filter(struct aq_nic_s *self, enum aq_rx_filter_type type);
void aq_nic_release_filter(struct aq_nic_s *self, enum aq_rx_filter_type type,
u32 location);
+int aq_nic_setup_tc_mqprio(struct aq_nic_s *self, u32 tcs, u8 *prio_tc_map);
+int aq_nic_setup_tc_max_rate(struct aq_nic_s *self, const unsigned int tc,
+ const u32 max_rate);
+int aq_nic_setup_tc_min_rate(struct aq_nic_s *self, const unsigned int tc,
+ const u32 min_rate);
#endif /* AQ_NIC_H */
netif_tx_start_all_queues(nic->ndev);
err_exit:
+ if (ret < 0)
+ aq_nic_deinit(nic, true);
+
rtnl_unlock();
return ret;
#define PTP_4TC_RING_IDX 16
#define PTP_HWST_RING_IDX 31
+/* Index must be 8 (8 TCs) or 16 (4 TCs).
+ * It depends on Traffic Class mode.
+ */
+static unsigned int ptp_ring_idx(const enum aq_tc_mode tc_mode)
+{
+ if (tc_mode == AQ_TC_MODE_8TCS)
+ return PTP_8TC_RING_IDX;
+
+ return PTP_4TC_RING_IDX;
+}
+
int aq_ptp_ring_alloc(struct aq_nic_s *aq_nic)
{
struct aq_ptp_s *aq_ptp = aq_nic->aq_ptp;
unsigned int tx_ring_idx, rx_ring_idx;
struct aq_ring_s *hwts;
- u32 tx_tc_mode, rx_tc_mode;
struct aq_ring_s *ring;
int err;
if (!aq_ptp)
return 0;
- /* Index must to be 8 (8 TCs) or 16 (4 TCs).
- * It depends from Traffic Class mode.
- */
- aq_nic->aq_hw_ops->hw_tx_tc_mode_get(aq_nic->aq_hw, &tx_tc_mode);
- if (tx_tc_mode == 0)
- tx_ring_idx = PTP_8TC_RING_IDX;
- else
- tx_ring_idx = PTP_4TC_RING_IDX;
+ tx_ring_idx = ptp_ring_idx(aq_nic->aq_nic_cfg.tc_mode);
ring = aq_ring_tx_alloc(&aq_ptp->ptp_tx, aq_nic,
tx_ring_idx, &aq_nic->aq_nic_cfg);
goto err_exit;
}
- aq_nic->aq_hw_ops->hw_rx_tc_mode_get(aq_nic->aq_hw, &rx_tc_mode);
- if (rx_tc_mode == 0)
- rx_ring_idx = PTP_8TC_RING_IDX;
- else
- rx_ring_idx = PTP_4TC_RING_IDX;
+ rx_ring_idx = ptp_ring_idx(aq_nic->aq_nic_cfg.tc_mode);
ring = aq_ring_rx_alloc(&aq_ptp->ptp_rx, aq_nic,
rx_ring_idx, &aq_nic->aq_nic_cfg);
{
struct net_device *ndev = aq_nic_get_ndev(ring->aq_nic);
- if (__netif_subqueue_stopped(ndev, ring->idx)) {
- netif_wake_subqueue(ndev, ring->idx);
+ if (__netif_subqueue_stopped(ndev,
+ AQ_NIC_RING2QMAP(ring->aq_nic,
+ ring->idx))) {
+ netif_wake_subqueue(ndev,
+ AQ_NIC_RING2QMAP(ring->aq_nic, ring->idx));
ring->stats.tx.queue_restarts++;
}
}
{
struct net_device *ndev = aq_nic_get_ndev(ring->aq_nic);
- if (!__netif_subqueue_stopped(ndev, ring->idx))
- netif_stop_subqueue(ndev, ring->idx);
+ if (!__netif_subqueue_stopped(ndev,
+ AQ_NIC_RING2QMAP(ring->aq_nic,
+ ring->idx)))
+ netif_stop_subqueue(ndev,
+ AQ_NIC_RING2QMAP(ring->aq_nic, ring->idx));
}
bool aq_ring_tx_clean(struct aq_ring_s *self)
buff->is_hash_l4 ? PKT_HASH_TYPE_L4 :
PKT_HASH_TYPE_NONE);
/* Send all PTP traffic to 0 queue */
- skb_record_rx_queue(skb, is_ptp_ring ? 0 : self->idx);
+ skb_record_rx_queue(skb,
+ is_ptp_ring ? 0
+ : AQ_NIC_RING2QMAP(self->aq_nic,
+ self->idx));
++self->stats.rx.packets;
self->stats.rx.bytes += skb->len;
struct aq_vec_s *aq_vec_alloc(struct aq_nic_s *aq_nic, unsigned int idx,
struct aq_nic_cfg_s *aq_nic_cfg)
{
- struct aq_ring_s *ring = NULL;
struct aq_vec_s *self = NULL;
- unsigned int i = 0U;
- int err = 0;
self = kzalloc(sizeof(*self), GFP_KERNEL);
- if (!self) {
- err = -ENOMEM;
+ if (!self)
goto err_exit;
- }
self->aq_nic = aq_nic;
self->aq_ring_param.vec_idx = idx;
netif_napi_add(aq_nic_get_ndev(aq_nic), &self->napi,
aq_vec_poll, AQ_CFG_NAPI_WEIGHT);
+err_exit:
+ return self;
+}
+
+int aq_vec_ring_alloc(struct aq_vec_s *self, struct aq_nic_s *aq_nic,
+ unsigned int idx, struct aq_nic_cfg_s *aq_nic_cfg)
+{
+ struct aq_ring_s *ring = NULL;
+ unsigned int i = 0U;
+ int err = 0;
+
for (i = 0; i < aq_nic_cfg->tcs; ++i) {
- unsigned int idx_ring = AQ_NIC_TCVEC2RING(self->nic,
- self->tx_rings,
- self->aq_ring_param.vec_idx);
+ const unsigned int idx_ring = AQ_NIC_CFG_TCVEC2RING(aq_nic_cfg,
+ i, idx);
ring = aq_ring_tx_alloc(&self->ring[i][AQ_VEC_TX_ID], aq_nic,
idx_ring, aq_nic_cfg);
err_exit:
if (err < 0) {
- aq_vec_free(self);
+ aq_vec_ring_free(self);
self = NULL;
}
- return self;
+ return err;
}
int aq_vec_init(struct aq_vec_s *self, const struct aq_hw_ops *aq_hw_ops,
}
void aq_vec_free(struct aq_vec_s *self)
+{
+ if (!self)
+ goto err_exit;
+
+ netif_napi_del(&self->napi);
+
+ kfree(self);
+
+err_exit:;
+}
+
+void aq_vec_ring_free(struct aq_vec_s *self)
{
struct aq_ring_s *ring = NULL;
unsigned int i = 0U;
for (i = 0U, ring = self->ring[0];
self->tx_rings > i; ++i, ring = self->ring[i]) {
aq_ring_free(&ring[AQ_VEC_TX_ID]);
- aq_ring_free(&ring[AQ_VEC_RX_ID]);
+ if (i < self->rx_rings)
+ aq_ring_free(&ring[AQ_VEC_RX_ID]);
}
- netif_napi_del(&self->napi);
-
- kfree(self);
-
+ self->tx_rings = 0;
+ self->rx_rings = 0;
err_exit:;
}
return &self->aq_ring_param.affinity_mask;
}
-void aq_vec_add_stats(struct aq_vec_s *self,
- struct aq_ring_stats_rx_s *stats_rx,
- struct aq_ring_stats_tx_s *stats_tx)
+static void aq_vec_add_stats(struct aq_vec_s *self,
+ const unsigned int tc,
+ struct aq_ring_stats_rx_s *stats_rx,
+ struct aq_ring_stats_tx_s *stats_tx)
{
- struct aq_ring_s *ring = NULL;
- unsigned int r = 0U;
+ struct aq_ring_s *ring = self->ring[tc];
- for (r = 0U, ring = self->ring[0];
- self->tx_rings > r; ++r, ring = self->ring[r]) {
- struct aq_ring_stats_tx_s *tx = &ring[AQ_VEC_TX_ID].stats.tx;
+ if (tc < self->rx_rings) {
struct aq_ring_stats_rx_s *rx = &ring[AQ_VEC_RX_ID].stats.rx;
stats_rx->packets += rx->packets;
stats_rx->pg_losts += rx->pg_losts;
stats_rx->pg_flips += rx->pg_flips;
stats_rx->pg_reuses += rx->pg_reuses;
+ }
+
+ if (tc < self->tx_rings) {
+ struct aq_ring_stats_tx_s *tx = &ring[AQ_VEC_TX_ID].stats.tx;
stats_tx->packets += tx->packets;
stats_tx->bytes += tx->bytes;
}
}
-int aq_vec_get_sw_stats(struct aq_vec_s *self, u64 *data, unsigned int *p_count)
+int aq_vec_get_sw_stats(struct aq_vec_s *self, const unsigned int tc, u64 *data,
+ unsigned int *p_count)
{
struct aq_ring_stats_rx_s stats_rx;
struct aq_ring_stats_tx_s stats_tx;
memset(&stats_rx, 0U, sizeof(struct aq_ring_stats_rx_s));
memset(&stats_tx, 0U, sizeof(struct aq_ring_stats_tx_s));
- aq_vec_add_stats(self, &stats_rx, &stats_tx);
+
+ aq_vec_add_stats(self, tc, &stats_rx, &stats_tx);
/* This data should mimic aq_ethtool_queue_stat_names structure
*/
irqreturn_t aq_vec_isr_legacy(int irq, void *private);
struct aq_vec_s *aq_vec_alloc(struct aq_nic_s *aq_nic, unsigned int idx,
struct aq_nic_cfg_s *aq_nic_cfg);
+int aq_vec_ring_alloc(struct aq_vec_s *self, struct aq_nic_s *aq_nic,
+ unsigned int idx, struct aq_nic_cfg_s *aq_nic_cfg);
int aq_vec_init(struct aq_vec_s *self, const struct aq_hw_ops *aq_hw_ops,
struct aq_hw_s *aq_hw);
void aq_vec_deinit(struct aq_vec_s *self);
void aq_vec_free(struct aq_vec_s *self);
+void aq_vec_ring_free(struct aq_vec_s *self);
int aq_vec_start(struct aq_vec_s *self);
void aq_vec_stop(struct aq_vec_s *self);
cpumask_t *aq_vec_get_affinity_mask(struct aq_vec_s *self);
-int aq_vec_get_sw_stats(struct aq_vec_s *self, u64 *data,
+int aq_vec_get_sw_stats(struct aq_vec_s *self, const unsigned int tc, u64 *data,
unsigned int *p_count);
-void aq_vec_add_stats(struct aq_vec_s *self,
- struct aq_ring_stats_rx_s *stats_rx,
- struct aq_ring_stats_tx_s *stats_tx);
#endif /* AQ_VEC_H */
.msix_irqs = 4U, \
.irq_mask = ~0U, \
.vecs = HW_ATL_A0_RSS_MAX, \
- .tcs = HW_ATL_A0_TC_MAX, \
+ .tcs_max = HW_ATL_A0_TC_MAX, \
.rxd_alignment = 1U, \
.rxd_size = HW_ATL_A0_RXD_SIZE, \
.rxds_max = HW_ATL_A0_MAX_RXD, \
hw_atl_tps_tx_pkt_shed_desc_tc_arb_mode_set(self, 0U);
hw_atl_tps_tx_pkt_shed_data_arb_mode_set(self, 0U);
- hw_atl_tps_tx_pkt_shed_tc_data_max_credit_set(self, 0xFFF, 0U);
- hw_atl_tps_tx_pkt_shed_tc_data_weight_set(self, 0x64, 0U);
- hw_atl_tps_tx_pkt_shed_desc_tc_max_credit_set(self, 0x50, 0U);
- hw_atl_tps_tx_pkt_shed_desc_tc_weight_set(self, 0x1E, 0U);
+ hw_atl_tps_tx_pkt_shed_tc_data_max_credit_set(self, 0U, 0xFFF);
+ hw_atl_tps_tx_pkt_shed_tc_data_weight_set(self, 0U, 0x64);
+ hw_atl_tps_tx_pkt_shed_desc_tc_max_credit_set(self, 0U, 0x50);
+ hw_atl_tps_tx_pkt_shed_desc_tc_weight_set(self, 0U, 0x1E);
/* Tx buf size */
buff_size = HW_ATL_A0_TXBUF_MAX;
.msix_irqs = 8U, \
.irq_mask = ~0U, \
.vecs = HW_ATL_B0_RSS_MAX, \
- .tcs = HW_ATL_B0_TC_MAX, \
+ .tcs_max = HW_ATL_B0_TC_MAX, \
.rxd_alignment = 1U, \
.rxd_size = HW_ATL_B0_RXD_SIZE, \
.rxds_max = HW_ATL_B0_MAX_RXD, \
NETIF_F_HW_VLAN_CTAG_RX | \
NETIF_F_HW_VLAN_CTAG_TX | \
NETIF_F_GSO_UDP_L4 | \
- NETIF_F_GSO_PARTIAL, \
+ NETIF_F_GSO_PARTIAL | \
+ NETIF_F_HW_TC, \
.hw_priv_flags = IFF_UNICAST_FLT, \
.flow_control = true, \
.mtu = HW_ATL_B0_MTU_JUMBO, \
return 0;
}
+static int hw_atl_b0_tc_ptp_set(struct aq_hw_s *self)
+{
+ /* Init TC2 for PTP_TX */
+ hw_atl_tpb_tx_pkt_buff_size_per_tc_set(self, HW_ATL_B0_PTP_TXBUF_SIZE,
+ AQ_HW_PTP_TC);
+
+ /* Init TC2 for PTP_RX */
+ hw_atl_rpb_rx_pkt_buff_size_per_tc_set(self, HW_ATL_B0_PTP_RXBUF_SIZE,
+ AQ_HW_PTP_TC);
+ /* No flow control for PTP */
+ hw_atl_rpb_rx_xoff_en_per_tc_set(self, 0U, AQ_HW_PTP_TC);
+
+ return aq_hw_err_from_flags(self);
+}
+
static int hw_atl_b0_hw_qos_set(struct aq_hw_s *self)
{
- unsigned int i_priority = 0U;
- u32 buff_size = 0U;
+ struct aq_nic_cfg_s *cfg = self->aq_nic_cfg;
+ u32 tx_buff_size = HW_ATL_B0_TXBUF_MAX;
+ u32 rx_buff_size = HW_ATL_B0_RXBUF_MAX;
+ unsigned int prio = 0U;
u32 tc = 0U;
+ if (cfg->is_ptp) {
+ tx_buff_size -= HW_ATL_B0_PTP_TXBUF_SIZE;
+ rx_buff_size -= HW_ATL_B0_PTP_RXBUF_SIZE;
+ }
+
/* TPS Descriptor rate init */
hw_atl_tps_tx_pkt_shed_desc_rate_curr_time_res_set(self, 0x0U);
hw_atl_tps_tx_pkt_shed_desc_rate_lim_set(self, 0xA);
/* TPS VM init */
hw_atl_tps_tx_pkt_shed_desc_vm_arb_mode_set(self, 0U);
- /* TPS TC credits init */
- hw_atl_tps_tx_pkt_shed_desc_tc_arb_mode_set(self, 0U);
- hw_atl_tps_tx_pkt_shed_data_arb_mode_set(self, 0U);
-
- tc = 0;
-
- /* TX Packet Scheduler Data TC0 */
- hw_atl_tps_tx_pkt_shed_tc_data_max_credit_set(self, 0xFFF, tc);
- hw_atl_tps_tx_pkt_shed_tc_data_weight_set(self, 0x64, tc);
- hw_atl_tps_tx_pkt_shed_desc_tc_max_credit_set(self, 0x50, tc);
- hw_atl_tps_tx_pkt_shed_desc_tc_weight_set(self, 0x1E, tc);
-
- /* Tx buf size TC0 */
- buff_size = HW_ATL_B0_TXBUF_MAX - HW_ATL_B0_PTP_TXBUF_SIZE;
-
- hw_atl_tpb_tx_pkt_buff_size_per_tc_set(self, buff_size, tc);
- hw_atl_tpb_tx_buff_hi_threshold_per_tc_set(self,
- (buff_size *
- (1024 / 32U) * 66U) /
- 100U, tc);
- hw_atl_tpb_tx_buff_lo_threshold_per_tc_set(self,
- (buff_size *
- (1024 / 32U) * 50U) /
- 100U, tc);
- /* Init TC2 for PTP_TX */
- tc = 2;
+ tx_buff_size /= cfg->tcs;
+ rx_buff_size /= cfg->tcs;
+ for (tc = 0; tc < cfg->tcs; tc++) {
+ u32 threshold = 0U;
- hw_atl_tpb_tx_pkt_buff_size_per_tc_set(self, HW_ATL_B0_PTP_TXBUF_SIZE,
- tc);
+ /* Tx buf size TC0 */
+ hw_atl_tpb_tx_pkt_buff_size_per_tc_set(self, tx_buff_size, tc);
- /* QoS Rx buf size per TC */
- tc = 0;
- buff_size = HW_ATL_B0_RXBUF_MAX - HW_ATL_B0_PTP_RXBUF_SIZE;
+ threshold = (tx_buff_size * (1024 / 32U) * 66U) / 100U;
+ hw_atl_tpb_tx_buff_hi_threshold_per_tc_set(self, threshold, tc);
- hw_atl_rpb_rx_pkt_buff_size_per_tc_set(self, buff_size, tc);
- hw_atl_rpb_rx_buff_hi_threshold_per_tc_set(self,
- (buff_size *
- (1024U / 32U) * 66U) /
- 100U, tc);
- hw_atl_rpb_rx_buff_lo_threshold_per_tc_set(self,
- (buff_size *
- (1024U / 32U) * 50U) /
- 100U, tc);
+ threshold = (tx_buff_size * (1024 / 32U) * 50U) / 100U;
+ hw_atl_tpb_tx_buff_lo_threshold_per_tc_set(self, threshold, tc);
- hw_atl_b0_set_fc(self, self->aq_nic_cfg->fc.req, tc);
+ /* QoS Rx buf size per TC */
+ hw_atl_rpb_rx_pkt_buff_size_per_tc_set(self, rx_buff_size, tc);
- /* Init TC2 for PTP_RX */
- tc = 2;
+ threshold = (rx_buff_size * (1024U / 32U) * 66U) / 100U;
+ hw_atl_rpb_rx_buff_hi_threshold_per_tc_set(self, threshold, tc);
- hw_atl_rpb_rx_pkt_buff_size_per_tc_set(self, HW_ATL_B0_PTP_RXBUF_SIZE,
- tc);
- /* No flow control for PTP */
- hw_atl_rpb_rx_xoff_en_per_tc_set(self, 0U, tc);
+ threshold = (rx_buff_size * (1024U / 32U) * 50U) / 100U;
+ hw_atl_rpb_rx_buff_lo_threshold_per_tc_set(self, threshold, tc);
+
+ hw_atl_b0_set_fc(self, self->aq_nic_cfg->fc.req, tc);
+ }
+
+ if (cfg->is_ptp)
+ hw_atl_b0_tc_ptp_set(self);
/* QoS 802.1p priority -> TC mapping */
- for (i_priority = 8U; i_priority--;)
- hw_atl_rpf_rpb_user_priority_tc_map_set(self, i_priority, 0U);
+ for (prio = 0; prio < 8; ++prio)
+ hw_atl_rpf_rpb_user_priority_tc_map_set(self, prio,
+ cfg->prio_tc_map[prio]);
return aq_hw_err_from_flags(self);
}
return aq_hw_err_from_flags(self);
}
+static int hw_atl_b0_hw_init_tx_tc_rate_limit(struct aq_hw_s *self)
+{
+ static const u32 max_weight = BIT(HW_ATL_TPS_DATA_TCTWEIGHT_WIDTH) - 1;
+ /* Scale factor is based on the number of bits in fractional portion */
+ static const u32 scale = BIT(HW_ATL_TPS_DESC_RATE_Y_WIDTH);
+ static const u32 frac_msk = HW_ATL_TPS_DESC_RATE_Y_MSK >>
+ HW_ATL_TPS_DESC_RATE_Y_SHIFT;
+ const u32 link_speed = self->aq_link_status.mbps;
+ struct aq_nic_cfg_s *nic_cfg = self->aq_nic_cfg;
+ unsigned long num_min_rated_tcs = 0;
+ u32 tc_weight[AQ_CFG_TCS_MAX];
+ u32 fixed_max_credit;
+ u8 min_rate_msk = 0;
+ u32 sum_weight = 0;
+ int tc;
+
+ /* By default max_credit is based upon MTU (in unit of 64b) */
+ fixed_max_credit = nic_cfg->aq_hw_caps->mtu / 64;
+
+ if (link_speed) {
+ min_rate_msk = nic_cfg->tc_min_rate_msk &
+ (BIT(nic_cfg->tcs) - 1);
+ num_min_rated_tcs = hweight8(min_rate_msk);
+ }
+
+ /* First, calculate weights where min_rate is specified */
+ if (num_min_rated_tcs) {
+ for (tc = 0; tc != nic_cfg->tcs; tc++) {
+ if (!nic_cfg->tc_min_rate[tc]) {
+ tc_weight[tc] = 0;
+ continue;
+ }
+
+ tc_weight[tc] = (-1L + link_speed +
+ nic_cfg->tc_min_rate[tc] *
+ max_weight) /
+ link_speed;
+ tc_weight[tc] = min(tc_weight[tc], max_weight);
+ sum_weight += tc_weight[tc];
+ }
+ }
+
+ /* WSP, if min_rate is set for at least one TC.
+ * RR otherwise.
+ */
+ hw_atl_tps_tx_pkt_shed_data_arb_mode_set(self, min_rate_msk ? 1U : 0U);
+ /* Data TC Arbiter takes precedence over Descriptor TC Arbiter,
+ * leave Descriptor TC Arbiter as RR.
+ */
+ hw_atl_tps_tx_pkt_shed_desc_tc_arb_mode_set(self, 0U);
+
+ hw_atl_tps_tx_desc_rate_mode_set(self, nic_cfg->is_qos ? 1U : 0U);
+
+ for (tc = 0; tc != nic_cfg->tcs; tc++) {
+ const u32 en = (nic_cfg->tc_max_rate[tc] != 0) ? 1U : 0U;
+ const u32 desc = AQ_NIC_CFG_TCVEC2RING(nic_cfg, tc, 0);
+ u32 weight, max_credit;
+
+ hw_atl_tps_tx_pkt_shed_desc_tc_max_credit_set(self, tc,
+ fixed_max_credit);
+ hw_atl_tps_tx_pkt_shed_desc_tc_weight_set(self, tc, 0x1E);
+
+ if (num_min_rated_tcs) {
+ weight = tc_weight[tc];
+
+ if (!weight && sum_weight < max_weight)
+ weight = (max_weight - sum_weight) /
+ (nic_cfg->tcs - num_min_rated_tcs);
+ else if (!weight)
+ weight = 0x64;
+
+ max_credit = max(8 * weight, fixed_max_credit);
+ } else {
+ weight = 0x64;
+ max_credit = 0xFFF;
+ }
+
+ hw_atl_tps_tx_pkt_shed_tc_data_weight_set(self, tc, weight);
+ hw_atl_tps_tx_pkt_shed_tc_data_max_credit_set(self, tc,
+ max_credit);
+
+ hw_atl_tps_tx_desc_rate_en_set(self, desc, en);
+
+ if (en) {
+ /* Nominal rate is always 10G */
+ const u32 rate = 10000U * scale /
+ nic_cfg->tc_max_rate[tc];
+ const u32 rate_int = rate >>
+ HW_ATL_TPS_DESC_RATE_Y_WIDTH;
+ const u32 rate_frac = rate & frac_msk;
+
+ hw_atl_tps_tx_desc_rate_x_set(self, desc, rate_int);
+ hw_atl_tps_tx_desc_rate_y_set(self, desc, rate_frac);
+ } else {
+ /* A value of 1 indicates the queue is not
+ * rate controlled.
+ */
+ hw_atl_tps_tx_desc_rate_x_set(self, desc, 1U);
+ hw_atl_tps_tx_desc_rate_y_set(self, desc, 0U);
+ }
+ }
+ for (tc = nic_cfg->tcs; tc != AQ_CFG_TCS_MAX; tc++) {
+ const u32 desc = AQ_NIC_CFG_TCVEC2RING(nic_cfg, tc, 0);
+
+ hw_atl_tps_tx_desc_rate_en_set(self, desc, 0U);
+ hw_atl_tps_tx_desc_rate_x_set(self, desc, 1U);
+ hw_atl_tps_tx_desc_rate_y_set(self, desc, 0U);
+ }
+
+ return aq_hw_err_from_flags(self);
+}
+
static int hw_atl_b0_hw_init_tx_path(struct aq_hw_s *self)
{
+ struct aq_nic_cfg_s *nic_cfg = self->aq_nic_cfg;
+
/* Tx TC/Queue number config */
- hw_atl_tpb_tps_tx_tc_mode_set(self, 1U);
+ hw_atl_tpb_tps_tx_tc_mode_set(self, nic_cfg->tc_mode);
hw_atl_thm_lso_tcp_flag_of_first_pkt_set(self, 0x0FF6U);
hw_atl_thm_lso_tcp_flag_of_middle_pkt_set(self, 0x0FF6U);
return aq_hw_err_from_flags(self);
}
+void hw_atl_b0_hw_init_rx_rss_ctrl1(struct aq_hw_s *self)
+{
+ struct aq_nic_cfg_s *cfg = self->aq_nic_cfg;
+ u32 rss_ctrl1 = HW_ATL_RSS_DISABLED;
+
+ if (cfg->is_rss)
+ rss_ctrl1 = (cfg->tc_mode == AQ_TC_MODE_8TCS) ?
+ HW_ATL_RSS_ENABLED_8TCS_2INDEX_BITS :
+ HW_ATL_RSS_ENABLED_4TCS_3INDEX_BITS;
+
+ hw_atl_reg_rx_flr_rss_control1set(self, rss_ctrl1);
+}
+
static int hw_atl_b0_hw_init_rx_path(struct aq_hw_s *self)
{
struct aq_nic_cfg_s *cfg = self->aq_nic_cfg;
int i;
/* Rx TC/RSS number config */
- hw_atl_rpb_rpf_rx_traf_class_mode_set(self, 1U);
+ hw_atl_rpb_rpf_rx_traf_class_mode_set(self, cfg->tc_mode);
/* Rx flow control */
hw_atl_rpb_rx_flow_ctl_mode_set(self, 1U);
/* RSS Ring selection */
- hw_atl_reg_rx_flr_rss_control1set(self, cfg->is_rss ?
- 0xB3333333U : 0x00000000U);
+ hw_atl_b0_hw_init_rx_rss_ctrl1(self);
/* Multicast filters */
for (i = HW_ATL_B0_MAC_MAX; i--;) {
return aq_hw_err_from_flags(self);
}
-static int hw_atl_b0_tx_tc_mode_get(struct aq_hw_s *self, u32 *tc_mode)
-{
- *tc_mode = hw_atl_tpb_tps_tx_tc_mode_get(self);
- return aq_hw_err_from_flags(self);
-}
-
-static int hw_atl_b0_rx_tc_mode_get(struct aq_hw_s *self, u32 *tc_mode)
-{
- *tc_mode = hw_atl_rpb_rpf_rx_traf_class_mode_get(self);
- return aq_hw_err_from_flags(self);
-}
-
#define get_ptp_ts_val_u64(self, indx) \
((u64)(hw_atl_pcs_ptp_clock_get(self, indx) & 0xffff))
.hw_interrupt_moderation_set = hw_atl_b0_hw_interrupt_moderation_set,
.hw_rss_set = hw_atl_b0_hw_rss_set,
.hw_rss_hash_set = hw_atl_b0_hw_rss_hash_set,
+ .hw_tc_rate_limit_set = hw_atl_b0_hw_init_tx_tc_rate_limit,
.hw_get_regs = hw_atl_utils_hw_get_regs,
.hw_get_hw_stats = hw_atl_utils_get_hw_stats,
.hw_get_fw_version = hw_atl_utils_get_fw_version,
- .hw_tx_tc_mode_get = hw_atl_b0_tx_tc_mode_get,
- .hw_rx_tc_mode_get = hw_atl_b0_rx_tc_mode_get,
-
.hw_ring_hwts_rx_fill = hw_atl_b0_hw_ring_hwts_rx_fill,
.hw_ring_hwts_rx_receive = hw_atl_b0_hw_ring_hwts_rx_receive,
int hw_atl_b0_hw_ring_tx_stop(struct aq_hw_s *self, struct aq_ring_s *ring);
int hw_atl_b0_hw_ring_rx_stop(struct aq_hw_s *self, struct aq_ring_s *ring);
+void hw_atl_b0_hw_init_rx_rss_ctrl1(struct aq_hw_s *self);
+
int hw_atl_b0_hw_mac_addr_set(struct aq_hw_s *self, u8 *mac_addr);
int hw_atl_b0_hw_start(struct aq_hw_s *self);
#define HW_ATL_B0_RSS_HASHKEY_BITS 320U
#define HW_ATL_B0_TCRSS_4_8 1
-#define HW_ATL_B0_TC_MAX 1U
+#define HW_ATL_B0_TC_MAX 8U
#define HW_ATL_B0_RSS_MAX 8U
#define HW_ATL_B0_LRO_RXD_MAX 16U
#define HW_ATL_B0_MAX_RXD 8184U
#define HW_ATL_B0_MAX_TXD 8184U
+#define HW_ATL_RSS_DISABLED 0x00000000U
+#define HW_ATL_RSS_ENABLED_8TCS_2INDEX_BITS 0xA2222222U
+#define HW_ATL_RSS_ENABLED_4TCS_3INDEX_BITS 0x80003333U
+
/* HW layer capabilities */
#endif /* HW_ATL_B0_INTERNAL_H */
}
void hw_atl_rpf_rpb_user_priority_tc_map_set(struct aq_hw_s *aq_hw,
- u32 user_priority_tc_map, u32 tc)
+ u32 user_priority, u32 tc)
{
/* register address for bitfield rx_tc_up{t}[2:0] */
static u32 rpf_rpb_rx_tc_upt_adr[8] = {
0U, 4U, 8U, 12U, 16U, 20U, 24U, 28U
};
- aq_hw_write_reg_bit(aq_hw, rpf_rpb_rx_tc_upt_adr[tc],
- rpf_rpb_rx_tc_upt_msk[tc],
- rpf_rpb_rx_tc_upt_shft[tc],
- user_priority_tc_map);
+ aq_hw_write_reg_bit(aq_hw, rpf_rpb_rx_tc_upt_adr[user_priority],
+ rpf_rpb_rx_tc_upt_msk[user_priority],
+ rpf_rpb_rx_tc_upt_shft[user_priority], tc);
}
void hw_atl_rpf_rss_key_addr_set(struct aq_hw_s *aq_hw, u32 rss_key_addr)
}
void hw_atl_tps_tx_pkt_shed_desc_tc_max_credit_set(struct aq_hw_s *aq_hw,
- u32 max_credit,
- u32 tc)
+ const u32 tc,
+ const u32 max_credit)
{
aq_hw_write_reg_bit(aq_hw, HW_ATL_TPS_DESC_TCTCREDIT_MAX_ADR(tc),
HW_ATL_TPS_DESC_TCTCREDIT_MAX_MSK,
}
void hw_atl_tps_tx_pkt_shed_desc_tc_weight_set(struct aq_hw_s *aq_hw,
- u32 tx_pkt_shed_desc_tc_weight,
- u32 tc)
+ const u32 tc,
+ const u32 weight)
{
aq_hw_write_reg_bit(aq_hw, HW_ATL_TPS_DESC_TCTWEIGHT_ADR(tc),
HW_ATL_TPS_DESC_TCTWEIGHT_MSK,
HW_ATL_TPS_DESC_TCTWEIGHT_SHIFT,
- tx_pkt_shed_desc_tc_weight);
+ weight);
}
void hw_atl_tps_tx_pkt_shed_desc_vm_arb_mode_set(struct aq_hw_s *aq_hw,
}
void hw_atl_tps_tx_pkt_shed_tc_data_max_credit_set(struct aq_hw_s *aq_hw,
- u32 max_credit,
- u32 tc)
+ const u32 tc,
+ const u32 max_credit)
{
aq_hw_write_reg_bit(aq_hw, HW_ATL_TPS_DATA_TCTCREDIT_MAX_ADR(tc),
HW_ATL_TPS_DATA_TCTCREDIT_MAX_MSK,
}
void hw_atl_tps_tx_pkt_shed_tc_data_weight_set(struct aq_hw_s *aq_hw,
- u32 tx_pkt_shed_tc_data_weight,
- u32 tc)
+ const u32 tc,
+ const u32 weight)
{
aq_hw_write_reg_bit(aq_hw, HW_ATL_TPS_DATA_TCTWEIGHT_ADR(tc),
HW_ATL_TPS_DATA_TCTWEIGHT_MSK,
HW_ATL_TPS_DATA_TCTWEIGHT_SHIFT,
- tx_pkt_shed_tc_data_weight);
+ weight);
+}
+
+void hw_atl_tps_tx_desc_rate_mode_set(struct aq_hw_s *aq_hw,
+ const u32 rate_mode)
+{
+ aq_hw_write_reg_bit(aq_hw, HW_ATL_TPS_TX_DESC_RATE_MODE_ADR,
+ HW_ATL_TPS_TX_DESC_RATE_MODE_MSK,
+ HW_ATL_TPS_TX_DESC_RATE_MODE_SHIFT,
+ rate_mode);
+}
+
+void hw_atl_tps_tx_desc_rate_en_set(struct aq_hw_s *aq_hw, const u32 desc,
+ const u32 enable)
+{
+ aq_hw_write_reg_bit(aq_hw, HW_ATL_TPS_DESC_RATE_EN_ADR(desc),
+ HW_ATL_TPS_DESC_RATE_EN_MSK,
+ HW_ATL_TPS_DESC_RATE_EN_SHIFT,
+ enable);
+}
+
+void hw_atl_tps_tx_desc_rate_x_set(struct aq_hw_s *aq_hw, const u32 desc,
+ const u32 rate_int)
+{
+ aq_hw_write_reg_bit(aq_hw, HW_ATL_TPS_DESC_RATE_X_ADR(desc),
+ HW_ATL_TPS_DESC_RATE_X_MSK,
+ HW_ATL_TPS_DESC_RATE_X_SHIFT,
+ rate_int);
+}
+
+void hw_atl_tps_tx_desc_rate_y_set(struct aq_hw_s *aq_hw, const u32 desc,
+ const u32 rate_frac)
+{
+ aq_hw_write_reg_bit(aq_hw, HW_ATL_TPS_DESC_RATE_Y_ADR(desc),
+ HW_ATL_TPS_DESC_RATE_Y_MSK,
+ HW_ATL_TPS_DESC_RATE_Y_SHIFT,
+ rate_frac);
}
/* tx */
/* set tx packet scheduler descriptor tc max credit */
void hw_atl_tps_tx_pkt_shed_desc_tc_max_credit_set(struct aq_hw_s *aq_hw,
- u32 max_credit,
- u32 tc);
+ const u32 tc,
+ const u32 max_credit);
/* set tx packet scheduler descriptor tc weight */
void hw_atl_tps_tx_pkt_shed_desc_tc_weight_set(struct aq_hw_s *aq_hw,
- u32 tx_pkt_shed_desc_tc_weight,
- u32 tc);
+ const u32 tc,
+ const u32 weight);
/* set tx packet scheduler descriptor vm arbitration mode */
void hw_atl_tps_tx_pkt_shed_desc_vm_arb_mode_set(struct aq_hw_s *aq_hw,
/* set tx packet scheduler tc data max credit */
void hw_atl_tps_tx_pkt_shed_tc_data_max_credit_set(struct aq_hw_s *aq_hw,
- u32 max_credit,
- u32 tc);
+ const u32 tc,
+ const u32 max_credit);
/* set tx packet scheduler tc data weight */
void hw_atl_tps_tx_pkt_shed_tc_data_weight_set(struct aq_hw_s *aq_hw,
- u32 tx_pkt_shed_tc_data_weight,
- u32 tc);
+ const u32 tc,
+ const u32 weight);
+
+/* set tx descriptor rate mode */
+void hw_atl_tps_tx_desc_rate_mode_set(struct aq_hw_s *aq_hw,
+ const u32 rate_mode);
+
+/* set tx packet scheduler descriptor rate enable */
+void hw_atl_tps_tx_desc_rate_en_set(struct aq_hw_s *aq_hw, const u32 desc,
+ const u32 enable);
+
+/* set tx packet scheduler descriptor rate integral value */
+void hw_atl_tps_tx_desc_rate_x_set(struct aq_hw_s *aq_hw, const u32 desc,
+ const u32 rate_int);
+
+/* set tx packet scheduler descriptor rate fractional value */
+void hw_atl_tps_tx_desc_rate_y_set(struct aq_hw_s *aq_hw, const u32 desc,
+ const u32 rate_frac);
/* tx */
/* default value of bitfield lso_tcp_flag_mid[b:0] */
#define HW_ATL_THM_LSO_TCP_FLAG_MID_DEFAULT 0x0
+/* tx tx_tc_mode bitfield definitions
+ * preprocessor definitions for the bitfield "tx_tc_mode".
+ * port="pif_tpb_tx_tc_mode_i,pif_tps_tx_tc_mode_i"
+ */
+
+/* register address for bitfield tx_tc_mode */
+#define HW_ATL_TPB_TX_TC_MODE_ADDR 0x00007900
+/* bitmask for bitfield tx_tc_mode */
+#define HW_ATL_TPB_TX_TC_MODE_MSK 0x00000100
+/* inverted bitmask for bitfield tx_tc_mode */
+#define HW_ATL_TPB_TX_TC_MODE_MSKN 0xFFFFFEFF
+/* lower bit position of bitfield tx_tc_mode */
+#define HW_ATL_TPB_TX_TC_MODE_SHIFT 8
+/* width of bitfield tx_tc_mode */
+#define HW_ATL_TPB_TX_TC_MODE_WIDTH 1
+/* default value of bitfield tx_tc_mode */
+#define HW_ATL_TPB_TX_TC_MODE_DEFAULT 0x0
+
+/* tx tx_desc_rate_mode bitfield definitions
+ * preprocessor definitions for the bitfield "tx_desc_rate_mode".
+ * port="pif_tps_desc_rate_mode_i"
+ */
+
+/* register address for bitfield tx_desc_rate_mode */
+#define HW_ATL_TPS_TX_DESC_RATE_MODE_ADR 0x00007900
+/* bitmask for bitfield tx_desc_rate_mode */
+#define HW_ATL_TPS_TX_DESC_RATE_MODE_MSK 0x00000080
+/* inverted bitmask for bitfield tx_desc_rate_mode */
+#define HW_ATL_TPS_TX_DESC_RATE_MODE_MSKN 0xFFFFFF7F
+/* lower bit position of bitfield tx_desc_rate_mode */
+#define HW_ATL_TPS_TX_DESC_RATE_MODE_SHIFT 7
+/* width of bitfield tx_desc_rate_mode */
+#define HW_ATL_TPS_TX_DESC_RATE_MODE_WIDTH 1
+/* default value of bitfield tx_desc_rate_mode */
+#define HW_ATL_TPS_TX_DESC_RATE_MODE_DEFAULT 0x0
+
/* tx tx_buf_en bitfield definitions
* preprocessor definitions for the bitfield "tx_buf_en".
* port="pif_tpb_tx_buf_en_i"
/* default value of bitfield tx_buf_en */
#define HW_ATL_TPB_TX_BUF_EN_DEFAULT 0x0
-/* register address for bitfield tx_tc_mode */
-#define HW_ATL_TPB_TX_TC_MODE_ADDR 0x00007900
-/* bitmask for bitfield tx_tc_mode */
-#define HW_ATL_TPB_TX_TC_MODE_MSK 0x00000100
-/* inverted bitmask for bitfield tx_tc_mode */
-#define HW_ATL_TPB_TX_TC_MODE_MSKN 0xFFFFFEFF
-/* lower bit position of bitfield tx_tc_mode */
-#define HW_ATL_TPB_TX_TC_MODE_SHIFT 8
-/* width of bitfield tx_tc_mode */
-#define HW_ATL_TPB_TX_TC_MODE_WIDTH 1
-/* default value of bitfield tx_tc_mode */
-#define HW_ATL_TPB_TX_TC_MODE_DEFAULT 0x0
-
/* tx tx{b}_hi_thresh[c:0] bitfield definitions
* preprocessor definitions for the bitfield "tx{b}_hi_thresh[c:0]".
* parameter: buffer {b} | stride size 0x10 | range [0, 7]
/* default value of bitfield data_tc_arb_mode */
#define HW_ATL_TPS_DATA_TC_ARB_MODE_DEFAULT 0x0
+/* tx desc{r}_rate_en bitfield definitions
+ * preprocessor definitions for the bitfield "desc{r}_rate_en".
+ * port="pif_tps_desc_rate_en_i[0]"
+ */
+
+/* register address for bitfield desc{r}_rate_en */
+#define HW_ATL_TPS_DESC_RATE_EN_ADR(desc) (0x00007408 + (desc) * 0x10)
+/* bitmask for bitfield desc{r}_rate_en */
+#define HW_ATL_TPS_DESC_RATE_EN_MSK 0x80000000
+/* inverted bitmask for bitfield desc{r}_rate_en */
+#define HW_ATL_TPS_DESC_RATE_EN_MSKN 0x7FFFFFFF
+/* lower bit position of bitfield desc{r}_rate_en */
+#define HW_ATL_TPS_DESC_RATE_EN_SHIFT 31
+/* width of bitfield desc{r}_rate_en */
+#define HW_ATL_TPS_DESC_RATE_EN_WIDTH 1
+/* default value of bitfield desc{r}_rate_en */
+#define HW_ATL_TPS_DESC_RATE_EN_DEFAULT 0x0
+
+/* tx desc{r}_rate_x bitfield definitions
+ * preprocessor definitions for the bitfield "desc{r}_rate_x".
+ * port="pif_tps_desc0_rate_x"
+ */
+/* register address for bitfield desc{r}_rate_x */
+#define HW_ATL_TPS_DESC_RATE_X_ADR(desc) (0x00007408 + (desc) * 0x10)
+/* bitmask for bitfield desc{r}_rate_x */
+#define HW_ATL_TPS_DESC_RATE_X_MSK 0x03FF0000
+/* inverted bitmask for bitfield desc{r}_rate_x */
+#define HW_ATL_TPS_DESC_RATE_X_MSKN 0xFC00FFFF
+/* lower bit position of bitfield desc{r}_rate_x */
+#define HW_ATL_TPS_DESC_RATE_X_SHIFT 16
+/* width of bitfield desc{r}_rate_x */
+#define HW_ATL_TPS_DESC_RATE_X_WIDTH 10
+/* default value of bitfield desc{r}_rate_x */
+#define HW_ATL_TPS_DESC_RATE_X_DEFAULT 0x0
+
+/* tx desc{r}_rate_y bitfield definitions
+ * preprocessor definitions for the bitfield "desc{r}_rate_y".
+ * port="pif_tps_desc0_rate_y"
+ */
+/* register address for bitfield desc{r}_rate_y */
+#define HW_ATL_TPS_DESC_RATE_Y_ADR(desc) (0x00007408 + (desc) * 0x10)
+/* bitmask for bitfield desc{r}_rate_y */
+#define HW_ATL_TPS_DESC_RATE_Y_MSK 0x00003FFF
+/* inverted bitmask for bitfield desc{r}_rate_y */
+#define HW_ATL_TPS_DESC_RATE_Y_MSKN 0xFFFFC000
+/* lower bit position of bitfield desc{r}_rate_y */
+#define HW_ATL_TPS_DESC_RATE_Y_SHIFT 0
+/* width of bitfield desc{r}_rate_y */
+#define HW_ATL_TPS_DESC_RATE_Y_WIDTH 14
+/* default value of bitfield desc{r}_rate_y */
+#define HW_ATL_TPS_DESC_RATE_Y_DEFAULT 0x0
+
/* tx desc_rate_ta_rst bitfield definitions
* preprocessor definitions for the bitfield "desc_rate_ta_rst".
* port="pif_tps_desc_rate_ta_rst_i"
#include "hw_atl/hw_atl_b0.h"
#include "hw_atl/hw_atl_utils.h"
#include "hw_atl/hw_atl_llh.h"
+#include "hw_atl/hw_atl_llh_internal.h"
#include "hw_atl2_utils.h"
#include "hw_atl2_llh.h"
#include "hw_atl2_internal.h"
.msix_irqs = 8U, \
.irq_mask = ~0U, \
.vecs = HW_ATL2_RSS_MAX, \
- .tcs = HW_ATL2_TC_MAX, \
+ .tcs_max = HW_ATL2_TC_MAX, \
.rxd_alignment = 1U, \
.rxd_size = HW_ATL2_RXD_SIZE, \
.rxds_max = HW_ATL2_MAX_RXD, \
NETIF_F_HW_VLAN_CTAG_RX | \
NETIF_F_HW_VLAN_CTAG_TX | \
NETIF_F_GSO_UDP_L4 | \
- NETIF_F_GSO_PARTIAL, \
+ NETIF_F_GSO_PARTIAL | \
+ NETIF_F_HW_TC, \
.hw_priv_flags = IFF_UNICAST_FLT, \
.flow_control = true, \
.mtu = HW_ATL2_MTU_JUMBO, \
static int hw_atl2_hw_queue_to_tc_map_set(struct aq_hw_s *self)
{
- if (!hw_atl_rpb_rpf_rx_traf_class_mode_get(self)) {
- aq_hw_write_reg(self, HW_ATL2_RX_Q_TC_MAP_ADR(0), 0x11110000);
- aq_hw_write_reg(self, HW_ATL2_RX_Q_TC_MAP_ADR(8), 0x33332222);
- aq_hw_write_reg(self, HW_ATL2_RX_Q_TC_MAP_ADR(16), 0x55554444);
- aq_hw_write_reg(self, HW_ATL2_RX_Q_TC_MAP_ADR(24), 0x77776666);
- } else {
- aq_hw_write_reg(self, HW_ATL2_RX_Q_TC_MAP_ADR(0), 0x00000000);
- aq_hw_write_reg(self, HW_ATL2_RX_Q_TC_MAP_ADR(8), 0x11111111);
- aq_hw_write_reg(self, HW_ATL2_RX_Q_TC_MAP_ADR(16), 0x22222222);
- aq_hw_write_reg(self, HW_ATL2_RX_Q_TC_MAP_ADR(24), 0x33333333);
+ struct aq_nic_cfg_s *cfg = self->aq_nic_cfg;
+ unsigned int tcs, q_per_tc;
+ unsigned int tc, q;
+ u32 rx_map = 0;
+ u32 tx_map = 0;
+
+ hw_atl2_tpb_tx_tc_q_rand_map_en_set(self, 1U);
+
+ switch (cfg->tc_mode) {
+ case AQ_TC_MODE_8TCS:
+ tcs = 8;
+ q_per_tc = 4;
+ break;
+ case AQ_TC_MODE_4TCS:
+ tcs = 4;
+ q_per_tc = 8;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ for (tc = 0; tc != tcs; tc++) {
+ unsigned int tc_q_offset = tc * q_per_tc;
+
+ for (q = tc_q_offset; q != tc_q_offset + q_per_tc; q++) {
+ rx_map |= tc << HW_ATL2_RX_Q_TC_MAP_SHIFT(q);
+ if (HW_ATL2_RX_Q_TC_MAP_ADR(q) !=
+ HW_ATL2_RX_Q_TC_MAP_ADR(q + 1)) {
+ aq_hw_write_reg(self,
+ HW_ATL2_RX_Q_TC_MAP_ADR(q),
+ rx_map);
+ rx_map = 0;
+ }
+
+ tx_map |= tc << HW_ATL2_TX_Q_TC_MAP_SHIFT(q);
+ if (HW_ATL2_TX_Q_TC_MAP_ADR(q) !=
+ HW_ATL2_TX_Q_TC_MAP_ADR(q + 1)) {
+ aq_hw_write_reg(self,
+ HW_ATL2_TX_Q_TC_MAP_ADR(q),
+ tx_map);
+ tx_map = 0;
+ }
+ }
}
return aq_hw_err_from_flags(self);
u32 tx_buff_size = HW_ATL2_TXBUF_MAX;
u32 rx_buff_size = HW_ATL2_RXBUF_MAX;
unsigned int prio = 0U;
- u32 threshold = 0U;
u32 tc = 0U;
/* TPS Descriptor rate init */
/* TPS VM init */
hw_atl_tps_tx_pkt_shed_desc_vm_arb_mode_set(self, 0U);
- /* TPS TC credits init */
- hw_atl_tps_tx_pkt_shed_desc_tc_arb_mode_set(self, 0U);
- hw_atl_tps_tx_pkt_shed_data_arb_mode_set(self, 0U);
+ tx_buff_size /= cfg->tcs;
+ rx_buff_size /= cfg->tcs;
+ for (tc = 0; tc < cfg->tcs; tc++) {
+ u32 threshold = 0U;
- tc = 0;
+ /* Tx buf size TC0 */
+ hw_atl_tpb_tx_pkt_buff_size_per_tc_set(self, tx_buff_size, tc);
- /* TX Packet Scheduler Data TC0 */
- hw_atl2_tps_tx_pkt_shed_tc_data_max_credit_set(self, 0xFFF0, tc);
- hw_atl2_tps_tx_pkt_shed_tc_data_weight_set(self, 0x640, tc);
- hw_atl_tps_tx_pkt_shed_desc_tc_max_credit_set(self, 0x50, tc);
- hw_atl_tps_tx_pkt_shed_desc_tc_weight_set(self, 0x1E, tc);
+ threshold = (tx_buff_size * (1024 / 32U) * 66U) / 100U;
+ hw_atl_tpb_tx_buff_hi_threshold_per_tc_set(self, threshold, tc);
- /* Tx buf size TC0 */
- hw_atl_tpb_tx_pkt_buff_size_per_tc_set(self, tx_buff_size, tc);
+ threshold = (tx_buff_size * (1024 / 32U) * 50U) / 100U;
+ hw_atl_tpb_tx_buff_lo_threshold_per_tc_set(self, threshold, tc);
- threshold = (tx_buff_size * (1024 / 32U) * 66U) / 100U;
- hw_atl_tpb_tx_buff_hi_threshold_per_tc_set(self, threshold, tc);
+ /* QoS Rx buf size per TC */
+ hw_atl_rpb_rx_pkt_buff_size_per_tc_set(self, rx_buff_size, tc);
- threshold = (tx_buff_size * (1024 / 32U) * 50U) / 100U;
- hw_atl_tpb_tx_buff_lo_threshold_per_tc_set(self, threshold, tc);
+ threshold = (rx_buff_size * (1024U / 32U) * 66U) / 100U;
+ hw_atl_rpb_rx_buff_hi_threshold_per_tc_set(self, threshold, tc);
- /* QoS Rx buf size per TC */
- hw_atl_rpb_rx_pkt_buff_size_per_tc_set(self, rx_buff_size, tc);
-
- threshold = (rx_buff_size * (1024U / 32U) * 66U) / 100U;
- hw_atl_rpb_rx_buff_hi_threshold_per_tc_set(self, threshold, tc);
-
- threshold = (rx_buff_size * (1024U / 32U) * 50U) / 100U;
- hw_atl_rpb_rx_buff_lo_threshold_per_tc_set(self, threshold, tc);
+ threshold = (rx_buff_size * (1024U / 32U) * 50U) / 100U;
+ hw_atl_rpb_rx_buff_lo_threshold_per_tc_set(self, threshold, tc);
+ }
/* QoS 802.1p priority -> TC mapping */
for (prio = 0; prio < 8; ++prio)
hw_atl_rpf_rpb_user_priority_tc_map_set(self, prio,
- cfg->tcs * prio / 8);
+ cfg->prio_tc_map[prio]);
- /* ATL2 Apply legacy ring to TC mapping */
+ /* ATL2 Apply ring to TC mapping */
hw_atl2_hw_queue_to_tc_map_set(self);
return aq_hw_err_from_flags(self);
static int hw_atl2_hw_rss_set(struct aq_hw_s *self,
struct aq_rss_parameters *rss_params)
{
- u8 *indirection_table = rss_params->indirection_table;
+ u8 *indirection_table = rss_params->indirection_table;
+ const u32 num_tcs = aq_hw_num_tcs(self);
+ u32 rpf_redir2_enable;
+ int tc;
int i;
- for (i = HW_ATL2_RSS_REDIRECTION_MAX; i--;)
- hw_atl2_new_rpf_rss_redir_set(self, 0, i, indirection_table[i]);
+ rpf_redir2_enable = num_tcs > 4 ? 1 : 0;
+
+ hw_atl2_rpf_redirection_table2_select_set(self, rpf_redir2_enable);
+
+ for (i = HW_ATL2_RSS_REDIRECTION_MAX; i--;) {
+ for (tc = 0; tc != num_tcs; tc++) {
+ hw_atl2_new_rpf_rss_redir_set(self, tc, i,
+ tc *
+ aq_hw_q_per_tc(self) +
+ indirection_table[i]);
+ }
+ }
+
+ return aq_hw_err_from_flags(self);
+}
+
+static int hw_atl2_hw_init_tx_tc_rate_limit(struct aq_hw_s *self)
+{
+ static const u32 max_weight = BIT(HW_ATL2_TPS_DATA_TCTWEIGHT_WIDTH) - 1;
+ /* Scale factor is based on the number of bits in fractional portion */
+ static const u32 scale = BIT(HW_ATL_TPS_DESC_RATE_Y_WIDTH);
+ static const u32 frac_msk = HW_ATL_TPS_DESC_RATE_Y_MSK >>
+ HW_ATL_TPS_DESC_RATE_Y_SHIFT;
+ const u32 link_speed = self->aq_link_status.mbps;
+ struct aq_nic_cfg_s *nic_cfg = self->aq_nic_cfg;
+ unsigned long num_min_rated_tcs = 0;
+ u32 tc_weight[AQ_CFG_TCS_MAX];
+ u32 fixed_max_credit_4b;
+ u32 fixed_max_credit;
+ u8 min_rate_msk = 0;
+ u32 sum_weight = 0;
+ int tc;
+
+ /* By default max_credit is based upon MTU (in unit of 64b) */
+ fixed_max_credit = nic_cfg->aq_hw_caps->mtu / 64;
+ /* in unit of 4b */
+ fixed_max_credit_4b = nic_cfg->aq_hw_caps->mtu / 4;
+
+ if (link_speed) {
+ min_rate_msk = nic_cfg->tc_min_rate_msk &
+ (BIT(nic_cfg->tcs) - 1);
+ num_min_rated_tcs = hweight8(min_rate_msk);
+ }
+
+ /* First, calculate weights where min_rate is specified */
+ if (num_min_rated_tcs) {
+ for (tc = 0; tc != nic_cfg->tcs; tc++) {
+ if (!nic_cfg->tc_min_rate[tc]) {
+ tc_weight[tc] = 0;
+ continue;
+ }
+
+ tc_weight[tc] = (-1L + link_speed +
+ nic_cfg->tc_min_rate[tc] *
+ max_weight) /
+ link_speed;
+ tc_weight[tc] = min(tc_weight[tc], max_weight);
+ sum_weight += tc_weight[tc];
+ }
+ }
+
+ /* WSP, if min_rate is set for at least one TC.
+ * RR otherwise.
+ */
+ hw_atl2_tps_tx_pkt_shed_data_arb_mode_set(self, min_rate_msk ? 1U : 0U);
+ /* Data TC Arbiter takes precedence over Descriptor TC Arbiter,
+ * leave Descriptor TC Arbiter as RR.
+ */
+ hw_atl_tps_tx_pkt_shed_desc_tc_arb_mode_set(self, 0U);
+
+ hw_atl_tps_tx_desc_rate_mode_set(self, nic_cfg->is_qos ? 1U : 0U);
+
+ for (tc = 0; tc != nic_cfg->tcs; tc++) {
+ const u32 en = (nic_cfg->tc_max_rate[tc] != 0) ? 1U : 0U;
+ const u32 desc = AQ_NIC_CFG_TCVEC2RING(nic_cfg, tc, 0);
+ u32 weight, max_credit;
+
+ hw_atl_tps_tx_pkt_shed_desc_tc_max_credit_set(self, tc,
+ fixed_max_credit);
+ hw_atl_tps_tx_pkt_shed_desc_tc_weight_set(self, tc, 0x1E);
+
+ if (num_min_rated_tcs) {
+ weight = tc_weight[tc];
+
+ if (!weight && sum_weight < max_weight)
+ weight = (max_weight - sum_weight) /
+ (nic_cfg->tcs - num_min_rated_tcs);
+ else if (!weight)
+ weight = 0x640;
+
+ max_credit = max(2 * weight, fixed_max_credit_4b);
+ } else {
+ weight = 0x640;
+ max_credit = 0xFFF0;
+ }
+
+ hw_atl2_tps_tx_pkt_shed_tc_data_weight_set(self, tc, weight);
+ hw_atl2_tps_tx_pkt_shed_tc_data_max_credit_set(self, tc,
+ max_credit);
+
+ hw_atl_tps_tx_desc_rate_en_set(self, desc, en);
+
+ if (en) {
+ /* Nominal rate is always 10G */
+ const u32 rate = 10000U * scale /
+ nic_cfg->tc_max_rate[tc];
+ const u32 rate_int = rate >>
+ HW_ATL_TPS_DESC_RATE_Y_WIDTH;
+ const u32 rate_frac = rate & frac_msk;
+
+ hw_atl_tps_tx_desc_rate_x_set(self, desc, rate_int);
+ hw_atl_tps_tx_desc_rate_y_set(self, desc, rate_frac);
+ } else {
+ /* A value of 1 indicates the queue is not
+ * rate controlled.
+ */
+ hw_atl_tps_tx_desc_rate_x_set(self, desc, 1U);
+ hw_atl_tps_tx_desc_rate_y_set(self, desc, 0U);
+ }
+ }
+ for (tc = nic_cfg->tcs; tc != AQ_CFG_TCS_MAX; tc++) {
+ const u32 desc = AQ_NIC_CFG_TCVEC2RING(nic_cfg, tc, 0);
+
+ hw_atl_tps_tx_desc_rate_en_set(self, desc, 0U);
+ hw_atl_tps_tx_desc_rate_x_set(self, desc, 1U);
+ hw_atl_tps_tx_desc_rate_y_set(self, desc, 0U);
+ }
return aq_hw_err_from_flags(self);
}
static int hw_atl2_hw_init_tx_path(struct aq_hw_s *self)
{
+ struct aq_nic_cfg_s *nic_cfg = self->aq_nic_cfg;
+
/* Tx TC/RSS number config */
- hw_atl_tpb_tps_tx_tc_mode_set(self, 1U);
+ hw_atl_tpb_tps_tx_tc_mode_set(self, nic_cfg->tc_mode);
hw_atl_thm_lso_tcp_flag_of_first_pkt_set(self, 0x0FF6U);
hw_atl_thm_lso_tcp_flag_of_middle_pkt_set(self, 0x0FF6U);
static void hw_atl2_hw_init_new_rx_filters(struct aq_hw_s *self)
{
struct hw_atl2_priv *priv = (struct hw_atl2_priv *)self->priv;
+ u8 *prio_tc_map = self->aq_nic_cfg->prio_tc_map;
+ u16 action;
u8 index;
+ int i;
+ /* Action Resolver Table (ART) is used by RPF to decide which action
+ * to take with a packet based upon input tag and tag mask, where:
+ * - input tag is a combination of 3-bit VLan Prio (PTP) and
+ * 29-bit concatenation of all tags from filter block;
+ * - tag mask is a mask used for matching against input tag.
+ * The input_tag is compared with the all the Requested_tags in the
+ * Record table to find a match. Action field of the selected matched
+ * REC entry is used for further processing. If multiple entries match,
+ * the lowest REC entry, Action field will be selected.
+ */
hw_atl2_rpf_act_rslvr_section_en_set(self, 0xFFFF);
hw_atl2_rpfl2_uc_flr_tag_set(self, HW_ATL2_RPF_TAG_BASE_UC,
HW_ATL2_MAC_UC);
hw_atl2_rpfl2_bc_flr_tag_set(self, HW_ATL2_RPF_TAG_BASE_UC);
+ /* FW reserves the beginning of ART, thus all driver entries must
+ * start from the offset specified in FW caps.
+ */
index = priv->art_base_index + HW_ATL2_RPF_L2_PROMISC_OFF_INDEX;
hw_atl2_act_rslvr_table_set(self, index, 0,
HW_ATL2_RPF_TAG_UC_MASK |
HW_ATL2_RPF_TAG_UNTAG_MASK,
HW_ATL2_ACTION_DROP);
- index = priv->art_base_index + HW_ATL2_RPF_VLAN_INDEX;
- hw_atl2_act_rslvr_table_set(self, index, HW_ATL2_RPF_TAG_BASE_VLAN,
- HW_ATL2_RPF_TAG_VLAN_MASK,
- HW_ATL2_ACTION_ASSIGN_TC(0));
-
- index = priv->art_base_index + HW_ATL2_RPF_MAC_INDEX;
- hw_atl2_act_rslvr_table_set(self, index, HW_ATL2_RPF_TAG_BASE_UC,
- HW_ATL2_RPF_TAG_UC_MASK,
- HW_ATL2_ACTION_ASSIGN_TC(0));
-
- index = priv->art_base_index + HW_ATL2_RPF_ALLMC_INDEX;
- hw_atl2_act_rslvr_table_set(self, index, HW_ATL2_RPF_TAG_BASE_ALLMC,
- HW_ATL2_RPF_TAG_ALLMC_MASK,
- HW_ATL2_ACTION_ASSIGN_TC(0));
+ /* Configure ART to map given VLan Prio (PCP) to the TC index for
+ * RSS redirection table.
+ */
+ for (i = 0; i < 8; i++) {
+ action = HW_ATL2_ACTION_ASSIGN_TC(prio_tc_map[i]);
- index = priv->art_base_index + HW_ATL2_RPF_UNTAG_INDEX;
- hw_atl2_act_rslvr_table_set(self, index, HW_ATL2_RPF_TAG_UNTAG_MASK,
- HW_ATL2_RPF_TAG_UNTAG_MASK,
- HW_ATL2_ACTION_ASSIGN_TC(0));
-
- index = priv->art_base_index + HW_ATL2_RPF_VLAN_PROMISC_ON_INDEX;
- hw_atl2_act_rslvr_table_set(self, index, 0, HW_ATL2_RPF_TAG_VLAN_MASK,
- HW_ATL2_ACTION_DISABLE);
-
- index = priv->art_base_index + HW_ATL2_RPF_L2_PROMISC_ON_INDEX;
- hw_atl2_act_rslvr_table_set(self, index, 0, HW_ATL2_RPF_TAG_UC_MASK,
- HW_ATL2_ACTION_DISABLE);
+ index = priv->art_base_index + HW_ATL2_RPF_PCP_TO_TC_INDEX + i;
+ hw_atl2_act_rslvr_table_set(self, index,
+ i << HW_ATL2_RPF_TAG_PCP_OFFSET,
+ HW_ATL2_RPF_TAG_PCP_MASK, action);
+ }
}
static void hw_atl2_hw_new_rx_filter_vlan_promisc(struct aq_hw_s *self,
int i;
/* Rx TC/RSS number config */
- hw_atl_rpb_rpf_rx_traf_class_mode_set(self, 1U);
+ hw_atl_rpb_rpf_rx_traf_class_mode_set(self, cfg->tc_mode);
/* Rx flow control */
hw_atl_rpb_rx_flow_ctl_mode_set(self, 1U);
hw_atl2_rpf_rss_hash_type_set(self, HW_ATL2_RPF_RSS_HASH_TYPE_ALL);
/* RSS Ring selection */
- hw_atl_reg_rx_flr_rss_control1set(self, cfg->is_rss ?
- HW_ATL_RSS_ENABLED_3INDEX_BITS :
- HW_ATL_RSS_DISABLED);
+ hw_atl_b0_hw_init_rx_rss_ctrl1(self);
/* Multicast filters */
for (i = HW_ATL2_MAC_MAX; i--;) {
.hw_interrupt_moderation_set = hw_atl2_hw_interrupt_moderation_set,
.hw_rss_set = hw_atl2_hw_rss_set,
.hw_rss_hash_set = hw_atl_b0_hw_rss_hash_set,
+ .hw_tc_rate_limit_set = hw_atl2_hw_init_tx_tc_rate_limit,
.hw_get_hw_stats = hw_atl2_utils_get_hw_stats,
.hw_get_fw_version = hw_atl2_utils_get_fw_version,
.hw_set_offload = hw_atl_b0_hw_offload_set,
#define HW_ATL2_RSS_REDIRECTION_MAX 64U
-#define HW_ATL2_TC_MAX 1U
+#define HW_ATL2_TC_MAX 8U
#define HW_ATL2_RSS_MAX 8U
#define HW_ATL2_INTR_MODER_MAX 0x1FF
HW_ATL2_RPF_VLAN_USER_INDEX = HW_ATL2_RPF_ET_PCP_USER_INDEX + 16,
HW_ATL2_RPF_PCP_TO_TC_INDEX = HW_ATL2_RPF_VLAN_USER_INDEX +
HW_ATL_VLAN_MAX_FILTERS,
- HW_ATL2_RPF_VLAN_INDEX = HW_ATL2_RPF_PCP_TO_TC_INDEX +
- AQ_CFG_TCS_MAX,
- HW_ATL2_RPF_MAC_INDEX,
- HW_ATL2_RPF_ALLMC_INDEX,
- HW_ATL2_RPF_UNTAG_INDEX,
- HW_ATL2_RPF_VLAN_PROMISC_ON_INDEX,
- HW_ATL2_RPF_L2_PROMISC_ON_INDEX,
};
#define HW_ATL2_ACTION(ACTION, RSS, INDEX, VALID) \
HW_ATL2_RPF_RSS_HASH_TYPE_IPV6_EX_UDP,
};
-#define HW_ATL_RSS_DISABLED 0x00000000U
-#define HW_ATL_RSS_ENABLED_3INDEX_BITS 0xB3333333U
-
#define HW_ATL_MCAST_FLT_ANY_TO_HOST 0x00010FFFU
struct hw_atl2_priv {
#include "hw_atl2_llh_internal.h"
#include "aq_hw_utils.h"
+void hw_atl2_rpf_redirection_table2_select_set(struct aq_hw_s *aq_hw,
+ u32 select)
+{
+ aq_hw_write_reg_bit(aq_hw, HW_ATL2_RPF_PIF_RPF_REDIR2_ENI_ADR,
+ HW_ATL2_RPF_PIF_RPF_REDIR2_ENI_MSK,
+ HW_ATL2_RPF_PIF_RPF_REDIR2_ENI_SHIFT, select);
+}
+
void hw_atl2_rpf_rss_hash_type_set(struct aq_hw_s *aq_hw, u32 rss_hash_type)
{
aq_hw_write_reg_bit(aq_hw, HW_ATL2_RPF_PIF_RPF_RSS_HASH_TYPEI_ADR,
/* TX */
+void hw_atl2_tpb_tx_tc_q_rand_map_en_set(struct aq_hw_s *aq_hw,
+ const u32 tc_q_rand_map_en)
+{
+ aq_hw_write_reg_bit(aq_hw, HW_ATL2_TPB_TX_TC_Q_RAND_MAP_EN_ADR,
+ HW_ATL2_TPB_TX_TC_Q_RAND_MAP_EN_MSK,
+ HW_ATL2_TPB_TX_TC_Q_RAND_MAP_EN_SHIFT,
+ tc_q_rand_map_en);
+}
+
void hw_atl2_tpb_tx_buf_clk_gate_en_set(struct aq_hw_s *aq_hw, u32 clk_gate_en)
{
aq_hw_write_reg_bit(aq_hw, HW_ATL2_TPB_TX_BUF_CLK_GATE_EN_ADR,
tx_intr_moderation_ctl);
}
+void hw_atl2_tps_tx_pkt_shed_data_arb_mode_set(struct aq_hw_s *aq_hw,
+ const u32 data_arb_mode)
+{
+ aq_hw_write_reg_bit(aq_hw, HW_ATL2_TPS_DATA_TC_ARB_MODE_ADR,
+ HW_ATL2_TPS_DATA_TC_ARB_MODE_MSK,
+ HW_ATL2_TPS_DATA_TC_ARB_MODE_SHIFT,
+ data_arb_mode);
+}
+
void hw_atl2_tps_tx_pkt_shed_tc_data_max_credit_set(struct aq_hw_s *aq_hw,
- u32 max_credit,
- u32 tc)
+ const u32 tc,
+ const u32 max_credit)
{
aq_hw_write_reg_bit(aq_hw, HW_ATL2_TPS_DATA_TCTCREDIT_MAX_ADR(tc),
HW_ATL2_TPS_DATA_TCTCREDIT_MAX_MSK,
}
void hw_atl2_tps_tx_pkt_shed_tc_data_weight_set(struct aq_hw_s *aq_hw,
- u32 tx_pkt_shed_tc_data_weight,
- u32 tc)
+ const u32 tc,
+ const u32 weight)
{
aq_hw_write_reg_bit(aq_hw, HW_ATL2_TPS_DATA_TCTWEIGHT_ADR(tc),
HW_ATL2_TPS_DATA_TCTWEIGHT_MSK,
HW_ATL2_TPS_DATA_TCTWEIGHT_SHIFT,
- tx_pkt_shed_tc_data_weight);
+ weight);
}
u32 hw_atl2_get_hw_version(struct aq_hw_s *aq_hw)
u32 tx_intr_moderation_ctl,
u32 queue);
+/* Set Redirection Table 2 Select */
+void hw_atl2_rpf_redirection_table2_select_set(struct aq_hw_s *aq_hw,
+ u32 select);
+
/** Set RSS HASH type */
void hw_atl2_rpf_rss_hash_type_set(struct aq_hw_s *aq_hw, u32 rss_hash_type);
/* Set VLAN filter tag */
void hw_atl2_rpf_vlan_flr_tag_set(struct aq_hw_s *aq_hw, u32 tag, u32 filter);
+/* set tx random TC-queue mapping enable bit */
+void hw_atl2_tpb_tx_tc_q_rand_map_en_set(struct aq_hw_s *aq_hw,
+ const u32 tc_q_rand_map_en);
+
/* set tx buffer clock gate enable */
void hw_atl2_tpb_tx_buf_clk_gate_en_set(struct aq_hw_s *aq_hw, u32 clk_gate_en);
+void hw_atl2_tps_tx_pkt_shed_data_arb_mode_set(struct aq_hw_s *aq_hw,
+ const u32 data_arb_mode);
+
/* set tx packet scheduler tc data max credit */
void hw_atl2_tps_tx_pkt_shed_tc_data_max_credit_set(struct aq_hw_s *aq_hw,
- u32 max_credit,
- u32 tc);
+ const u32 tc,
+ const u32 max_credit);
/* set tx packet scheduler tc data weight */
void hw_atl2_tps_tx_pkt_shed_tc_data_weight_set(struct aq_hw_s *aq_hw,
- u32 tx_pkt_shed_tc_data_weight,
- u32 tc);
+ const u32 tc,
+ const u32 weight);
u32 hw_atl2_get_hw_version(struct aq_hw_s *aq_hw);
#ifndef HW_ATL2_LLH_INTERNAL_H
#define HW_ATL2_LLH_INTERNAL_H
+/* RX pif_rpf_redir_2_en_i Bitfield Definitions
+ * PORT="pif_rpf_redir_2_en_i"
+ */
+#define HW_ATL2_RPF_PIF_RPF_REDIR2_ENI_ADR 0x000054C8
+#define HW_ATL2_RPF_PIF_RPF_REDIR2_ENI_MSK 0x00001000
+#define HW_ATL2_RPF_PIF_RPF_REDIR2_ENI_MSKN 0xFFFFEFFF
+#define HW_ATL2_RPF_PIF_RPF_REDIR2_ENI_SHIFT 12
+#define HW_ATL2_RPF_PIF_RPF_REDIR2_ENI_WIDTH 1
+#define HW_ATL2_RPF_PIF_RPF_REDIR2_ENI_DEFAULT 0x0
+
/* RX pif_rpf_rss_hash_type_i Bitfield Definitions
*/
#define HW_ATL2_RPF_PIF_RPF_RSS_HASH_TYPEI_ADR 0x000054C8
/* Default value of bitfield rx_q{Q}_tc_map[2:0] */
#define HW_ATL2_RX_Q_TC_MAP_DEFAULT 0x0
+/* tx tx_tc_q_rand_map_en bitfield definitions
+ * preprocessor definitions for the bitfield "tx_tc_q_rand_map_en".
+ * port="pif_tpb_tx_tc_q_rand_map_en_i"
+ */
+
+/* register address for bitfield tx_tc_q_rand_map_en */
+#define HW_ATL2_TPB_TX_TC_Q_RAND_MAP_EN_ADR 0x00007900
+/* bitmask for bitfield tx_tc_q_rand_map_en */
+#define HW_ATL2_TPB_TX_TC_Q_RAND_MAP_EN_MSK 0x00000200
+/* inverted bitmask for bitfield tx_tc_q_rand_map_en */
+#define HW_ATL2_TPB_TX_TC_Q_RAND_MAP_EN_MSKN 0xFFFFFDFF
+/* lower bit position of bitfield tx_tc_q_rand_map_en */
+#define HW_ATL2_TPB_TX_TC_Q_RAND_MAP_EN_SHIFT 9
+/* width of bitfield tx_tc_q_rand_map_en */
+#define HW_ATL2_TPB_TX_TC_Q_RAND_MAP_EN_WIDTH 1
+/* default value of bitfield tx_tc_q_rand_map_en */
+#define HW_ATL2_TPB_TX_TC_Q_RAND_MAP_EN_DEFAULT 0x0
+
/* tx tx_buffer_clk_gate_en bitfield definitions
* preprocessor definitions for the bitfield "tx_buffer_clk_gate_en".
* port="pif_tpb_tx_buffer_clk_gate_en_i"
/* default value of bitfield tx_buffer_clk_gate_en */
#define HW_ATL2_TPB_TX_BUF_CLK_GATE_EN_DEFAULT 0x0
-/* tx data_tc{t}_credit_max[b:0] bitfield definitions
- * preprocessor definitions for the bitfield "data_tc{t}_credit_max[b:0]".
+/* tx tx_q_tc_map{q} bitfield definitions
+ * preprocessor definitions for the bitfield "tx_q_tc_map{q}".
+ * parameter: queue {q} | bit-level stride | range [0, 31]
+ * port="pif_tpb_tx_q_tc_map0_i[2:0]"
+ */
+
+/* register address for bitfield tx_q_tc_map{q} */
+#define HW_ATL2_TX_Q_TC_MAP_ADR(queue) \
+ (((queue) < 32) ? 0x0000799C + ((queue) / 4) * 4 : 0)
+/* lower bit position of bitfield tx_q_tc_map{q} */
+#define HW_ATL2_TX_Q_TC_MAP_SHIFT(queue) \
+ (((queue) < 32) ? ((queue) * 8) % 32 : 0)
+/* width of bitfield tx_q_tc_map{q} */
+#define HW_ATL2_TX_Q_TC_MAP_WIDTH 3
+/* default value of bitfield tx_q_tc_map{q} */
+#define HW_ATL2_TX_Q_TC_MAP_DEFAULT 0x0
+
+/* tx data_tc_arb_mode bitfield definitions
+ * preprocessor definitions for the bitfield "data_tc_arb_mode".
+ * port="pif_tps_data_tc_arb_mode_i"
+ */
+
+/* register address for bitfield data_tc_arb_mode */
+#define HW_ATL2_TPS_DATA_TC_ARB_MODE_ADR 0x00007100
+/* bitmask for bitfield data_tc_arb_mode */
+#define HW_ATL2_TPS_DATA_TC_ARB_MODE_MSK 0x00000003
+/* inverted bitmask for bitfield data_tc_arb_mode */
+#define HW_ATL2_TPS_DATA_TC_ARB_MODE_MSKN 0xfffffffc
+/* lower bit position of bitfield data_tc_arb_mode */
+#define HW_ATL2_TPS_DATA_TC_ARB_MODE_SHIFT 0
+/* width of bitfield data_tc_arb_mode */
+#define HW_ATL2_TPS_DATA_TC_ARB_MODE_WIDTH 2
+/* default value of bitfield data_tc_arb_mode */
+#define HW_ATL2_TPS_DATA_TC_ARB_MODE_DEFAULT 0x0
+
+/* tx data_tc{t}_credit_max[f:0] bitfield definitions
+ * preprocessor definitions for the bitfield "data_tc{t}_credit_max[f:0]".
* parameter: tc {t} | stride size 0x4 | range [0, 7]
- * port="pif_tps_data_tc0_credit_max_i[11:0]"
+ * port="pif_tps_data_tc0_credit_max_i[15:0]"
*/
-/* register address for bitfield data_tc{t}_credit_max[b:0] */
+/* register address for bitfield data_tc{t}_credit_max[f:0] */
#define HW_ATL2_TPS_DATA_TCTCREDIT_MAX_ADR(tc) (0x00007110 + (tc) * 0x4)
-/* bitmask for bitfield data_tc{t}_credit_max[b:0] */
-#define HW_ATL2_TPS_DATA_TCTCREDIT_MAX_MSK 0x0fff0000
-/* inverted bitmask for bitfield data_tc{t}_credit_max[b:0] */
-#define HW_ATL2_TPS_DATA_TCTCREDIT_MAX_MSKN 0xf000ffff
-/* lower bit position of bitfield data_tc{t}_credit_max[b:0] */
+/* bitmask for bitfield data_tc{t}_credit_max[f:0] */
+#define HW_ATL2_TPS_DATA_TCTCREDIT_MAX_MSK 0xffff0000
+/* inverted bitmask for bitfield data_tc{t}_credit_max[f:0] */
+#define HW_ATL2_TPS_DATA_TCTCREDIT_MAX_MSKN 0x0000ffff
+/* lower bit position of bitfield data_tc{t}_credit_max[f:0] */
#define HW_ATL2_TPS_DATA_TCTCREDIT_MAX_SHIFT 16
-/* width of bitfield data_tc{t}_credit_max[b:0] */
-#define HW_ATL2_TPS_DATA_TCTCREDIT_MAX_WIDTH 12
-/* default value of bitfield data_tc{t}_credit_max[b:0] */
+/* width of bitfield data_tc{t}_credit_max[f:0] */
+#define HW_ATL2_TPS_DATA_TCTCREDIT_MAX_WIDTH 16
+/* default value of bitfield data_tc{t}_credit_max[f:0] */
#define HW_ATL2_TPS_DATA_TCTCREDIT_MAX_DEFAULT 0x0
-/* tx data_tc{t}_weight[8:0] bitfield definitions
- * preprocessor definitions for the bitfield "data_tc{t}_weight[8:0]".
+/* tx data_tc{t}_weight[e:0] bitfield definitions
+ * preprocessor definitions for the bitfield "data_tc{t}_weight[e:0]".
* parameter: tc {t} | stride size 0x4 | range [0, 7]
- * port="pif_tps_data_tc0_weight_i[8:0]"
+ * port="pif_tps_data_tc0_weight_i[14:0]"
*/
-/* register address for bitfield data_tc{t}_weight[8:0] */
+/* register address for bitfield data_tc{t}_weight[e:0] */
#define HW_ATL2_TPS_DATA_TCTWEIGHT_ADR(tc) (0x00007110 + (tc) * 0x4)
-/* bitmask for bitfield data_tc{t}_weight[8:0] */
-#define HW_ATL2_TPS_DATA_TCTWEIGHT_MSK 0x000001ff
-/* inverted bitmask for bitfield data_tc{t}_weight[8:0] */
-#define HW_ATL2_TPS_DATA_TCTWEIGHT_MSKN 0xfffffe00
-/* lower bit position of bitfield data_tc{t}_weight[8:0] */
+/* bitmask for bitfield data_tc{t}_weight[e:0] */
+#define HW_ATL2_TPS_DATA_TCTWEIGHT_MSK 0x00007fff
+/* inverted bitmask for bitfield data_tc{t}_weight[e:0] */
+#define HW_ATL2_TPS_DATA_TCTWEIGHT_MSKN 0xffff8000
+/* lower bit position of bitfield data_tc{t}_weight[e:0] */
#define HW_ATL2_TPS_DATA_TCTWEIGHT_SHIFT 0
-/* width of bitfield data_tc{t}_weight[8:0] */
-#define HW_ATL2_TPS_DATA_TCTWEIGHT_WIDTH 9
-/* default value of bitfield data_tc{t}_weight[8:0] */
+/* width of bitfield data_tc{t}_weight[e:0] */
+#define HW_ATL2_TPS_DATA_TCTWEIGHT_WIDTH 15
+/* default value of bitfield data_tc{t}_weight[e:0] */
#define HW_ATL2_TPS_DATA_TCTWEIGHT_DEFAULT 0x0
/* tx interrupt moderation control register definitions
#define CH_WARN(adap, fmt, ...) dev_warn(adap->pdev_dev, fmt, ## __VA_ARGS__)
extern struct list_head adapter_list;
+extern struct list_head uld_list;
extern struct mutex uld_mutex;
/* Suspend an Ethernet Tx queue with fewer available descriptors than this.
u16 ntxq; /* # of egress uld queues */
};
+/* struct to maintain ULD list to reallocate ULD resources on hotplug */
+struct cxgb4_uld_list {
+ struct cxgb4_uld_info uld_info;
+ struct list_head list_node;
+ enum cxgb4_uld uld_type;
+};
+
enum sge_eosw_state {
CXGB4_EO_STATE_CLOSED = 0, /* Not ready to accept traffic */
CXGB4_EO_STATE_FLOWC_OPEN_SEND, /* Send FLOWC open request */
LIST_HEAD(adapter_list);
DEFINE_MUTEX(uld_mutex);
+LIST_HEAD(uld_list);
static int cfg_queues(struct adapter *adap);
/* PCIe EEH recovery on powerpc platforms needs fundamental reset */
pdev->needs_freset = 1;
- if (is_uld(adapter)) {
- mutex_lock(&uld_mutex);
- list_add_tail(&adapter->list_node, &adapter_list);
- mutex_unlock(&uld_mutex);
- }
+ if (is_uld(adapter))
+ cxgb4_uld_enable(adapter);
if (!is_t4(adapter->params.chip))
cxgb4_ptp_init(adapter);
}
#endif
+static void cxgb4_uld_alloc_resources(struct adapter *adap,
+ enum cxgb4_uld type,
+ const struct cxgb4_uld_info *p)
+{
+ int ret = 0;
+
+ if ((type == CXGB4_ULD_CRYPTO && !is_pci_uld(adap)) ||
+ (type != CXGB4_ULD_CRYPTO && !is_offload(adap)))
+ return;
+ if (type == CXGB4_ULD_ISCSIT && is_t4(adap->params.chip))
+ return;
+ ret = cfg_queues_uld(adap, type, p);
+ if (ret)
+ goto out;
+ ret = setup_sge_queues_uld(adap, type, p->lro);
+ if (ret)
+ goto free_queues;
+ if (adap->flags & CXGB4_USING_MSIX) {
+ ret = request_msix_queue_irqs_uld(adap, type);
+ if (ret)
+ goto free_rxq;
+ }
+ if (adap->flags & CXGB4_FULL_INIT_DONE)
+ enable_rx_uld(adap, type);
+#ifdef CONFIG_CHELSIO_TLS_DEVICE
+ /* send mbox to enable ktls related settings. */
+ if (type == CXGB4_ULD_CRYPTO &&
+ (adap->params.crypto & FW_CAPS_CONFIG_TX_TLS_HW))
+ cxgb4_set_ktls_feature(adap, 1);
+#endif
+ if (adap->uld[type].add)
+ goto free_irq;
+ ret = setup_sge_txq_uld(adap, type, p);
+ if (ret)
+ goto free_irq;
+ adap->uld[type] = *p;
+ ret = uld_attach(adap, type);
+ if (ret)
+ goto free_txq;
+ return;
+free_txq:
+ release_sge_txq_uld(adap, type);
+free_irq:
+ if (adap->flags & CXGB4_FULL_INIT_DONE)
+ quiesce_rx_uld(adap, type);
+ if (adap->flags & CXGB4_USING_MSIX)
+ free_msix_queue_irqs_uld(adap, type);
+free_rxq:
+ free_sge_queues_uld(adap, type);
+free_queues:
+ free_queues_uld(adap, type);
+out:
+ dev_warn(adap->pdev_dev,
+ "ULD registration failed for uld type %d\n", type);
+}
+
+void cxgb4_uld_enable(struct adapter *adap)
+{
+ struct cxgb4_uld_list *uld_entry;
+
+ mutex_lock(&uld_mutex);
+ list_add_tail(&adap->list_node, &adapter_list);
+ list_for_each_entry(uld_entry, &uld_list, list_node)
+ cxgb4_uld_alloc_resources(adap, uld_entry->uld_type,
+ &uld_entry->uld_info);
+ mutex_unlock(&uld_mutex);
+}
+
/* cxgb4_register_uld - register an upper-layer driver
* @type: the ULD type
* @p: the ULD methods
void cxgb4_register_uld(enum cxgb4_uld type,
const struct cxgb4_uld_info *p)
{
+ struct cxgb4_uld_list *uld_entry;
struct adapter *adap;
- int ret = 0;
if (type >= CXGB4_ULD_MAX)
return;
+ uld_entry = kzalloc(sizeof(*uld_entry), GFP_KERNEL);
+ if (!uld_entry)
+ return;
+
+ memcpy(&uld_entry->uld_info, p, sizeof(struct cxgb4_uld_info));
mutex_lock(&uld_mutex);
- list_for_each_entry(adap, &adapter_list, list_node) {
- if ((type == CXGB4_ULD_CRYPTO && !is_pci_uld(adap)) ||
- (type != CXGB4_ULD_CRYPTO && !is_offload(adap)))
- continue;
- if (type == CXGB4_ULD_ISCSIT && is_t4(adap->params.chip))
- continue;
- ret = cfg_queues_uld(adap, type, p);
- if (ret)
- goto out;
- ret = setup_sge_queues_uld(adap, type, p->lro);
- if (ret)
- goto free_queues;
- if (adap->flags & CXGB4_USING_MSIX) {
- ret = request_msix_queue_irqs_uld(adap, type);
- if (ret)
- goto free_rxq;
- }
- if (adap->flags & CXGB4_FULL_INIT_DONE)
- enable_rx_uld(adap, type);
-#ifdef CONFIG_CHELSIO_TLS_DEVICE
- /* send mbox to enable ktls related settings. */
- if (type == CXGB4_ULD_CRYPTO &&
- (adap->params.crypto & FW_CAPS_CONFIG_TX_TLS_HW))
- cxgb4_set_ktls_feature(adap, 1);
-#endif
- if (adap->uld[type].add)
- goto free_irq;
- ret = setup_sge_txq_uld(adap, type, p);
- if (ret)
- goto free_irq;
- adap->uld[type] = *p;
- ret = uld_attach(adap, type);
- if (ret)
- goto free_txq;
- continue;
-free_txq:
- release_sge_txq_uld(adap, type);
-free_irq:
- if (adap->flags & CXGB4_FULL_INIT_DONE)
- quiesce_rx_uld(adap, type);
- if (adap->flags & CXGB4_USING_MSIX)
- free_msix_queue_irqs_uld(adap, type);
-free_rxq:
- free_sge_queues_uld(adap, type);
-free_queues:
- free_queues_uld(adap, type);
-out:
- dev_warn(adap->pdev_dev,
- "ULD registration failed for uld type %d\n", type);
- }
+ list_for_each_entry(adap, &adapter_list, list_node)
+ cxgb4_uld_alloc_resources(adap, type, p);
+
+ uld_entry->uld_type = type;
+ list_add_tail(&uld_entry->list_node, &uld_list);
mutex_unlock(&uld_mutex);
return;
}
*/
int cxgb4_unregister_uld(enum cxgb4_uld type)
{
+ struct cxgb4_uld_list *uld_entry, *tmp;
struct adapter *adap;
if (type >= CXGB4_ULD_MAX)
cxgb4_set_ktls_feature(adap, 0);
#endif
}
+
+ list_for_each_entry_safe(uld_entry, tmp, &uld_list, list_node) {
+ if (uld_entry->uld_type == type) {
+ list_del(&uld_entry->list_node);
+ kfree(uld_entry);
+ }
+ }
mutex_unlock(&uld_mutex);
return 0;
CXGB4_CONTROL_DB_DROP,
};
+struct adapter;
struct pci_dev;
struct l2t_data;
struct net_device;
int (*tx_handler)(struct sk_buff *skb, struct net_device *dev);
};
+void cxgb4_uld_enable(struct adapter *adap);
void cxgb4_register_uld(enum cxgb4_uld type, const struct cxgb4_uld_info *p);
int cxgb4_unregister_uld(enum cxgb4_uld type);
int cxgb4_ofld_send(struct net_device *dev, struct sk_buff *skb);
WARN_ON(in_interrupt());
while (test_and_set_bit(__E1000_RESETTING, &adapter->flags))
msleep(1);
- e1000_down(adapter);
- e1000_up(adapter);
+
+ /* only run the task if not already down */
+ if (!test_bit(__E1000_DOWN, &adapter->flags)) {
+ e1000_down(adapter);
+ e1000_up(adapter);
+ }
+
clear_bit(__E1000_RESETTING, &adapter->flags);
}
struct e1000_hw *hw = &adapter->hw;
int count = E1000_CHECK_RESET_COUNT;
- while (test_bit(__E1000_RESETTING, &adapter->flags) && count--)
+ while (test_and_set_bit(__E1000_RESETTING, &adapter->flags) && count--)
usleep_range(10000, 20000);
- WARN_ON(test_bit(__E1000_RESETTING, &adapter->flags));
+ WARN_ON(count < 0);
+
+ /* signal that we're down so that the reset task will no longer run */
+ set_bit(__E1000_DOWN, &adapter->flags);
+ clear_bit(__E1000_RESETTING, &adapter->flags);
+
e1000_down(adapter);
e1000_power_down_phy(adapter);
e1000_free_irq(adapter);
* so forcibly disable it.
*/
hw->dev_spec.ich8lan.ulp_state = e1000_ulp_state_unknown;
- e1000_disable_ulp_lpt_lp(hw, true);
+ ret_val = e1000_disable_ulp_lpt_lp(hw, true);
+ if (ret_val) {
+ e_warn("Failed to disable ULP\n");
+ goto out;
+ }
ret_val = hw->phy.ops.acquire(hw);
if (ret_val) {
{0, NULL}
};
+struct e1000e_me_supported {
+ u16 device_id; /* supported device ID */
+};
+
+static const struct e1000e_me_supported me_supported[] = {
+ {E1000_DEV_ID_PCH_LPT_I217_LM},
+ {E1000_DEV_ID_PCH_LPTLP_I218_LM},
+ {E1000_DEV_ID_PCH_I218_LM2},
+ {E1000_DEV_ID_PCH_I218_LM3},
+ {E1000_DEV_ID_PCH_SPT_I219_LM},
+ {E1000_DEV_ID_PCH_SPT_I219_LM2},
+ {E1000_DEV_ID_PCH_LBG_I219_LM3},
+ {E1000_DEV_ID_PCH_SPT_I219_LM4},
+ {E1000_DEV_ID_PCH_SPT_I219_LM5},
+ {E1000_DEV_ID_PCH_CNP_I219_LM6},
+ {E1000_DEV_ID_PCH_CNP_I219_LM7},
+ {E1000_DEV_ID_PCH_ICP_I219_LM8},
+ {E1000_DEV_ID_PCH_ICP_I219_LM9},
+ {E1000_DEV_ID_PCH_CMP_I219_LM10},
+ {E1000_DEV_ID_PCH_CMP_I219_LM11},
+ {E1000_DEV_ID_PCH_CMP_I219_LM12},
+ {E1000_DEV_ID_PCH_TGP_I219_LM13},
+ {E1000_DEV_ID_PCH_TGP_I219_LM14},
+ {E1000_DEV_ID_PCH_TGP_I219_LM15},
+ {0}
+};
+
+static bool e1000e_check_me(u16 device_id)
+{
+ struct e1000e_me_supported *id;
+
+ for (id = (struct e1000e_me_supported *)me_supported;
+ id->device_id; id++)
+ if (device_id == id->device_id)
+ return true;
+
+ return false;
+}
+
/**
* __ew32_prepare - prepare to write to MAC CSR register on certain parts
* @hw: pointer to the HW structure
/* oops */
break;
}
+ if (hw->mac.type == e1000_pch_spt) {
+ netdev->features &= ~NETIF_F_TSO;
+ netdev->features &= ~NETIF_F_TSO6;
+ }
}
/* enable transmits in the hardware, need to do this
e1000e_pm_thaw(dev);
/* Introduce S0ix implementation */
- if (hw->mac.type >= e1000_pch_cnp)
+ if (hw->mac.type >= e1000_pch_cnp &&
+ !e1000e_check_me(hw->adapter->pdev->device))
e1000e_s0ix_entry_flow(adapter);
return rc;
int rc;
/* Introduce S0ix implementation */
- if (hw->mac.type >= e1000_pch_cnp)
+ if (hw->mac.type >= e1000_pch_cnp &&
+ !e1000e_check_me(hw->adapter->pdev->device))
e1000e_s0ix_exit_flow(adapter);
rc = __e1000_resume(pdev);
#include "i40e_diag.h"
#include "i40e_xsk.h"
#include <net/udp_tunnel.h>
-#include <net/xdp_sock.h>
+#include <net/xdp_sock_drv.h>
/* All i40e tracepoints are defined by the include below, which
* must be included exactly once across the whole kernel with
* CREATE_TRACE_POINTS defined
if (ring->vsi->type == I40E_VSI_MAIN)
xdp_rxq_info_unreg_mem_model(&ring->xdp_rxq);
+ kfree(ring->rx_bi);
ring->xsk_umem = i40e_xsk_umem(ring);
if (ring->xsk_umem) {
- ring->rx_buf_len = ring->xsk_umem->chunk_size_nohr -
- XDP_PACKET_HEADROOM;
+ ret = i40e_alloc_rx_bi_zc(ring);
+ if (ret)
+ return ret;
+ ring->rx_buf_len = xsk_umem_get_rx_frame_size(ring->xsk_umem);
/* For AF_XDP ZC, we disallow packets to span on
* multiple buffers, thus letting us skip that
* handling in the fast-path.
*/
chain_len = 1;
- ring->zca.free = i40e_zca_free;
ret = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq,
- MEM_TYPE_ZERO_COPY,
- &ring->zca);
+ MEM_TYPE_XSK_BUFF_POOL,
+ NULL);
if (ret)
return ret;
dev_info(&vsi->back->pdev->dev,
- "Registered XDP mem model MEM_TYPE_ZERO_COPY on Rx ring %d\n",
+ "Registered XDP mem model MEM_TYPE_XSK_BUFF_POOL on Rx ring %d\n",
ring->queue_index);
} else {
+ ret = i40e_alloc_rx_bi(ring);
+ if (ret)
+ return ret;
ring->rx_buf_len = vsi->rx_buf_len;
if (ring->vsi->type == I40E_VSI_MAIN) {
ret = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq,
ring->tail = hw->hw_addr + I40E_QRX_TAIL(pf_q);
writel(0, ring->tail);
- ok = ring->xsk_umem ?
- i40e_alloc_rx_buffers_zc(ring, I40E_DESC_UNUSED(ring)) :
- !i40e_alloc_rx_buffers(ring, I40E_DESC_UNUSED(ring));
+ if (ring->xsk_umem) {
+ xsk_buff_set_rxq_info(ring->xsk_umem, &ring->xdp_rxq);
+ ok = i40e_alloc_rx_buffers_zc(ring, I40E_DESC_UNUSED(ring));
+ } else {
+ ok = !i40e_alloc_rx_buffers(ring, I40E_DESC_UNUSED(ring));
+ }
if (!ok) {
/* Log this in case the user has forgotten to give the kernel
* any buffers, even later in the application.
/**
* i40e_fd_handle_status - check the Programming Status for FD
* @rx_ring: the Rx ring for this descriptor
- * @rx_desc: the Rx descriptor for programming Status, not a packet descriptor.
+ * @qword0_raw: qword0
+ * @qword1: qword1 after le_to_cpu
* @prog_id: the id originally used for programming
*
* This is used to verify if the FD programming or invalidation
* requested by SW to the HW is successful or not and take actions accordingly.
**/
-void i40e_fd_handle_status(struct i40e_ring *rx_ring,
- union i40e_rx_desc *rx_desc, u8 prog_id)
+static void i40e_fd_handle_status(struct i40e_ring *rx_ring, u64 qword0_raw,
+ u64 qword1, u8 prog_id)
{
struct i40e_pf *pf = rx_ring->vsi->back;
struct pci_dev *pdev = pf->pdev;
+ struct i40e_32b_rx_wb_qw0 *qw0;
u32 fcnt_prog, fcnt_avail;
u32 error;
- u64 qw;
- qw = le64_to_cpu(rx_desc->wb.qword1.status_error_len);
- error = (qw & I40E_RX_PROG_STATUS_DESC_QW1_ERROR_MASK) >>
+ qw0 = (struct i40e_32b_rx_wb_qw0 *)&qword0_raw;
+ error = (qword1 & I40E_RX_PROG_STATUS_DESC_QW1_ERROR_MASK) >>
I40E_RX_PROG_STATUS_DESC_QW1_ERROR_SHIFT;
if (error == BIT(I40E_RX_PROG_STATUS_DESC_FD_TBL_FULL_SHIFT)) {
- pf->fd_inv = le32_to_cpu(rx_desc->wb.qword0.hi_dword.fd_id);
- if ((rx_desc->wb.qword0.hi_dword.fd_id != 0) ||
+ pf->fd_inv = le32_to_cpu(qw0->hi_dword.fd_id);
+ if (qw0->hi_dword.fd_id != 0 ||
(I40E_DEBUG_FD & pf->hw.debug_mask))
dev_warn(&pdev->dev, "ntuple filter loc = %d, could not be added\n",
pf->fd_inv);
/* store the current atr filter count */
pf->fd_atr_cnt = i40e_get_current_atr_cnt(pf);
- if ((rx_desc->wb.qword0.hi_dword.fd_id == 0) &&
+ if (qw0->hi_dword.fd_id == 0 &&
test_bit(__I40E_FD_SB_AUTO_DISABLED, pf->state)) {
/* These set_bit() calls aren't atomic with the
* test_bit() here, but worse case we potentially
} else if (error == BIT(I40E_RX_PROG_STATUS_DESC_NO_FD_ENTRY_SHIFT)) {
if (I40E_DEBUG_FD & pf->hw.debug_mask)
dev_info(&pdev->dev, "ntuple filter fd_id = %d, could not be removed\n",
- rx_desc->wb.qword0.hi_dword.fd_id);
+ qw0->hi_dword.fd_id);
}
}
rc->total_packets = 0;
}
+static struct i40e_rx_buffer *i40e_rx_bi(struct i40e_ring *rx_ring, u32 idx)
+{
+ return &rx_ring->rx_bi[idx];
+}
+
/**
* i40e_reuse_rx_page - page flip buffer and store it back on the ring
* @rx_ring: rx descriptor ring to store buffers on
struct i40e_rx_buffer *new_buff;
u16 nta = rx_ring->next_to_alloc;
- new_buff = &rx_ring->rx_bi[nta];
+ new_buff = i40e_rx_bi(rx_ring, nta);
/* update, and store next to alloc */
nta++;
}
/**
- * i40e_rx_is_programming_status - check for programming status descriptor
- * @qw: qword representing status_error_len in CPU ordering
- *
- * The value of in the descriptor length field indicate if this
- * is a programming status descriptor for flow director or FCoE
- * by the value of I40E_RX_PROG_STATUS_DESC_LENGTH, otherwise
- * it is a packet descriptor.
- **/
-static inline bool i40e_rx_is_programming_status(u64 qw)
-{
- /* The Rx filter programming status and SPH bit occupy the same
- * spot in the descriptor. Since we don't support packet split we
- * can just reuse the bit as an indication that this is a
- * programming status descriptor.
- */
- return qw & I40E_RXD_QW1_LENGTH_SPH_MASK;
-}
-
-/**
- * i40e_clean_programming_status - try clean the programming status descriptor
+ * i40e_clean_programming_status - clean the programming status descriptor
* @rx_ring: the rx ring that has this descriptor
- * @rx_desc: the rx descriptor written back by HW
- * @qw: qword representing status_error_len in CPU ordering
+ * @qword0_raw: qword0
+ * @qword1: qword1 representing status_error_len in CPU ordering
*
* Flow director should handle FD_FILTER_STATUS to check its filter programming
* status being successful or not and take actions accordingly. FCoE should
*
* Returns an i40e_rx_buffer to reuse if the cleanup occurred, otherwise NULL.
**/
-struct i40e_rx_buffer *i40e_clean_programming_status(
- struct i40e_ring *rx_ring,
- union i40e_rx_desc *rx_desc,
- u64 qw)
+void i40e_clean_programming_status(struct i40e_ring *rx_ring, u64 qword0_raw,
+ u64 qword1)
{
- struct i40e_rx_buffer *rx_buffer;
- u32 ntc;
u8 id;
- if (!i40e_rx_is_programming_status(qw))
- return NULL;
-
- ntc = rx_ring->next_to_clean;
-
- /* fetch, update, and store next to clean */
- rx_buffer = &rx_ring->rx_bi[ntc++];
- ntc = (ntc < rx_ring->count) ? ntc : 0;
- rx_ring->next_to_clean = ntc;
-
- prefetch(I40E_RX_DESC(rx_ring, ntc));
-
- id = (qw & I40E_RX_PROG_STATUS_DESC_QW1_PROGID_MASK) >>
+ id = (qword1 & I40E_RX_PROG_STATUS_DESC_QW1_PROGID_MASK) >>
I40E_RX_PROG_STATUS_DESC_QW1_PROGID_SHIFT;
if (id == I40E_RX_PROG_STATUS_DESC_FD_FILTER_STATUS)
- i40e_fd_handle_status(rx_ring, rx_desc, id);
-
- return rx_buffer;
+ i40e_fd_handle_status(rx_ring, qword0_raw, qword1, id);
}
/**
return -ENOMEM;
}
+int i40e_alloc_rx_bi(struct i40e_ring *rx_ring)
+{
+ unsigned long sz = sizeof(*rx_ring->rx_bi) * rx_ring->count;
+
+ rx_ring->rx_bi = kzalloc(sz, GFP_KERNEL);
+ return rx_ring->rx_bi ? 0 : -ENOMEM;
+}
+
+static void i40e_clear_rx_bi(struct i40e_ring *rx_ring)
+{
+ memset(rx_ring->rx_bi, 0, sizeof(*rx_ring->rx_bi) * rx_ring->count);
+}
+
/**
* i40e_clean_rx_ring - Free Rx buffers
* @rx_ring: ring to be cleaned
**/
void i40e_clean_rx_ring(struct i40e_ring *rx_ring)
{
- unsigned long bi_size;
u16 i;
/* ring already cleared, nothing to do */
/* Free all the Rx ring sk_buffs */
for (i = 0; i < rx_ring->count; i++) {
- struct i40e_rx_buffer *rx_bi = &rx_ring->rx_bi[i];
+ struct i40e_rx_buffer *rx_bi = i40e_rx_bi(rx_ring, i);
if (!rx_bi->page)
continue;
}
skip_free:
- bi_size = sizeof(struct i40e_rx_buffer) * rx_ring->count;
- memset(rx_ring->rx_bi, 0, bi_size);
+ if (rx_ring->xsk_umem)
+ i40e_clear_rx_bi_zc(rx_ring);
+ else
+ i40e_clear_rx_bi(rx_ring);
/* Zero out the descriptor ring */
memset(rx_ring->desc, 0, rx_ring->size);
int i40e_setup_rx_descriptors(struct i40e_ring *rx_ring)
{
struct device *dev = rx_ring->dev;
- int err = -ENOMEM;
- int bi_size;
-
- /* warn if we are about to overwrite the pointer */
- WARN_ON(rx_ring->rx_bi);
- bi_size = sizeof(struct i40e_rx_buffer) * rx_ring->count;
- rx_ring->rx_bi = kzalloc(bi_size, GFP_KERNEL);
- if (!rx_ring->rx_bi)
- goto err;
+ int err;
u64_stats_init(&rx_ring->syncp);
if (!rx_ring->desc) {
dev_info(dev, "Unable to allocate memory for the Rx descriptor ring, size=%d\n",
rx_ring->size);
- goto err;
+ return -ENOMEM;
}
rx_ring->next_to_alloc = 0;
err = xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev,
rx_ring->queue_index);
if (err < 0)
- goto err;
+ return err;
}
rx_ring->xdp_prog = rx_ring->vsi->xdp_prog;
return 0;
-err:
- kfree(rx_ring->rx_bi);
- rx_ring->rx_bi = NULL;
- return err;
}
/**
return false;
rx_desc = I40E_RX_DESC(rx_ring, ntu);
- bi = &rx_ring->rx_bi[ntu];
+ bi = i40e_rx_bi(rx_ring, ntu);
do {
if (!i40e_alloc_mapped_page(rx_ring, bi))
ntu++;
if (unlikely(ntu == rx_ring->count)) {
rx_desc = I40E_RX_DESC(rx_ring, 0);
- bi = rx_ring->rx_bi;
+ bi = i40e_rx_bi(rx_ring, 0);
ntu = 0;
}
{
struct i40e_rx_buffer *rx_buffer;
- rx_buffer = &rx_ring->rx_bi[rx_ring->next_to_clean];
+ rx_buffer = i40e_rx_bi(rx_ring, rx_ring->next_to_clean);
prefetchw(rx_buffer->page);
/* we are reusing so sync this buffer for CPU use */
*/
dma_rmb();
- rx_buffer = i40e_clean_programming_status(rx_ring, rx_desc,
- qword);
- if (unlikely(rx_buffer)) {
+ if (i40e_rx_is_programming_status(qword)) {
+ i40e_clean_programming_status(rx_ring,
+ rx_desc->raw.qword[0],
+ qword);
+ rx_buffer = i40e_rx_bi(rx_ring, rx_ring->next_to_clean);
+ i40e_inc_ntc(rx_ring);
i40e_reuse_rx_page(rx_ring, rx_buffer);
cleaned_count++;
continue;
struct i40e_rx_buffer {
dma_addr_t dma;
- union {
- struct {
- struct page *page;
- __u32 page_offset;
- __u16 pagecnt_bias;
- };
- struct {
- void *addr;
- u64 handle;
- };
- };
+ struct page *page;
+ __u32 page_offset;
+ __u16 pagecnt_bias;
};
struct i40e_queue_stats {
union {
struct i40e_tx_buffer *tx_bi;
struct i40e_rx_buffer *rx_bi;
+ struct xdp_buff **rx_bi_zc;
};
DECLARE_BITMAP(state, __I40E_RING_STATE_NBITS);
u16 queue_index; /* Queue number of ring */
struct i40e_channel *ch;
struct xdp_rxq_info xdp_rxq;
struct xdp_umem *xsk_umem;
- struct zero_copy_allocator zca; /* ZC allocator anchor */
} ____cacheline_internodealigned_in_smp;
static inline bool ring_uses_build_skb(struct i40e_ring *ring)
bool __i40e_chk_linearize(struct sk_buff *skb);
int i40e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames,
u32 flags);
+int i40e_alloc_rx_bi(struct i40e_ring *rx_ring);
/**
* i40e_get_head - Retrieve head from head writeback
#ifndef I40E_TXRX_COMMON_
#define I40E_TXRX_COMMON_
-void i40e_fd_handle_status(struct i40e_ring *rx_ring,
- union i40e_rx_desc *rx_desc, u8 prog_id);
int i40e_xmit_xdp_tx_ring(struct xdp_buff *xdp, struct i40e_ring *xdp_ring);
-struct i40e_rx_buffer *i40e_clean_programming_status(
- struct i40e_ring *rx_ring,
- union i40e_rx_desc *rx_desc,
- u64 qw);
+void i40e_clean_programming_status(struct i40e_ring *rx_ring, u64 qword0_raw,
+ u64 qword1);
void i40e_process_skb_fields(struct i40e_ring *rx_ring,
union i40e_rx_desc *rx_desc, struct sk_buff *skb);
void i40e_xdp_ring_update_tail(struct i40e_ring *xdp_ring);
}
}
+/**
+ * i40e_rx_is_programming_status - check for programming status descriptor
+ * @qword1: qword1 representing status_error_len in CPU ordering
+ *
+ * The value of in the descriptor length field indicate if this
+ * is a programming status descriptor for flow director or FCoE
+ * by the value of I40E_RX_PROG_STATUS_DESC_LENGTH, otherwise
+ * it is a packet descriptor.
+ **/
+static inline bool i40e_rx_is_programming_status(u64 qword1)
+{
+ /* The Rx filter programming status and SPH bit occupy the same
+ * spot in the descriptor. Since we don't support packet split we
+ * can just reuse the bit as an indication that this is a
+ * programming status descriptor.
+ */
+ return qword1 & I40E_RXD_QW1_LENGTH_SPH_MASK;
+}
+
+/**
+ * i40e_inc_ntc: Advance the next_to_clean index
+ * @rx_ring: Rx ring
+ **/
+static inline void i40e_inc_ntc(struct i40e_ring *rx_ring)
+{
+ u32 ntc = rx_ring->next_to_clean + 1;
+
+ ntc = (ntc < rx_ring->count) ? ntc : 0;
+ rx_ring->next_to_clean = ntc;
+ prefetch(I40E_RX_DESC(rx_ring, ntc));
+}
+
void i40e_xsk_clean_rx_ring(struct i40e_ring *rx_ring);
void i40e_xsk_clean_tx_ring(struct i40e_ring *tx_ring);
bool i40e_xsk_any_rx_ring_enabled(struct i40e_vsi *vsi);
__le64 rsvd2;
} read;
struct {
- struct {
+ struct i40e_32b_rx_wb_qw0 {
struct {
union {
__le16 mirroring_status;
} hi_dword;
} qword3;
} wb; /* writeback */
+ struct {
+ u64 qword[4];
+ } raw;
};
enum i40e_rx_desc_status_bits {
/* Copyright(c) 2018 Intel Corporation. */
#include <linux/bpf_trace.h>
-#include <net/xdp_sock.h>
+#include <net/xdp_sock_drv.h>
#include <net/xdp.h>
#include "i40e.h"
#include "i40e_txrx_common.h"
#include "i40e_xsk.h"
-/**
- * i40e_xsk_umem_dma_map - DMA maps all UMEM memory for the netdev
- * @vsi: Current VSI
- * @umem: UMEM to DMA map
- *
- * Returns 0 on success, <0 on failure
- **/
-static int i40e_xsk_umem_dma_map(struct i40e_vsi *vsi, struct xdp_umem *umem)
+int i40e_alloc_rx_bi_zc(struct i40e_ring *rx_ring)
{
- struct i40e_pf *pf = vsi->back;
- struct device *dev;
- unsigned int i, j;
- dma_addr_t dma;
-
- dev = &pf->pdev->dev;
- for (i = 0; i < umem->npgs; i++) {
- dma = dma_map_page_attrs(dev, umem->pgs[i], 0, PAGE_SIZE,
- DMA_BIDIRECTIONAL, I40E_RX_DMA_ATTR);
- if (dma_mapping_error(dev, dma))
- goto out_unmap;
+ unsigned long sz = sizeof(*rx_ring->rx_bi_zc) * rx_ring->count;
- umem->pages[i].dma = dma;
- }
-
- return 0;
-
-out_unmap:
- for (j = 0; j < i; j++) {
- dma_unmap_page_attrs(dev, umem->pages[i].dma, PAGE_SIZE,
- DMA_BIDIRECTIONAL, I40E_RX_DMA_ATTR);
- umem->pages[i].dma = 0;
- }
-
- return -1;
+ rx_ring->rx_bi_zc = kzalloc(sz, GFP_KERNEL);
+ return rx_ring->rx_bi_zc ? 0 : -ENOMEM;
}
-/**
- * i40e_xsk_umem_dma_unmap - DMA unmaps all UMEM memory for the netdev
- * @vsi: Current VSI
- * @umem: UMEM to DMA map
- **/
-static void i40e_xsk_umem_dma_unmap(struct i40e_vsi *vsi, struct xdp_umem *umem)
+void i40e_clear_rx_bi_zc(struct i40e_ring *rx_ring)
{
- struct i40e_pf *pf = vsi->back;
- struct device *dev;
- unsigned int i;
-
- dev = &pf->pdev->dev;
-
- for (i = 0; i < umem->npgs; i++) {
- dma_unmap_page_attrs(dev, umem->pages[i].dma, PAGE_SIZE,
- DMA_BIDIRECTIONAL, I40E_RX_DMA_ATTR);
+ memset(rx_ring->rx_bi_zc, 0,
+ sizeof(*rx_ring->rx_bi_zc) * rx_ring->count);
+}
- umem->pages[i].dma = 0;
- }
+static struct xdp_buff **i40e_rx_bi(struct i40e_ring *rx_ring, u32 idx)
+{
+ return &rx_ring->rx_bi_zc[idx];
}
/**
u16 qid)
{
struct net_device *netdev = vsi->netdev;
- struct xdp_umem_fq_reuse *reuseq;
bool if_running;
int err;
qid >= netdev->real_num_tx_queues)
return -EINVAL;
- reuseq = xsk_reuseq_prepare(vsi->rx_rings[0]->count);
- if (!reuseq)
- return -ENOMEM;
-
- xsk_reuseq_free(xsk_reuseq_swap(umem, reuseq));
-
- err = i40e_xsk_umem_dma_map(vsi, umem);
+ err = xsk_buff_dma_map(umem, &vsi->back->pdev->dev, I40E_RX_DMA_ATTR);
if (err)
return err;
}
clear_bit(qid, vsi->af_xdp_zc_qps);
- i40e_xsk_umem_dma_unmap(vsi, umem);
+ xsk_buff_dma_unmap(umem, I40E_RX_DMA_ATTR);
if (if_running) {
err = i40e_queue_pair_enable(vsi, qid);
**/
static int i40e_run_xdp_zc(struct i40e_ring *rx_ring, struct xdp_buff *xdp)
{
- struct xdp_umem *umem = rx_ring->xsk_umem;
int err, result = I40E_XDP_PASS;
struct i40e_ring *xdp_ring;
struct bpf_prog *xdp_prog;
- u64 offset;
u32 act;
rcu_read_lock();
*/
xdp_prog = READ_ONCE(rx_ring->xdp_prog);
act = bpf_prog_run_xdp(xdp_prog, xdp);
- offset = xdp->data - xdp->data_hard_start;
-
- xdp->handle = xsk_umem_adjust_offset(umem, xdp->handle, offset);
switch (act) {
case XDP_PASS:
return result;
}
-/**
- * i40e_alloc_buffer_zc - Allocates an i40e_rx_buffer
- * @rx_ring: Rx ring
- * @bi: Rx buffer to populate
- *
- * This function allocates an Rx buffer. The buffer can come from fill
- * queue, or via the recycle queue (next_to_alloc).
- *
- * Returns true for a successful allocation, false otherwise
- **/
-static bool i40e_alloc_buffer_zc(struct i40e_ring *rx_ring,
- struct i40e_rx_buffer *bi)
-{
- struct xdp_umem *umem = rx_ring->xsk_umem;
- void *addr = bi->addr;
- u64 handle, hr;
-
- if (addr) {
- rx_ring->rx_stats.page_reuse_count++;
- return true;
- }
-
- if (!xsk_umem_peek_addr(umem, &handle)) {
- rx_ring->rx_stats.alloc_page_failed++;
- return false;
- }
-
- hr = umem->headroom + XDP_PACKET_HEADROOM;
-
- bi->dma = xdp_umem_get_dma(umem, handle);
- bi->dma += hr;
-
- bi->addr = xdp_umem_get_data(umem, handle);
- bi->addr += hr;
-
- bi->handle = xsk_umem_adjust_offset(umem, handle, umem->headroom);
-
- xsk_umem_release_addr(umem);
- return true;
-}
-
-/**
- * i40e_alloc_buffer_slow_zc - Allocates an i40e_rx_buffer
- * @rx_ring: Rx ring
- * @bi: Rx buffer to populate
- *
- * This function allocates an Rx buffer. The buffer can come from fill
- * queue, or via the reuse queue.
- *
- * Returns true for a successful allocation, false otherwise
- **/
-static bool i40e_alloc_buffer_slow_zc(struct i40e_ring *rx_ring,
- struct i40e_rx_buffer *bi)
-{
- struct xdp_umem *umem = rx_ring->xsk_umem;
- u64 handle, hr;
-
- if (!xsk_umem_peek_addr_rq(umem, &handle)) {
- rx_ring->rx_stats.alloc_page_failed++;
- return false;
- }
-
- handle &= rx_ring->xsk_umem->chunk_mask;
-
- hr = umem->headroom + XDP_PACKET_HEADROOM;
-
- bi->dma = xdp_umem_get_dma(umem, handle);
- bi->dma += hr;
-
- bi->addr = xdp_umem_get_data(umem, handle);
- bi->addr += hr;
-
- bi->handle = xsk_umem_adjust_offset(umem, handle, umem->headroom);
-
- xsk_umem_release_addr_rq(umem);
- return true;
-}
-
-static __always_inline bool
-__i40e_alloc_rx_buffers_zc(struct i40e_ring *rx_ring, u16 count,
- bool alloc(struct i40e_ring *rx_ring,
- struct i40e_rx_buffer *bi))
+bool i40e_alloc_rx_buffers_zc(struct i40e_ring *rx_ring, u16 count)
{
u16 ntu = rx_ring->next_to_use;
union i40e_rx_desc *rx_desc;
- struct i40e_rx_buffer *bi;
+ struct xdp_buff **bi, *xdp;
+ dma_addr_t dma;
bool ok = true;
rx_desc = I40E_RX_DESC(rx_ring, ntu);
- bi = &rx_ring->rx_bi[ntu];
+ bi = i40e_rx_bi(rx_ring, ntu);
do {
- if (!alloc(rx_ring, bi)) {
+ xdp = xsk_buff_alloc(rx_ring->xsk_umem);
+ if (!xdp) {
ok = false;
goto no_buffers;
}
-
- dma_sync_single_range_for_device(rx_ring->dev, bi->dma, 0,
- rx_ring->rx_buf_len,
- DMA_BIDIRECTIONAL);
-
- rx_desc->read.pkt_addr = cpu_to_le64(bi->dma);
+ *bi = xdp;
+ dma = xsk_buff_xdp_get_dma(xdp);
+ rx_desc->read.pkt_addr = cpu_to_le64(dma);
+ rx_desc->read.hdr_addr = 0;
rx_desc++;
bi++;
if (unlikely(ntu == rx_ring->count)) {
rx_desc = I40E_RX_DESC(rx_ring, 0);
- bi = rx_ring->rx_bi;
+ bi = i40e_rx_bi(rx_ring, 0);
ntu = 0;
}
- rx_desc->wb.qword1.status_error_len = 0;
count--;
} while (count);
return ok;
}
-/**
- * i40e_alloc_rx_buffers_zc - Allocates a number of Rx buffers
- * @rx_ring: Rx ring
- * @count: The number of buffers to allocate
- *
- * This function allocates a number of Rx buffers from the reuse queue
- * or fill ring and places them on the Rx ring.
- *
- * Returns true for a successful allocation, false otherwise
- **/
-bool i40e_alloc_rx_buffers_zc(struct i40e_ring *rx_ring, u16 count)
-{
- return __i40e_alloc_rx_buffers_zc(rx_ring, count,
- i40e_alloc_buffer_slow_zc);
-}
-
-/**
- * i40e_alloc_rx_buffers_fast_zc - Allocates a number of Rx buffers
- * @rx_ring: Rx ring
- * @count: The number of buffers to allocate
- *
- * This function allocates a number of Rx buffers from the fill ring
- * or the internal recycle mechanism and places them on the Rx ring.
- *
- * Returns true for a successful allocation, false otherwise
- **/
-static bool i40e_alloc_rx_buffers_fast_zc(struct i40e_ring *rx_ring, u16 count)
-{
- return __i40e_alloc_rx_buffers_zc(rx_ring, count,
- i40e_alloc_buffer_zc);
-}
-
-/**
- * i40e_get_rx_buffer_zc - Return the current Rx buffer
- * @rx_ring: Rx ring
- * @size: The size of the rx buffer (read from descriptor)
- *
- * This function returns the current, received Rx buffer, and also
- * does DMA synchronization. the Rx ring.
- *
- * Returns the received Rx buffer
- **/
-static struct i40e_rx_buffer *i40e_get_rx_buffer_zc(struct i40e_ring *rx_ring,
- const unsigned int size)
-{
- struct i40e_rx_buffer *bi;
-
- bi = &rx_ring->rx_bi[rx_ring->next_to_clean];
-
- /* we are reusing so sync this buffer for CPU use */
- dma_sync_single_range_for_cpu(rx_ring->dev,
- bi->dma, 0,
- size,
- DMA_BIDIRECTIONAL);
-
- return bi;
-}
-
-/**
- * i40e_reuse_rx_buffer_zc - Recycle an Rx buffer
- * @rx_ring: Rx ring
- * @old_bi: The Rx buffer to recycle
- *
- * This function recycles a finished Rx buffer, and places it on the
- * recycle queue (next_to_alloc).
- **/
-static void i40e_reuse_rx_buffer_zc(struct i40e_ring *rx_ring,
- struct i40e_rx_buffer *old_bi)
-{
- struct i40e_rx_buffer *new_bi = &rx_ring->rx_bi[rx_ring->next_to_alloc];
- u16 nta = rx_ring->next_to_alloc;
-
- /* update, and store next to alloc */
- nta++;
- rx_ring->next_to_alloc = (nta < rx_ring->count) ? nta : 0;
-
- /* transfer page from old buffer to new buffer */
- new_bi->dma = old_bi->dma;
- new_bi->addr = old_bi->addr;
- new_bi->handle = old_bi->handle;
-
- old_bi->addr = NULL;
-}
-
-/**
- * i40e_zca_free - Free callback for MEM_TYPE_ZERO_COPY allocations
- * @alloc: Zero-copy allocator
- * @handle: Buffer handle
- **/
-void i40e_zca_free(struct zero_copy_allocator *alloc, unsigned long handle)
-{
- struct i40e_rx_buffer *bi;
- struct i40e_ring *rx_ring;
- u64 hr, mask;
- u16 nta;
-
- rx_ring = container_of(alloc, struct i40e_ring, zca);
- hr = rx_ring->xsk_umem->headroom + XDP_PACKET_HEADROOM;
- mask = rx_ring->xsk_umem->chunk_mask;
-
- nta = rx_ring->next_to_alloc;
- bi = &rx_ring->rx_bi[nta];
-
- nta++;
- rx_ring->next_to_alloc = (nta < rx_ring->count) ? nta : 0;
-
- handle &= mask;
-
- bi->dma = xdp_umem_get_dma(rx_ring->xsk_umem, handle);
- bi->dma += hr;
-
- bi->addr = xdp_umem_get_data(rx_ring->xsk_umem, handle);
- bi->addr += hr;
-
- bi->handle = xsk_umem_adjust_offset(rx_ring->xsk_umem, (u64)handle,
- rx_ring->xsk_umem->headroom);
-}
-
/**
* i40e_construct_skb_zc - Create skbufff from zero-copy Rx buffer
* @rx_ring: Rx ring
- * @bi: Rx buffer
* @xdp: xdp_buff
*
* This functions allocates a new skb from a zero-copy Rx buffer.
* Returns the skb, or NULL on failure.
**/
static struct sk_buff *i40e_construct_skb_zc(struct i40e_ring *rx_ring,
- struct i40e_rx_buffer *bi,
struct xdp_buff *xdp)
{
unsigned int metasize = xdp->data - xdp->data_meta;
if (metasize)
skb_metadata_set(skb, metasize);
- i40e_reuse_rx_buffer_zc(rx_ring, bi);
+ xsk_buff_free(xdp);
return skb;
}
-/**
- * i40e_inc_ntc: Advance the next_to_clean index
- * @rx_ring: Rx ring
- **/
-static void i40e_inc_ntc(struct i40e_ring *rx_ring)
-{
- u32 ntc = rx_ring->next_to_clean + 1;
-
- ntc = (ntc < rx_ring->count) ? ntc : 0;
- rx_ring->next_to_clean = ntc;
- prefetch(I40E_RX_DESC(rx_ring, ntc));
-}
-
/**
* i40e_clean_rx_irq_zc - Consumes Rx packets from the hardware ring
* @rx_ring: Rx ring
{
unsigned int total_rx_bytes = 0, total_rx_packets = 0;
u16 cleaned_count = I40E_DESC_UNUSED(rx_ring);
- struct xdp_umem *umem = rx_ring->xsk_umem;
unsigned int xdp_res, xdp_xmit = 0;
bool failure = false;
struct sk_buff *skb;
- struct xdp_buff xdp;
-
- xdp.rxq = &rx_ring->xdp_rxq;
- xdp.frame_sz = xsk_umem_xdp_frame_sz(umem);
while (likely(total_rx_packets < (unsigned int)budget)) {
- struct i40e_rx_buffer *bi;
union i40e_rx_desc *rx_desc;
+ struct xdp_buff **bi;
unsigned int size;
u64 qword;
if (cleaned_count >= I40E_RX_BUFFER_WRITE) {
failure = failure ||
- !i40e_alloc_rx_buffers_fast_zc(rx_ring,
- cleaned_count);
+ !i40e_alloc_rx_buffers_zc(rx_ring,
+ cleaned_count);
cleaned_count = 0;
}
*/
dma_rmb();
- bi = i40e_clean_programming_status(rx_ring, rx_desc,
- qword);
- if (unlikely(bi)) {
- i40e_reuse_rx_buffer_zc(rx_ring, bi);
+ if (i40e_rx_is_programming_status(qword)) {
+ i40e_clean_programming_status(rx_ring,
+ rx_desc->raw.qword[0],
+ qword);
+ bi = i40e_rx_bi(rx_ring, rx_ring->next_to_clean);
+ xsk_buff_free(*bi);
+ *bi = NULL;
cleaned_count++;
+ i40e_inc_ntc(rx_ring);
continue;
}
+ bi = i40e_rx_bi(rx_ring, rx_ring->next_to_clean);
size = (qword & I40E_RXD_QW1_LENGTH_PBUF_MASK) >>
I40E_RXD_QW1_LENGTH_PBUF_SHIFT;
if (!size)
break;
- bi = i40e_get_rx_buffer_zc(rx_ring, size);
- xdp.data = bi->addr;
- xdp.data_meta = xdp.data;
- xdp.data_hard_start = xdp.data - XDP_PACKET_HEADROOM;
- xdp.data_end = xdp.data + size;
- xdp.handle = bi->handle;
+ bi = i40e_rx_bi(rx_ring, rx_ring->next_to_clean);
+ (*bi)->data_end = (*bi)->data + size;
+ xsk_buff_dma_sync_for_cpu(*bi);
- xdp_res = i40e_run_xdp_zc(rx_ring, &xdp);
+ xdp_res = i40e_run_xdp_zc(rx_ring, *bi);
if (xdp_res) {
- if (xdp_res & (I40E_XDP_TX | I40E_XDP_REDIR)) {
+ if (xdp_res & (I40E_XDP_TX | I40E_XDP_REDIR))
xdp_xmit |= xdp_res;
- bi->addr = NULL;
- } else {
- i40e_reuse_rx_buffer_zc(rx_ring, bi);
- }
+ else
+ xsk_buff_free(*bi);
+ *bi = NULL;
total_rx_bytes += size;
total_rx_packets++;
* BIT(I40E_RXD_QW1_ERROR_SHIFT). This is due to that
* SBP is *not* set in PRT_SBPVSI (default not set).
*/
- skb = i40e_construct_skb_zc(rx_ring, bi, &xdp);
+ skb = i40e_construct_skb_zc(rx_ring, *bi);
+ *bi = NULL;
if (!skb) {
rx_ring->rx_stats.alloc_buff_failed++;
break;
if (!xsk_umem_consume_tx(xdp_ring->xsk_umem, &desc))
break;
- dma = xdp_umem_get_dma(xdp_ring->xsk_umem, desc.addr);
-
- dma_sync_single_for_device(xdp_ring->dev, dma, desc.len,
- DMA_BIDIRECTIONAL);
+ dma = xsk_buff_raw_get_dma(xdp_ring->xsk_umem, desc.addr);
+ xsk_buff_raw_dma_sync_for_device(xdp_ring->xsk_umem, dma,
+ desc.len);
tx_bi = &xdp_ring->tx_bi[xdp_ring->next_to_use];
tx_bi->bytecount = desc.len;
u16 i;
for (i = 0; i < rx_ring->count; i++) {
- struct i40e_rx_buffer *rx_bi = &rx_ring->rx_bi[i];
+ struct xdp_buff *rx_bi = *i40e_rx_bi(rx_ring, i);
- if (!rx_bi->addr)
+ if (!rx_bi)
continue;
- xsk_umem_fq_reuse(rx_ring->xsk_umem, rx_bi->handle);
- rx_bi->addr = NULL;
+ xsk_buff_free(rx_bi);
+ rx_bi = NULL;
}
}
int i40e_queue_pair_enable(struct i40e_vsi *vsi, int queue_pair);
int i40e_xsk_umem_setup(struct i40e_vsi *vsi, struct xdp_umem *umem,
u16 qid);
-void i40e_zca_free(struct zero_copy_allocator *alloc, unsigned long handle);
bool i40e_alloc_rx_buffers_zc(struct i40e_ring *rx_ring, u16 cleaned_count);
int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget);
bool i40e_clean_xdp_tx_irq(struct i40e_vsi *vsi,
struct i40e_ring *tx_ring, int napi_budget);
int i40e_xsk_wakeup(struct net_device *dev, u32 queue_id, u32 flags);
+int i40e_alloc_rx_bi_zc(struct i40e_ring *rx_ring);
+void i40e_clear_rx_bi_zc(struct i40e_ring *rx_ring);
#endif /* _I40E_XSK_H_ */
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2019, Intel Corporation. */
+#include <net/xdp_sock_drv.h>
#include "ice_base.h"
#include "ice_dcb_lib.h"
if (ring->xsk_umem) {
xdp_rxq_info_unreg_mem_model(&ring->xdp_rxq);
- ring->rx_buf_len = ring->xsk_umem->chunk_size_nohr -
- XDP_PACKET_HEADROOM;
+ ring->rx_buf_len =
+ xsk_umem_get_rx_frame_size(ring->xsk_umem);
/* For AF_XDP ZC, we disallow packets to span on
* multiple buffers, thus letting us skip that
* handling in the fast-path.
*/
chain_len = 1;
- ring->zca.free = ice_zca_free;
err = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq,
- MEM_TYPE_ZERO_COPY,
- &ring->zca);
+ MEM_TYPE_XSK_BUFF_POOL,
+ NULL);
if (err)
return err;
+ xsk_buff_set_rxq_info(ring->xsk_umem, &ring->xdp_rxq);
- dev_info(ice_pf_to_dev(vsi->back), "Registered XDP mem model MEM_TYPE_ZERO_COPY on Rx ring %d\n",
+ dev_info(ice_pf_to_dev(vsi->back), "Registered XDP mem model MEM_TYPE_XSK_BUFF_POOL on Rx ring %d\n",
ring->q_index);
} else {
- ring->zca.free = NULL;
if (!xdp_rxq_info_is_reg(&ring->xdp_rxq))
/* coverity[check_return] */
xdp_rxq_info_reg(&ring->xdp_rxq,
writel(0, ring->tail);
err = ring->xsk_umem ?
- ice_alloc_rx_bufs_slow_zc(ring, ICE_DESC_UNUSED(ring)) :
+ ice_alloc_rx_bufs_zc(ring, ICE_DESC_UNUSED(ring)) :
ice_alloc_rx_bufs(ring, ICE_DESC_UNUSED(ring));
if (err)
dev_info(ice_pf_to_dev(vsi->back), "Failed allocate some buffers on %sRx ring %d (pf_q %d)\n",
};
struct ice_rx_buf {
- struct sk_buff *skb;
- dma_addr_t dma;
union {
struct {
+ struct sk_buff *skb;
+ dma_addr_t dma;
struct page *page;
unsigned int page_offset;
u16 pagecnt_bias;
};
struct {
- void *addr;
- u64 handle;
+ struct xdp_buff *xdp;
};
};
};
struct rcu_head rcu; /* to avoid race on free */
struct bpf_prog *xdp_prog;
struct xdp_umem *xsk_umem;
- struct zero_copy_allocator zca;
/* CL3 - 3rd cacheline starts here */
struct xdp_rxq_info xdp_rxq;
/* CLX - the below items are only accessed infrequently and should be
/* Copyright (c) 2019, Intel Corporation. */
#include <linux/bpf_trace.h>
-#include <net/xdp_sock.h>
+#include <net/xdp_sock_drv.h>
#include <net/xdp.h>
#include "ice.h"
#include "ice_base.h"
return 0;
}
-/**
- * ice_xsk_add_umem - add a UMEM region for XDP sockets
- * @vsi: VSI to which the UMEM will be added
- * @umem: pointer to a requested UMEM region
- * @qid: queue ID
- *
- * Returns 0 on success, negative on error
- */
-static int ice_xsk_add_umem(struct ice_vsi *vsi, struct xdp_umem *umem, u16 qid)
-{
- int err;
-
- err = ice_xsk_alloc_umems(vsi);
- if (err)
- return err;
-
- vsi->xsk_umems[qid] = umem;
- vsi->num_xsk_umems_used++;
-
- return 0;
-}
-
/**
* ice_xsk_remove_umem - Remove an UMEM for a certain ring/qid
* @vsi: VSI from which the VSI will be removed
}
}
-/**
- * ice_xsk_umem_dma_map - DMA map UMEM region for XDP sockets
- * @vsi: VSI to map the UMEM region
- * @umem: UMEM to map
- *
- * Returns 0 on success, negative on error
- */
-static int ice_xsk_umem_dma_map(struct ice_vsi *vsi, struct xdp_umem *umem)
-{
- struct ice_pf *pf = vsi->back;
- struct device *dev;
- unsigned int i;
-
- dev = ice_pf_to_dev(pf);
- for (i = 0; i < umem->npgs; i++) {
- dma_addr_t dma = dma_map_page_attrs(dev, umem->pgs[i], 0,
- PAGE_SIZE,
- DMA_BIDIRECTIONAL,
- ICE_RX_DMA_ATTR);
- if (dma_mapping_error(dev, dma)) {
- dev_dbg(dev, "XSK UMEM DMA mapping error on page num %d\n",
- i);
- goto out_unmap;
- }
-
- umem->pages[i].dma = dma;
- }
-
- return 0;
-
-out_unmap:
- for (; i > 0; i--) {
- dma_unmap_page_attrs(dev, umem->pages[i].dma, PAGE_SIZE,
- DMA_BIDIRECTIONAL, ICE_RX_DMA_ATTR);
- umem->pages[i].dma = 0;
- }
-
- return -EFAULT;
-}
-
-/**
- * ice_xsk_umem_dma_unmap - DMA unmap UMEM region for XDP sockets
- * @vsi: VSI from which the UMEM will be unmapped
- * @umem: UMEM to unmap
- */
-static void ice_xsk_umem_dma_unmap(struct ice_vsi *vsi, struct xdp_umem *umem)
-{
- struct ice_pf *pf = vsi->back;
- struct device *dev;
- unsigned int i;
-
- dev = ice_pf_to_dev(pf);
- for (i = 0; i < umem->npgs; i++) {
- dma_unmap_page_attrs(dev, umem->pages[i].dma, PAGE_SIZE,
- DMA_BIDIRECTIONAL, ICE_RX_DMA_ATTR);
-
- umem->pages[i].dma = 0;
- }
-}
/**
* ice_xsk_umem_disable - disable a UMEM region
!vsi->xsk_umems[qid])
return -EINVAL;
- ice_xsk_umem_dma_unmap(vsi, vsi->xsk_umems[qid]);
+ xsk_buff_dma_unmap(vsi->xsk_umems[qid], ICE_RX_DMA_ATTR);
ice_xsk_remove_umem(vsi, qid);
return 0;
static int
ice_xsk_umem_enable(struct ice_vsi *vsi, struct xdp_umem *umem, u16 qid)
{
- struct xdp_umem_fq_reuse *reuseq;
int err;
if (vsi->type != ICE_VSI_PF)
if (qid >= vsi->num_xsk_umems)
return -EINVAL;
+ err = ice_xsk_alloc_umems(vsi);
+ if (err)
+ return err;
+
if (vsi->xsk_umems && vsi->xsk_umems[qid])
return -EBUSY;
- reuseq = xsk_reuseq_prepare(vsi->rx_rings[0]->count);
- if (!reuseq)
- return -ENOMEM;
-
- xsk_reuseq_free(xsk_reuseq_swap(umem, reuseq));
-
- err = ice_xsk_umem_dma_map(vsi, umem);
- if (err)
- return err;
+ vsi->xsk_umems[qid] = umem;
+ vsi->num_xsk_umems_used++;
- err = ice_xsk_add_umem(vsi, umem, qid);
+ err = xsk_buff_dma_map(vsi->xsk_umems[qid], ice_pf_to_dev(vsi->back),
+ ICE_RX_DMA_ATTR);
if (err)
return err;
return ret;
}
-/**
- * ice_zca_free - Callback for MEM_TYPE_ZERO_COPY allocations
- * @zca: zero-cpoy allocator
- * @handle: Buffer handle
- */
-void ice_zca_free(struct zero_copy_allocator *zca, unsigned long handle)
-{
- struct ice_rx_buf *rx_buf;
- struct ice_ring *rx_ring;
- struct xdp_umem *umem;
- u64 hr, mask;
- u16 nta;
-
- rx_ring = container_of(zca, struct ice_ring, zca);
- umem = rx_ring->xsk_umem;
- hr = umem->headroom + XDP_PACKET_HEADROOM;
-
- mask = umem->chunk_mask;
-
- nta = rx_ring->next_to_alloc;
- rx_buf = &rx_ring->rx_buf[nta];
-
- nta++;
- rx_ring->next_to_alloc = (nta < rx_ring->count) ? nta : 0;
-
- handle &= mask;
-
- rx_buf->dma = xdp_umem_get_dma(umem, handle);
- rx_buf->dma += hr;
-
- rx_buf->addr = xdp_umem_get_data(umem, handle);
- rx_buf->addr += hr;
-
- rx_buf->handle = (u64)handle + umem->headroom;
-}
-
-/**
- * ice_alloc_buf_fast_zc - Retrieve buffer address from XDP umem
- * @rx_ring: ring with an xdp_umem bound to it
- * @rx_buf: buffer to which xsk page address will be assigned
- *
- * This function allocates an Rx buffer in the hot path.
- * The buffer can come from fill queue or recycle queue.
- *
- * Returns true if an assignment was successful, false if not.
- */
-static __always_inline bool
-ice_alloc_buf_fast_zc(struct ice_ring *rx_ring, struct ice_rx_buf *rx_buf)
-{
- struct xdp_umem *umem = rx_ring->xsk_umem;
- void *addr = rx_buf->addr;
- u64 handle, hr;
-
- if (addr) {
- rx_ring->rx_stats.page_reuse_count++;
- return true;
- }
-
- if (!xsk_umem_peek_addr(umem, &handle)) {
- rx_ring->rx_stats.alloc_page_failed++;
- return false;
- }
-
- hr = umem->headroom + XDP_PACKET_HEADROOM;
-
- rx_buf->dma = xdp_umem_get_dma(umem, handle);
- rx_buf->dma += hr;
-
- rx_buf->addr = xdp_umem_get_data(umem, handle);
- rx_buf->addr += hr;
-
- rx_buf->handle = handle + umem->headroom;
-
- xsk_umem_release_addr(umem);
- return true;
-}
-
-/**
- * ice_alloc_buf_slow_zc - Retrieve buffer address from XDP umem
- * @rx_ring: ring with an xdp_umem bound to it
- * @rx_buf: buffer to which xsk page address will be assigned
- *
- * This function allocates an Rx buffer in the slow path.
- * The buffer can come from fill queue or recycle queue.
- *
- * Returns true if an assignment was successful, false if not.
- */
-static __always_inline bool
-ice_alloc_buf_slow_zc(struct ice_ring *rx_ring, struct ice_rx_buf *rx_buf)
-{
- struct xdp_umem *umem = rx_ring->xsk_umem;
- u64 handle, headroom;
-
- if (!xsk_umem_peek_addr_rq(umem, &handle)) {
- rx_ring->rx_stats.alloc_page_failed++;
- return false;
- }
-
- handle &= umem->chunk_mask;
- headroom = umem->headroom + XDP_PACKET_HEADROOM;
-
- rx_buf->dma = xdp_umem_get_dma(umem, handle);
- rx_buf->dma += headroom;
-
- rx_buf->addr = xdp_umem_get_data(umem, handle);
- rx_buf->addr += headroom;
-
- rx_buf->handle = handle + umem->headroom;
-
- xsk_umem_release_addr_rq(umem);
- return true;
-}
-
/**
* ice_alloc_rx_bufs_zc - allocate a number of Rx buffers
* @rx_ring: Rx ring
* @count: The number of buffers to allocate
- * @alloc: the function pointer to call for allocation
*
* This function allocates a number of Rx buffers from the fill ring
* or the internal recycle mechanism and places them on the Rx ring.
*
* Returns false if all allocations were successful, true if any fail.
*/
-static bool
-ice_alloc_rx_bufs_zc(struct ice_ring *rx_ring, int count,
- bool (*alloc)(struct ice_ring *, struct ice_rx_buf *))
+bool ice_alloc_rx_bufs_zc(struct ice_ring *rx_ring, u16 count)
{
union ice_32b_rx_flex_desc *rx_desc;
u16 ntu = rx_ring->next_to_use;
struct ice_rx_buf *rx_buf;
bool ret = false;
+ dma_addr_t dma;
if (!count)
return false;
rx_buf = &rx_ring->rx_buf[ntu];
do {
- if (!alloc(rx_ring, rx_buf)) {
+ rx_buf->xdp = xsk_buff_alloc(rx_ring->xsk_umem);
+ if (!rx_buf->xdp) {
ret = true;
break;
}
- dma_sync_single_range_for_device(rx_ring->dev, rx_buf->dma, 0,
- rx_ring->rx_buf_len,
- DMA_BIDIRECTIONAL);
-
- rx_desc->read.pkt_addr = cpu_to_le64(rx_buf->dma);
+ dma = xsk_buff_xdp_get_dma(rx_buf->xdp);
+ rx_desc->read.pkt_addr = cpu_to_le64(dma);
rx_desc->wb.status_error0 = 0;
rx_desc++;
return ret;
}
-/**
- * ice_alloc_rx_bufs_fast_zc - allocate zero copy bufs in the hot path
- * @rx_ring: Rx ring
- * @count: number of bufs to allocate
- *
- * Returns false on success, true on failure.
- */
-static bool ice_alloc_rx_bufs_fast_zc(struct ice_ring *rx_ring, u16 count)
-{
- return ice_alloc_rx_bufs_zc(rx_ring, count,
- ice_alloc_buf_fast_zc);
-}
-
-/**
- * ice_alloc_rx_bufs_slow_zc - allocate zero copy bufs in the slow path
- * @rx_ring: Rx ring
- * @count: number of bufs to allocate
- *
- * Returns false on success, true on failure.
- */
-bool ice_alloc_rx_bufs_slow_zc(struct ice_ring *rx_ring, u16 count)
-{
- return ice_alloc_rx_bufs_zc(rx_ring, count,
- ice_alloc_buf_slow_zc);
-}
-
/**
* ice_bump_ntc - Bump the next_to_clean counter of an Rx ring
* @rx_ring: Rx ring
prefetch(ICE_RX_DESC(rx_ring, ntc));
}
-/**
- * ice_get_rx_buf_zc - Fetch the current Rx buffer
- * @rx_ring: Rx ring
- * @size: size of a buffer
- *
- * This function returns the current, received Rx buffer and does
- * DMA synchronization.
- *
- * Returns a pointer to the received Rx buffer.
- */
-static struct ice_rx_buf *ice_get_rx_buf_zc(struct ice_ring *rx_ring, int size)
-{
- struct ice_rx_buf *rx_buf;
-
- rx_buf = &rx_ring->rx_buf[rx_ring->next_to_clean];
-
- dma_sync_single_range_for_cpu(rx_ring->dev, rx_buf->dma, 0,
- size, DMA_BIDIRECTIONAL);
-
- return rx_buf;
-}
-
-/**
- * ice_reuse_rx_buf_zc - reuse an Rx buffer
- * @rx_ring: Rx ring
- * @old_buf: The buffer to recycle
- *
- * This function recycles a finished Rx buffer, and places it on the recycle
- * queue (next_to_alloc).
- */
-static void
-ice_reuse_rx_buf_zc(struct ice_ring *rx_ring, struct ice_rx_buf *old_buf)
-{
- unsigned long mask = (unsigned long)rx_ring->xsk_umem->chunk_mask;
- u64 hr = rx_ring->xsk_umem->headroom + XDP_PACKET_HEADROOM;
- u16 nta = rx_ring->next_to_alloc;
- struct ice_rx_buf *new_buf;
-
- new_buf = &rx_ring->rx_buf[nta++];
- rx_ring->next_to_alloc = (nta < rx_ring->count) ? nta : 0;
-
- new_buf->dma = old_buf->dma & mask;
- new_buf->dma += hr;
-
- new_buf->addr = (void *)((unsigned long)old_buf->addr & mask);
- new_buf->addr += hr;
-
- new_buf->handle = old_buf->handle & mask;
- new_buf->handle += rx_ring->xsk_umem->headroom;
-
- old_buf->addr = NULL;
-}
-
/**
* ice_construct_skb_zc - Create an sk_buff from zero-copy buffer
* @rx_ring: Rx ring
* @rx_buf: zero-copy Rx buffer
- * @xdp: XDP buffer
*
* This function allocates a new skb from a zero-copy Rx buffer.
*
* Returns the skb on success, NULL on failure.
*/
static struct sk_buff *
-ice_construct_skb_zc(struct ice_ring *rx_ring, struct ice_rx_buf *rx_buf,
- struct xdp_buff *xdp)
+ice_construct_skb_zc(struct ice_ring *rx_ring, struct ice_rx_buf *rx_buf)
{
- unsigned int metasize = xdp->data - xdp->data_meta;
- unsigned int datasize = xdp->data_end - xdp->data;
- unsigned int datasize_hard = xdp->data_end -
- xdp->data_hard_start;
+ unsigned int metasize = rx_buf->xdp->data - rx_buf->xdp->data_meta;
+ unsigned int datasize = rx_buf->xdp->data_end - rx_buf->xdp->data;
+ unsigned int datasize_hard = rx_buf->xdp->data_end -
+ rx_buf->xdp->data_hard_start;
struct sk_buff *skb;
skb = __napi_alloc_skb(&rx_ring->q_vector->napi, datasize_hard,
if (unlikely(!skb))
return NULL;
- skb_reserve(skb, xdp->data - xdp->data_hard_start);
- memcpy(__skb_put(skb, datasize), xdp->data, datasize);
+ skb_reserve(skb, rx_buf->xdp->data - rx_buf->xdp->data_hard_start);
+ memcpy(__skb_put(skb, datasize), rx_buf->xdp->data, datasize);
if (metasize)
skb_metadata_set(skb, metasize);
- ice_reuse_rx_buf_zc(rx_ring, rx_buf);
-
+ xsk_buff_free(rx_buf->xdp);
+ rx_buf->xdp = NULL;
return skb;
}
}
act = bpf_prog_run_xdp(xdp_prog, xdp);
- xdp->handle += xdp->data - xdp->data_hard_start;
switch (act) {
case XDP_PASS:
break;
{
unsigned int total_rx_bytes = 0, total_rx_packets = 0;
u16 cleaned_count = ICE_DESC_UNUSED(rx_ring);
- struct xdp_umem *umem = rx_ring->xsk_umem;
unsigned int xdp_xmit = 0;
bool failure = false;
- struct xdp_buff xdp;
-
- xdp.rxq = &rx_ring->xdp_rxq;
- xdp.frame_sz = xsk_umem_xdp_frame_sz(umem);
while (likely(total_rx_packets < (unsigned int)budget)) {
union ice_32b_rx_flex_desc *rx_desc;
u8 rx_ptype;
if (cleaned_count >= ICE_RX_BUF_WRITE) {
- failure |= ice_alloc_rx_bufs_fast_zc(rx_ring,
- cleaned_count);
+ failure |= ice_alloc_rx_bufs_zc(rx_ring,
+ cleaned_count);
cleaned_count = 0;
}
if (!size)
break;
- rx_buf = ice_get_rx_buf_zc(rx_ring, size);
- if (!rx_buf->addr)
- break;
- xdp.data = rx_buf->addr;
- xdp.data_meta = xdp.data;
- xdp.data_hard_start = xdp.data - XDP_PACKET_HEADROOM;
- xdp.data_end = xdp.data + size;
- xdp.handle = rx_buf->handle;
+ rx_buf = &rx_ring->rx_buf[rx_ring->next_to_clean];
+ rx_buf->xdp->data_end = rx_buf->xdp->data + size;
+ xsk_buff_dma_sync_for_cpu(rx_buf->xdp);
- xdp_res = ice_run_xdp_zc(rx_ring, &xdp);
+ xdp_res = ice_run_xdp_zc(rx_ring, rx_buf->xdp);
if (xdp_res) {
- if (xdp_res & (ICE_XDP_TX | ICE_XDP_REDIR)) {
+ if (xdp_res & (ICE_XDP_TX | ICE_XDP_REDIR))
xdp_xmit |= xdp_res;
- rx_buf->addr = NULL;
- } else {
- ice_reuse_rx_buf_zc(rx_ring, rx_buf);
- }
+ else
+ xsk_buff_free(rx_buf->xdp);
+ rx_buf->xdp = NULL;
total_rx_bytes += size;
total_rx_packets++;
cleaned_count++;
}
/* XDP_PASS path */
- skb = ice_construct_skb_zc(rx_ring, rx_buf, &xdp);
+ skb = ice_construct_skb_zc(rx_ring, rx_buf);
if (!skb) {
rx_ring->rx_stats.alloc_buf_failed++;
break;
if (!xsk_umem_consume_tx(xdp_ring->xsk_umem, &desc))
break;
- dma = xdp_umem_get_dma(xdp_ring->xsk_umem, desc.addr);
-
- dma_sync_single_for_device(xdp_ring->dev, dma, desc.len,
- DMA_BIDIRECTIONAL);
+ dma = xsk_buff_raw_get_dma(xdp_ring->xsk_umem, desc.addr);
+ xsk_buff_raw_dma_sync_for_device(xdp_ring->xsk_umem, dma,
+ desc.len);
tx_buf->bytecount = desc.len;
for (i = 0; i < rx_ring->count; i++) {
struct ice_rx_buf *rx_buf = &rx_ring->rx_buf[i];
- if (!rx_buf->addr)
+ if (!rx_buf->xdp)
continue;
- xsk_umem_fq_reuse(rx_ring->xsk_umem, rx_buf->handle);
- rx_buf->addr = NULL;
+ rx_buf->xdp = NULL;
}
}
#ifdef CONFIG_XDP_SOCKETS
int ice_xsk_umem_setup(struct ice_vsi *vsi, struct xdp_umem *umem, u16 qid);
-void ice_zca_free(struct zero_copy_allocator *zca, unsigned long handle);
int ice_clean_rx_irq_zc(struct ice_ring *rx_ring, int budget);
bool ice_clean_tx_irq_zc(struct ice_ring *xdp_ring, int budget);
int ice_xsk_wakeup(struct net_device *netdev, u32 queue_id, u32 flags);
-bool ice_alloc_rx_bufs_slow_zc(struct ice_ring *rx_ring, u16 count);
+bool ice_alloc_rx_bufs_zc(struct ice_ring *rx_ring, u16 count);
bool ice_xsk_any_rx_ring_ena(struct ice_vsi *vsi);
void ice_xsk_clean_rx_ring(struct ice_ring *rx_ring);
void ice_xsk_clean_xdp_ring(struct ice_ring *xdp_ring);
return -EOPNOTSUPP;
}
-static inline void
-ice_zca_free(struct zero_copy_allocator __always_unused *zca,
- unsigned long __always_unused handle)
-{
-}
-
static inline int
ice_clean_rx_irq_zc(struct ice_ring __always_unused *rx_ring,
int __always_unused budget)
}
static inline bool
-ice_alloc_rx_bufs_slow_zc(struct ice_ring __always_unused *rx_ring,
- u16 __always_unused count)
+ice_alloc_rx_bufs_zc(struct ice_ring __always_unused *rx_ring,
+ u16 __always_unused count)
{
return false;
}
u32 speed;
u32 supported, advertising;
- status = rd32(E1000_STATUS);
+ status = pm_runtime_suspended(&adapter->pdev->dev) ?
+ 0 : rd32(E1000_STATUS);
if (hw->phy.media_type == e1000_media_type_copper) {
supported = (SUPPORTED_10baseT_Half |
#include "igc_hw.h"
-/* forward declaration */
-void igc_set_ethtool_ops(struct net_device *);
+void igc_ethtool_set_ops(struct net_device *);
/* Transmit and receive queues */
#define IGC_MAX_RX_QUEUES 4
#define MAX_ETYPE_FILTER 8
#define IGC_RETA_SIZE 128
+enum igc_mac_filter_type {
+ IGC_MAC_FILTER_TYPE_DST = 0,
+ IGC_MAC_FILTER_TYPE_SRC
+};
+
struct igc_tx_queue_stats {
u64 packets;
u64 bytes;
u32 rss_queues;
u32 rss_indir_tbl_init;
- /* RX network flow classification support */
- struct hlist_head nfc_filter_list;
- unsigned int nfc_filter_count;
-
- /* lock for RX network flow classification filter */
- spinlock_t nfc_lock;
-
- struct igc_mac_addr *mac_table;
+ /* Any access to elements in nfc_rule_list is protected by the
+ * nfc_rule_lock.
+ */
+ struct mutex nfc_rule_lock;
+ struct list_head nfc_rule_list;
+ unsigned int nfc_rule_count;
u8 rss_indir_tbl[IGC_RETA_SIZE];
bool igc_has_link(struct igc_adapter *adapter);
void igc_reset(struct igc_adapter *adapter);
int igc_set_spd_dplx(struct igc_adapter *adapter, u32 spd, u8 dplx);
-int igc_add_mac_filter(struct igc_adapter *adapter, const u8 *addr,
- const s8 queue, const u8 flags);
-int igc_del_mac_filter(struct igc_adapter *adapter, const u8 *addr,
- const u8 flags);
-int igc_add_vlan_prio_filter(struct igc_adapter *adapter, int prio,
- int queue);
-void igc_del_vlan_prio_filter(struct igc_adapter *adapter, int prio);
-int igc_add_etype_filter(struct igc_adapter *adapter, u16 etype, int queue);
-int igc_del_etype_filter(struct igc_adapter *adapter, u16 etype);
void igc_update_stats(struct igc_adapter *adapter);
/* igc_dump declarations */
IGC_FILTER_FLAG_DST_MAC_ADDR = 0x8,
};
-/* RX network flow classification data structure */
-struct igc_nfc_input {
- /* Byte layout in order, all values with MSB first:
- * match_flags - 1 byte
- * etype - 2 bytes
- * vlan_tci - 2 bytes
- */
+struct igc_nfc_filter {
u8 match_flags;
- __be16 etype;
- __be16 vlan_tci;
+ u16 etype;
+ u16 vlan_tci;
u8 src_addr[ETH_ALEN];
u8 dst_addr[ETH_ALEN];
};
-struct igc_nfc_filter {
- struct hlist_node nfc_node;
- struct igc_nfc_input filter;
- unsigned long cookie;
- u16 sw_idx;
+struct igc_nfc_rule {
+ struct list_head list;
+ struct igc_nfc_filter filter;
+ u32 location;
u16 action;
};
-struct igc_mac_addr {
- u8 addr[ETH_ALEN];
- s8 queue;
- u8 state; /* bitmask */
-};
-
-#define IGC_MAC_STATE_DEFAULT 0x1
-#define IGC_MAC_STATE_IN_USE 0x2
-#define IGC_MAC_STATE_SRC_ADDR 0x4
-
-#define IGC_MAX_RXNFC_FILTERS 16
+#define IGC_MAX_RXNFC_RULES 16
/* igc_desc_unused - calculate if we have unused descriptors */
static inline u16 igc_desc_unused(const struct igc_ring *ring)
return 0;
}
-/* forward declaration */
void igc_reinit_locked(struct igc_adapter *);
-int igc_add_filter(struct igc_adapter *adapter,
- struct igc_nfc_filter *input);
-int igc_erase_filter(struct igc_adapter *adapter,
- struct igc_nfc_filter *input);
+struct igc_nfc_rule *igc_get_nfc_rule(struct igc_adapter *adapter,
+ u32 location);
+int igc_add_nfc_rule(struct igc_adapter *adapter, struct igc_nfc_rule *rule);
+void igc_del_nfc_rule(struct igc_adapter *adapter, struct igc_nfc_rule *rule);
void igc_ptp_init(struct igc_adapter *adapter);
void igc_ptp_reset(struct igc_adapter *adapter);
* (RAR[15]) for our directed address used by controllers with
* manageability enabled, allowing us room for 15 multicast addresses.
*/
+#define IGC_RAH_RAH_MASK 0x0000FFFF
+#define IGC_RAH_ASEL_MASK 0x00030000
+#define IGC_RAH_ASEL_SRC_ADDR BIT(16)
#define IGC_RAH_QSEL_MASK 0x000C0000
#define IGC_RAH_QSEL_SHIFT 18
#define IGC_RAH_QSEL_ENABLE BIT(28)
#define IGC_TXD_POPTS_IXSM 0x01 /* Insert IP checksum */
#define IGC_TXD_POPTS_TXSM 0x02 /* Insert TCP/UDP checksum */
#define IGC_TXD_CMD_EOP 0x01000000 /* End of Packet */
-#define IGC_TXD_CMD_IFCS 0x02000000 /* Insert FCS (Ethernet CRC) */
#define IGC_TXD_CMD_IC 0x04000000 /* Insert Checksum */
-#define IGC_TXD_CMD_RS 0x08000000 /* Report Status */
-#define IGC_TXD_CMD_RPS 0x10000000 /* Report Packet Sent */
#define IGC_TXD_CMD_DEXT 0x20000000 /* Desc extension (0 = legacy) */
#define IGC_TXD_CMD_VLE 0x40000000 /* Add VLAN tag */
-#define IGC_TXD_CMD_IDE 0x80000000 /* Enable Tidv register */
#define IGC_TXD_STAT_DD 0x00000001 /* Descriptor Done */
#define IGC_TXD_STAT_EC 0x00000002 /* Excess Collisions */
#define IGC_TXD_STAT_LC 0x00000004 /* Late Collisions */
#define IGC_PRIV_FLAGS_STR_LEN ARRAY_SIZE(igc_priv_flags_strings)
-static void igc_get_drvinfo(struct net_device *netdev,
- struct ethtool_drvinfo *drvinfo)
+static void igc_ethtool_get_drvinfo(struct net_device *netdev,
+ struct ethtool_drvinfo *drvinfo)
{
struct igc_adapter *adapter = netdev_priv(netdev);
drvinfo->n_priv_flags = IGC_PRIV_FLAGS_STR_LEN;
}
-static int igc_get_regs_len(struct net_device *netdev)
+static int igc_ethtool_get_regs_len(struct net_device *netdev)
{
return IGC_REGS_LEN * sizeof(u32);
}
-static void igc_get_regs(struct net_device *netdev,
- struct ethtool_regs *regs, void *p)
+static void igc_ethtool_get_regs(struct net_device *netdev,
+ struct ethtool_regs *regs, void *p)
{
struct igc_adapter *adapter = netdev_priv(netdev);
struct igc_hw *hw = &adapter->hw;
regs_buff[205 + i] = rd32(IGC_ETQF(i));
}
-static void igc_get_wol(struct net_device *netdev, struct ethtool_wolinfo *wol)
+static void igc_ethtool_get_wol(struct net_device *netdev,
+ struct ethtool_wolinfo *wol)
{
struct igc_adapter *adapter = netdev_priv(netdev);
wol->wolopts |= WAKE_PHY;
}
-static int igc_set_wol(struct net_device *netdev, struct ethtool_wolinfo *wol)
+static int igc_ethtool_set_wol(struct net_device *netdev,
+ struct ethtool_wolinfo *wol)
{
struct igc_adapter *adapter = netdev_priv(netdev);
return 0;
}
-static u32 igc_get_msglevel(struct net_device *netdev)
+static u32 igc_ethtool_get_msglevel(struct net_device *netdev)
{
struct igc_adapter *adapter = netdev_priv(netdev);
return adapter->msg_enable;
}
-static void igc_set_msglevel(struct net_device *netdev, u32 data)
+static void igc_ethtool_set_msglevel(struct net_device *netdev, u32 data)
{
struct igc_adapter *adapter = netdev_priv(netdev);
adapter->msg_enable = data;
}
-static int igc_nway_reset(struct net_device *netdev)
+static int igc_ethtool_nway_reset(struct net_device *netdev)
{
struct igc_adapter *adapter = netdev_priv(netdev);
return 0;
}
-static u32 igc_get_link(struct net_device *netdev)
+static u32 igc_ethtool_get_link(struct net_device *netdev)
{
struct igc_adapter *adapter = netdev_priv(netdev);
struct igc_mac_info *mac = &adapter->hw.mac;
return igc_has_link(adapter);
}
-static int igc_get_eeprom_len(struct net_device *netdev)
+static int igc_ethtool_get_eeprom_len(struct net_device *netdev)
{
struct igc_adapter *adapter = netdev_priv(netdev);
return adapter->hw.nvm.word_size * 2;
}
-static int igc_get_eeprom(struct net_device *netdev,
- struct ethtool_eeprom *eeprom, u8 *bytes)
+static int igc_ethtool_get_eeprom(struct net_device *netdev,
+ struct ethtool_eeprom *eeprom, u8 *bytes)
{
struct igc_adapter *adapter = netdev_priv(netdev);
struct igc_hw *hw = &adapter->hw;
return ret_val;
}
-static int igc_set_eeprom(struct net_device *netdev,
- struct ethtool_eeprom *eeprom, u8 *bytes)
+static int igc_ethtool_set_eeprom(struct net_device *netdev,
+ struct ethtool_eeprom *eeprom, u8 *bytes)
{
struct igc_adapter *adapter = netdev_priv(netdev);
struct igc_hw *hw = &adapter->hw;
return ret_val;
}
-static void igc_get_ringparam(struct net_device *netdev,
- struct ethtool_ringparam *ring)
+static void igc_ethtool_get_ringparam(struct net_device *netdev,
+ struct ethtool_ringparam *ring)
{
struct igc_adapter *adapter = netdev_priv(netdev);
ring->tx_pending = adapter->tx_ring_count;
}
-static int igc_set_ringparam(struct net_device *netdev,
- struct ethtool_ringparam *ring)
+static int igc_ethtool_set_ringparam(struct net_device *netdev,
+ struct ethtool_ringparam *ring)
{
struct igc_adapter *adapter = netdev_priv(netdev);
struct igc_ring *temp_ring;
return err;
}
-static void igc_get_pauseparam(struct net_device *netdev,
- struct ethtool_pauseparam *pause)
+static void igc_ethtool_get_pauseparam(struct net_device *netdev,
+ struct ethtool_pauseparam *pause)
{
struct igc_adapter *adapter = netdev_priv(netdev);
struct igc_hw *hw = &adapter->hw;
}
}
-static int igc_set_pauseparam(struct net_device *netdev,
- struct ethtool_pauseparam *pause)
+static int igc_ethtool_set_pauseparam(struct net_device *netdev,
+ struct ethtool_pauseparam *pause)
{
struct igc_adapter *adapter = netdev_priv(netdev);
struct igc_hw *hw = &adapter->hw;
return retval;
}
-static void igc_get_strings(struct net_device *netdev, u32 stringset, u8 *data)
+static void igc_ethtool_get_strings(struct net_device *netdev, u32 stringset,
+ u8 *data)
{
struct igc_adapter *adapter = netdev_priv(netdev);
u8 *p = data;
}
}
-static int igc_get_sset_count(struct net_device *netdev, int sset)
+static int igc_ethtool_get_sset_count(struct net_device *netdev, int sset)
{
switch (sset) {
case ETH_SS_STATS:
}
}
-static void igc_get_ethtool_stats(struct net_device *netdev,
+static void igc_ethtool_get_stats(struct net_device *netdev,
struct ethtool_stats *stats, u64 *data)
{
struct igc_adapter *adapter = netdev_priv(netdev);
spin_unlock(&adapter->stats64_lock);
}
-static int igc_get_coalesce(struct net_device *netdev,
- struct ethtool_coalesce *ec)
+static int igc_ethtool_get_coalesce(struct net_device *netdev,
+ struct ethtool_coalesce *ec)
{
struct igc_adapter *adapter = netdev_priv(netdev);
return 0;
}
-static int igc_set_coalesce(struct net_device *netdev,
- struct ethtool_coalesce *ec)
+static int igc_ethtool_set_coalesce(struct net_device *netdev,
+ struct ethtool_coalesce *ec)
{
struct igc_adapter *adapter = netdev_priv(netdev);
int i;
}
#define ETHER_TYPE_FULL_MASK ((__force __be16)~0)
-static int igc_get_ethtool_nfc_entry(struct igc_adapter *adapter,
- struct ethtool_rxnfc *cmd)
+static int igc_ethtool_get_nfc_rule(struct igc_adapter *adapter,
+ struct ethtool_rxnfc *cmd)
{
struct ethtool_rx_flow_spec *fsp = &cmd->fs;
- struct igc_nfc_filter *rule = NULL;
+ struct igc_nfc_rule *rule = NULL;
- /* report total rule count */
- cmd->data = IGC_MAX_RXNFC_FILTERS;
+ cmd->data = IGC_MAX_RXNFC_RULES;
- hlist_for_each_entry(rule, &adapter->nfc_filter_list, nfc_node) {
- if (fsp->location <= rule->sw_idx)
- break;
+ mutex_lock(&adapter->nfc_rule_lock);
+
+ rule = igc_get_nfc_rule(adapter, fsp->location);
+ if (!rule)
+ goto out;
+
+ fsp->flow_type = ETHER_FLOW;
+ fsp->ring_cookie = rule->action;
+
+ if (rule->filter.match_flags & IGC_FILTER_FLAG_ETHER_TYPE) {
+ fsp->h_u.ether_spec.h_proto = htons(rule->filter.etype);
+ fsp->m_u.ether_spec.h_proto = ETHER_TYPE_FULL_MASK;
}
- if (!rule || fsp->location != rule->sw_idx)
- return -EINVAL;
+ if (rule->filter.match_flags & IGC_FILTER_FLAG_VLAN_TCI) {
+ fsp->flow_type |= FLOW_EXT;
+ fsp->h_ext.vlan_tci = htons(rule->filter.vlan_tci);
+ fsp->m_ext.vlan_tci = htons(VLAN_PRIO_MASK);
+ }
- if (rule->filter.match_flags) {
- fsp->flow_type = ETHER_FLOW;
- fsp->ring_cookie = rule->action;
- if (rule->filter.match_flags & IGC_FILTER_FLAG_ETHER_TYPE) {
- fsp->h_u.ether_spec.h_proto = rule->filter.etype;
- fsp->m_u.ether_spec.h_proto = ETHER_TYPE_FULL_MASK;
- }
- if (rule->filter.match_flags & IGC_FILTER_FLAG_VLAN_TCI) {
- fsp->flow_type |= FLOW_EXT;
- fsp->h_ext.vlan_tci = rule->filter.vlan_tci;
- fsp->m_ext.vlan_tci = htons(VLAN_PRIO_MASK);
- }
- if (rule->filter.match_flags & IGC_FILTER_FLAG_DST_MAC_ADDR) {
- ether_addr_copy(fsp->h_u.ether_spec.h_dest,
- rule->filter.dst_addr);
- /* As we only support matching by the full
- * mask, return the mask to userspace
- */
- eth_broadcast_addr(fsp->m_u.ether_spec.h_dest);
- }
- if (rule->filter.match_flags & IGC_FILTER_FLAG_SRC_MAC_ADDR) {
- ether_addr_copy(fsp->h_u.ether_spec.h_source,
- rule->filter.src_addr);
- /* As we only support matching by the full
- * mask, return the mask to userspace
- */
- eth_broadcast_addr(fsp->m_u.ether_spec.h_source);
- }
+ if (rule->filter.match_flags & IGC_FILTER_FLAG_DST_MAC_ADDR) {
+ ether_addr_copy(fsp->h_u.ether_spec.h_dest,
+ rule->filter.dst_addr);
+ eth_broadcast_addr(fsp->m_u.ether_spec.h_dest);
+ }
- return 0;
+ if (rule->filter.match_flags & IGC_FILTER_FLAG_SRC_MAC_ADDR) {
+ ether_addr_copy(fsp->h_u.ether_spec.h_source,
+ rule->filter.src_addr);
+ eth_broadcast_addr(fsp->m_u.ether_spec.h_source);
}
+
+ mutex_unlock(&adapter->nfc_rule_lock);
+ return 0;
+
+out:
+ mutex_unlock(&adapter->nfc_rule_lock);
return -EINVAL;
}
-static int igc_get_ethtool_nfc_all(struct igc_adapter *adapter,
- struct ethtool_rxnfc *cmd,
- u32 *rule_locs)
+static int igc_ethtool_get_nfc_rules(struct igc_adapter *adapter,
+ struct ethtool_rxnfc *cmd,
+ u32 *rule_locs)
{
- struct igc_nfc_filter *rule;
+ struct igc_nfc_rule *rule;
int cnt = 0;
- /* report total rule count */
- cmd->data = IGC_MAX_RXNFC_FILTERS;
+ cmd->data = IGC_MAX_RXNFC_RULES;
+
+ mutex_lock(&adapter->nfc_rule_lock);
- hlist_for_each_entry(rule, &adapter->nfc_filter_list, nfc_node) {
- if (cnt == cmd->rule_cnt)
+ list_for_each_entry(rule, &adapter->nfc_rule_list, list) {
+ if (cnt == cmd->rule_cnt) {
+ mutex_unlock(&adapter->nfc_rule_lock);
return -EMSGSIZE;
- rule_locs[cnt] = rule->sw_idx;
+ }
+ rule_locs[cnt] = rule->location;
cnt++;
}
+ mutex_unlock(&adapter->nfc_rule_lock);
+
cmd->rule_cnt = cnt;
return 0;
}
-static int igc_get_rss_hash_opts(struct igc_adapter *adapter,
- struct ethtool_rxnfc *cmd)
+static int igc_ethtool_get_rss_hash_opts(struct igc_adapter *adapter,
+ struct ethtool_rxnfc *cmd)
{
cmd->data = 0;
return 0;
}
-static int igc_get_rxnfc(struct net_device *dev, struct ethtool_rxnfc *cmd,
- u32 *rule_locs)
+static int igc_ethtool_get_rxnfc(struct net_device *dev,
+ struct ethtool_rxnfc *cmd, u32 *rule_locs)
{
struct igc_adapter *adapter = netdev_priv(dev);
- int ret = -EOPNOTSUPP;
switch (cmd->cmd) {
case ETHTOOL_GRXRINGS:
cmd->data = adapter->num_rx_queues;
- ret = 0;
- break;
+ return 0;
case ETHTOOL_GRXCLSRLCNT:
- cmd->rule_cnt = adapter->nfc_filter_count;
- ret = 0;
- break;
+ cmd->rule_cnt = adapter->nfc_rule_count;
+ return 0;
case ETHTOOL_GRXCLSRULE:
- ret = igc_get_ethtool_nfc_entry(adapter, cmd);
- break;
+ return igc_ethtool_get_nfc_rule(adapter, cmd);
case ETHTOOL_GRXCLSRLALL:
- ret = igc_get_ethtool_nfc_all(adapter, cmd, rule_locs);
- break;
+ return igc_ethtool_get_nfc_rules(adapter, cmd, rule_locs);
case ETHTOOL_GRXFH:
- ret = igc_get_rss_hash_opts(adapter, cmd);
- break;
+ return igc_ethtool_get_rss_hash_opts(adapter, cmd);
default:
- break;
+ return -EOPNOTSUPP;
}
-
- return ret;
}
#define UDP_RSS_FLAGS (IGC_FLAG_RSS_FIELD_IPV4_UDP | \
IGC_FLAG_RSS_FIELD_IPV6_UDP)
-static int igc_set_rss_hash_opt(struct igc_adapter *adapter,
- struct ethtool_rxnfc *nfc)
+static int igc_ethtool_set_rss_hash_opt(struct igc_adapter *adapter,
+ struct ethtool_rxnfc *nfc)
{
u32 flags = adapter->flags;
return 0;
}
-int igc_add_filter(struct igc_adapter *adapter, struct igc_nfc_filter *input)
+static void igc_ethtool_init_nfc_rule(struct igc_nfc_rule *rule,
+ const struct ethtool_rx_flow_spec *fsp)
{
- struct igc_hw *hw = &adapter->hw;
- int err = -EINVAL;
-
- if (hw->mac.type == igc_i225 &&
- !(input->filter.match_flags & ~IGC_FILTER_FLAG_SRC_MAC_ADDR)) {
- netdev_err(adapter->netdev,
- "i225 doesn't support flow classification rules specifying only source addresses\n");
- return -EOPNOTSUPP;
- }
+ INIT_LIST_HEAD(&rule->list);
- if (input->filter.match_flags & IGC_FILTER_FLAG_ETHER_TYPE) {
- u16 etype = ntohs(input->filter.etype);
+ rule->action = fsp->ring_cookie;
+ rule->location = fsp->location;
- err = igc_add_etype_filter(adapter, etype, input->action);
- if (err)
- return err;
+ if ((fsp->flow_type & FLOW_EXT) && fsp->m_ext.vlan_tci) {
+ rule->filter.vlan_tci = ntohs(fsp->h_ext.vlan_tci);
+ rule->filter.match_flags |= IGC_FILTER_FLAG_VLAN_TCI;
}
- if (input->filter.match_flags & IGC_FILTER_FLAG_DST_MAC_ADDR) {
- err = igc_add_mac_filter(adapter, input->filter.dst_addr,
- input->action, 0);
- if (err)
- return err;
+ if (fsp->m_u.ether_spec.h_proto == ETHER_TYPE_FULL_MASK) {
+ rule->filter.etype = ntohs(fsp->h_u.ether_spec.h_proto);
+ rule->filter.match_flags = IGC_FILTER_FLAG_ETHER_TYPE;
}
- if (input->filter.match_flags & IGC_FILTER_FLAG_SRC_MAC_ADDR) {
- err = igc_add_mac_filter(adapter, input->filter.src_addr,
- input->action,
- IGC_MAC_STATE_SRC_ADDR);
- if (err)
- return err;
+ /* Both source and destination address filters only support the full
+ * mask.
+ */
+ if (is_broadcast_ether_addr(fsp->m_u.ether_spec.h_source)) {
+ rule->filter.match_flags |= IGC_FILTER_FLAG_SRC_MAC_ADDR;
+ ether_addr_copy(rule->filter.src_addr,
+ fsp->h_u.ether_spec.h_source);
}
- if (input->filter.match_flags & IGC_FILTER_FLAG_VLAN_TCI) {
- int prio = (ntohs(input->filter.vlan_tci) & VLAN_PRIO_MASK) >>
- VLAN_PRIO_SHIFT;
- err = igc_add_vlan_prio_filter(adapter, prio, input->action);
- if (err)
- return err;
+ if (is_broadcast_ether_addr(fsp->m_u.ether_spec.h_dest)) {
+ rule->filter.match_flags |= IGC_FILTER_FLAG_DST_MAC_ADDR;
+ ether_addr_copy(rule->filter.dst_addr,
+ fsp->h_u.ether_spec.h_dest);
}
-
- return 0;
}
-int igc_erase_filter(struct igc_adapter *adapter, struct igc_nfc_filter *input)
+/**
+ * igc_ethtool_check_nfc_rule() - Check if NFC rule is valid
+ * @adapter: Pointer to adapter
+ * @rule: Rule under evaluation
+ *
+ * Rules with both destination and source MAC addresses are considered invalid
+ * since the driver doesn't support them.
+ *
+ * Also, if there is already another rule with the same filter in a different
+ * location, @rule is considered invalid.
+ *
+ * Context: Expects adapter->nfc_rule_lock to be held by caller.
+ *
+ * Return: 0 in case of success, negative errno code otherwise.
+ */
+static int igc_ethtool_check_nfc_rule(struct igc_adapter *adapter,
+ struct igc_nfc_rule *rule)
{
- if (input->filter.match_flags & IGC_FILTER_FLAG_ETHER_TYPE) {
- u16 etype = ntohs(input->filter.etype);
-
- igc_del_etype_filter(adapter, etype);
- }
+ struct net_device *dev = adapter->netdev;
+ u8 flags = rule->filter.match_flags;
+ struct igc_nfc_rule *tmp;
- if (input->filter.match_flags & IGC_FILTER_FLAG_VLAN_TCI) {
- int prio = (ntohs(input->filter.vlan_tci) & VLAN_PRIO_MASK) >>
- VLAN_PRIO_SHIFT;
- igc_del_vlan_prio_filter(adapter, prio);
+ if (!flags) {
+ netdev_dbg(dev, "Rule with no match\n");
+ return -EINVAL;
}
- if (input->filter.match_flags & IGC_FILTER_FLAG_SRC_MAC_ADDR)
- igc_del_mac_filter(adapter, input->filter.src_addr,
- IGC_MAC_STATE_SRC_ADDR);
-
- if (input->filter.match_flags & IGC_FILTER_FLAG_DST_MAC_ADDR)
- igc_del_mac_filter(adapter, input->filter.dst_addr, 0);
-
- return 0;
-}
-
-static int igc_update_ethtool_nfc_entry(struct igc_adapter *adapter,
- struct igc_nfc_filter *input,
- u16 sw_idx)
-{
- struct igc_nfc_filter *rule, *parent;
- int err = -EINVAL;
-
- parent = NULL;
- rule = NULL;
-
- hlist_for_each_entry(rule, &adapter->nfc_filter_list, nfc_node) {
- /* hash found, or no matching entry */
- if (rule->sw_idx >= sw_idx)
- break;
- parent = rule;
+ if (flags & IGC_FILTER_FLAG_DST_MAC_ADDR &&
+ flags & IGC_FILTER_FLAG_SRC_MAC_ADDR) {
+ netdev_dbg(dev, "Filters with both dst and src are not supported\n");
+ return -EOPNOTSUPP;
}
- /* if there is an old rule occupying our place remove it */
- if (rule && rule->sw_idx == sw_idx) {
- if (!input)
- err = igc_erase_filter(adapter, rule);
-
- hlist_del(&rule->nfc_node);
- kfree(rule);
- adapter->nfc_filter_count--;
+ list_for_each_entry(tmp, &adapter->nfc_rule_list, list) {
+ if (!memcmp(&rule->filter, &tmp->filter,
+ sizeof(rule->filter)) &&
+ tmp->location != rule->location) {
+ netdev_dbg(dev, "Rule already exists\n");
+ return -EEXIST;
+ }
}
- /* If no input this was a delete, err should be 0 if a rule was
- * successfully found and removed from the list else -EINVAL
- */
- if (!input)
- return err;
-
- /* initialize node */
- INIT_HLIST_NODE(&input->nfc_node);
-
- /* add filter to the list */
- if (parent)
- hlist_add_behind(&input->nfc_node, &parent->nfc_node);
- else
- hlist_add_head(&input->nfc_node, &adapter->nfc_filter_list);
-
- /* update counts */
- adapter->nfc_filter_count++;
-
return 0;
}
-static int igc_add_ethtool_nfc_entry(struct igc_adapter *adapter,
- struct ethtool_rxnfc *cmd)
+static int igc_ethtool_add_nfc_rule(struct igc_adapter *adapter,
+ struct ethtool_rxnfc *cmd)
{
struct net_device *netdev = adapter->netdev;
struct ethtool_rx_flow_spec *fsp =
(struct ethtool_rx_flow_spec *)&cmd->fs;
- struct igc_nfc_filter *input, *rule;
- int err = 0;
+ struct igc_nfc_rule *rule, *old_rule;
+ int err;
- if (!(netdev->hw_features & NETIF_F_NTUPLE))
+ if (!(netdev->hw_features & NETIF_F_NTUPLE)) {
+ netdev_dbg(netdev, "N-tuple filters disabled\n");
return -EOPNOTSUPP;
+ }
- /* Don't allow programming if the action is a queue greater than
- * the number of online Rx queues.
- */
- if (fsp->ring_cookie == RX_CLS_FLOW_DISC ||
- fsp->ring_cookie >= adapter->num_rx_queues) {
- netdev_err(netdev,
- "ethtool -N: The specified action is invalid\n");
- return -EINVAL;
+ if ((fsp->flow_type & ~FLOW_EXT) != ETHER_FLOW) {
+ netdev_dbg(netdev, "Only ethernet flow type is supported\n");
+ return -EOPNOTSUPP;
}
- /* Don't allow indexes to exist outside of available space */
- if (fsp->location >= IGC_MAX_RXNFC_FILTERS) {
- netdev_err(netdev, "Location out of range\n");
- return -EINVAL;
+ if ((fsp->flow_type & FLOW_EXT) &&
+ fsp->m_ext.vlan_tci != htons(VLAN_PRIO_MASK)) {
+ netdev_dbg(netdev, "VLAN mask not supported\n");
+ return -EOPNOTSUPP;
}
- if ((fsp->flow_type & ~FLOW_EXT) != ETHER_FLOW)
+ if (fsp->ring_cookie >= adapter->num_rx_queues) {
+ netdev_dbg(netdev, "Invalid action\n");
return -EINVAL;
-
- input = kzalloc(sizeof(*input), GFP_KERNEL);
- if (!input)
- return -ENOMEM;
-
- if (fsp->m_u.ether_spec.h_proto == ETHER_TYPE_FULL_MASK) {
- input->filter.etype = fsp->h_u.ether_spec.h_proto;
- input->filter.match_flags = IGC_FILTER_FLAG_ETHER_TYPE;
}
- /* Only support matching addresses by the full mask */
- if (is_broadcast_ether_addr(fsp->m_u.ether_spec.h_source)) {
- input->filter.match_flags |= IGC_FILTER_FLAG_SRC_MAC_ADDR;
- ether_addr_copy(input->filter.src_addr,
- fsp->h_u.ether_spec.h_source);
+ if (fsp->location >= IGC_MAX_RXNFC_RULES) {
+ netdev_dbg(netdev, "Invalid location\n");
+ return -EINVAL;
}
- /* Only support matching addresses by the full mask */
- if (is_broadcast_ether_addr(fsp->m_u.ether_spec.h_dest)) {
- input->filter.match_flags |= IGC_FILTER_FLAG_DST_MAC_ADDR;
- ether_addr_copy(input->filter.dst_addr,
- fsp->h_u.ether_spec.h_dest);
- }
+ rule = kzalloc(sizeof(*rule), GFP_KERNEL);
+ if (!rule)
+ return -ENOMEM;
- if ((fsp->flow_type & FLOW_EXT) && fsp->m_ext.vlan_tci) {
- if (fsp->m_ext.vlan_tci != htons(VLAN_PRIO_MASK)) {
- netdev_dbg(netdev, "VLAN mask not supported\n");
- err = -EOPNOTSUPP;
- goto err_out;
- }
- input->filter.vlan_tci = fsp->h_ext.vlan_tci;
- input->filter.match_flags |= IGC_FILTER_FLAG_VLAN_TCI;
- }
+ igc_ethtool_init_nfc_rule(rule, fsp);
- input->action = fsp->ring_cookie;
- input->sw_idx = fsp->location;
+ mutex_lock(&adapter->nfc_rule_lock);
- spin_lock(&adapter->nfc_lock);
+ err = igc_ethtool_check_nfc_rule(adapter, rule);
+ if (err)
+ goto err;
- hlist_for_each_entry(rule, &adapter->nfc_filter_list, nfc_node) {
- if (!memcmp(&input->filter, &rule->filter,
- sizeof(input->filter))) {
- err = -EEXIST;
- netdev_err(netdev,
- "ethtool: this filter is already set\n");
- goto err_out_w_lock;
- }
- }
+ old_rule = igc_get_nfc_rule(adapter, fsp->location);
+ if (old_rule)
+ igc_del_nfc_rule(adapter, old_rule);
- err = igc_add_filter(adapter, input);
+ err = igc_add_nfc_rule(adapter, rule);
if (err)
- goto err_out_w_lock;
-
- igc_update_ethtool_nfc_entry(adapter, input, input->sw_idx);
+ goto err;
- spin_unlock(&adapter->nfc_lock);
+ mutex_unlock(&adapter->nfc_rule_lock);
return 0;
-err_out_w_lock:
- spin_unlock(&adapter->nfc_lock);
-err_out:
- kfree(input);
+err:
+ mutex_unlock(&adapter->nfc_rule_lock);
+ kfree(rule);
return err;
}
-static int igc_del_ethtool_nfc_entry(struct igc_adapter *adapter,
- struct ethtool_rxnfc *cmd)
+static int igc_ethtool_del_nfc_rule(struct igc_adapter *adapter,
+ struct ethtool_rxnfc *cmd)
{
struct ethtool_rx_flow_spec *fsp =
(struct ethtool_rx_flow_spec *)&cmd->fs;
- int err;
+ struct igc_nfc_rule *rule;
- spin_lock(&adapter->nfc_lock);
- err = igc_update_ethtool_nfc_entry(adapter, NULL, fsp->location);
- spin_unlock(&adapter->nfc_lock);
+ mutex_lock(&adapter->nfc_rule_lock);
- return err;
+ rule = igc_get_nfc_rule(adapter, fsp->location);
+ if (!rule) {
+ mutex_unlock(&adapter->nfc_rule_lock);
+ return -EINVAL;
+ }
+
+ igc_del_nfc_rule(adapter, rule);
+
+ mutex_unlock(&adapter->nfc_rule_lock);
+ return 0;
}
-static int igc_set_rxnfc(struct net_device *dev, struct ethtool_rxnfc *cmd)
+static int igc_ethtool_set_rxnfc(struct net_device *dev,
+ struct ethtool_rxnfc *cmd)
{
struct igc_adapter *adapter = netdev_priv(dev);
- int ret = -EOPNOTSUPP;
switch (cmd->cmd) {
case ETHTOOL_SRXFH:
- ret = igc_set_rss_hash_opt(adapter, cmd);
- break;
+ return igc_ethtool_set_rss_hash_opt(adapter, cmd);
case ETHTOOL_SRXCLSRLINS:
- ret = igc_add_ethtool_nfc_entry(adapter, cmd);
- break;
+ return igc_ethtool_add_nfc_rule(adapter, cmd);
case ETHTOOL_SRXCLSRLDEL:
- ret = igc_del_ethtool_nfc_entry(adapter, cmd);
+ return igc_ethtool_del_nfc_rule(adapter, cmd);
default:
- break;
+ return -EOPNOTSUPP;
}
-
- return ret;
}
void igc_write_rss_indir_tbl(struct igc_adapter *adapter)
}
}
-static u32 igc_get_rxfh_indir_size(struct net_device *netdev)
+static u32 igc_ethtool_get_rxfh_indir_size(struct net_device *netdev)
{
return IGC_RETA_SIZE;
}
-static int igc_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key,
- u8 *hfunc)
+static int igc_ethtool_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key,
+ u8 *hfunc)
{
struct igc_adapter *adapter = netdev_priv(netdev);
int i;
return 0;
}
-static int igc_set_rxfh(struct net_device *netdev, const u32 *indir,
- const u8 *key, const u8 hfunc)
+static int igc_ethtool_set_rxfh(struct net_device *netdev, const u32 *indir,
+ const u8 *key, const u8 hfunc)
{
struct igc_adapter *adapter = netdev_priv(netdev);
u32 num_queues;
return 0;
}
-static unsigned int igc_max_channels(struct igc_adapter *adapter)
-{
- return igc_get_max_rss_queues(adapter);
-}
-
-static void igc_get_channels(struct net_device *netdev,
- struct ethtool_channels *ch)
+static void igc_ethtool_get_channels(struct net_device *netdev,
+ struct ethtool_channels *ch)
{
struct igc_adapter *adapter = netdev_priv(netdev);
/* Report maximum channels */
- ch->max_combined = igc_max_channels(adapter);
+ ch->max_combined = igc_get_max_rss_queues(adapter);
/* Report info for other vector */
if (adapter->flags & IGC_FLAG_HAS_MSIX) {
ch->combined_count = adapter->rss_queues;
}
-static int igc_set_channels(struct net_device *netdev,
- struct ethtool_channels *ch)
+static int igc_ethtool_set_channels(struct net_device *netdev,
+ struct ethtool_channels *ch)
{
struct igc_adapter *adapter = netdev_priv(netdev);
unsigned int count = ch->combined_count;
return -EINVAL;
/* Verify the number of channels doesn't exceed hw limits */
- max_combined = igc_max_channels(adapter);
+ max_combined = igc_get_max_rss_queues(adapter);
if (count > max_combined)
return -EINVAL;
return 0;
}
-static int igc_get_ts_info(struct net_device *dev,
- struct ethtool_ts_info *info)
+static int igc_ethtool_get_ts_info(struct net_device *dev,
+ struct ethtool_ts_info *info)
{
struct igc_adapter *adapter = netdev_priv(dev);
}
}
-static u32 igc_get_priv_flags(struct net_device *netdev)
+static u32 igc_ethtool_get_priv_flags(struct net_device *netdev)
{
struct igc_adapter *adapter = netdev_priv(netdev);
u32 priv_flags = 0;
return priv_flags;
}
-static int igc_set_priv_flags(struct net_device *netdev, u32 priv_flags)
+static int igc_ethtool_set_priv_flags(struct net_device *netdev, u32 priv_flags)
{
struct igc_adapter *adapter = netdev_priv(netdev);
unsigned int flags = adapter->flags;
pm_runtime_put(&adapter->pdev->dev);
}
-static int igc_get_link_ksettings(struct net_device *netdev,
- struct ethtool_link_ksettings *cmd)
+static int igc_ethtool_get_link_ksettings(struct net_device *netdev,
+ struct ethtool_link_ksettings *cmd)
{
struct igc_adapter *adapter = netdev_priv(netdev);
struct igc_hw *hw = &adapter->hw;
return 0;
}
-static int igc_set_link_ksettings(struct net_device *netdev,
- const struct ethtool_link_ksettings *cmd)
+static int
+igc_ethtool_set_link_ksettings(struct net_device *netdev,
+ const struct ethtool_link_ksettings *cmd)
{
struct igc_adapter *adapter = netdev_priv(netdev);
struct net_device *dev = adapter->netdev;
return 0;
}
-static void igc_diag_test(struct net_device *netdev,
- struct ethtool_test *eth_test, u64 *data)
+static void igc_ethtool_diag_test(struct net_device *netdev,
+ struct ethtool_test *eth_test, u64 *data)
{
struct igc_adapter *adapter = netdev_priv(netdev);
bool if_running = netif_running(netdev);
static const struct ethtool_ops igc_ethtool_ops = {
.supported_coalesce_params = ETHTOOL_COALESCE_USECS,
- .get_drvinfo = igc_get_drvinfo,
- .get_regs_len = igc_get_regs_len,
- .get_regs = igc_get_regs,
- .get_wol = igc_get_wol,
- .set_wol = igc_set_wol,
- .get_msglevel = igc_get_msglevel,
- .set_msglevel = igc_set_msglevel,
- .nway_reset = igc_nway_reset,
- .get_link = igc_get_link,
- .get_eeprom_len = igc_get_eeprom_len,
- .get_eeprom = igc_get_eeprom,
- .set_eeprom = igc_set_eeprom,
- .get_ringparam = igc_get_ringparam,
- .set_ringparam = igc_set_ringparam,
- .get_pauseparam = igc_get_pauseparam,
- .set_pauseparam = igc_set_pauseparam,
- .get_strings = igc_get_strings,
- .get_sset_count = igc_get_sset_count,
- .get_ethtool_stats = igc_get_ethtool_stats,
- .get_coalesce = igc_get_coalesce,
- .set_coalesce = igc_set_coalesce,
- .get_rxnfc = igc_get_rxnfc,
- .set_rxnfc = igc_set_rxnfc,
- .get_rxfh_indir_size = igc_get_rxfh_indir_size,
- .get_rxfh = igc_get_rxfh,
- .set_rxfh = igc_set_rxfh,
- .get_ts_info = igc_get_ts_info,
- .get_channels = igc_get_channels,
- .set_channels = igc_set_channels,
- .get_priv_flags = igc_get_priv_flags,
- .set_priv_flags = igc_set_priv_flags,
+ .get_drvinfo = igc_ethtool_get_drvinfo,
+ .get_regs_len = igc_ethtool_get_regs_len,
+ .get_regs = igc_ethtool_get_regs,
+ .get_wol = igc_ethtool_get_wol,
+ .set_wol = igc_ethtool_set_wol,
+ .get_msglevel = igc_ethtool_get_msglevel,
+ .set_msglevel = igc_ethtool_set_msglevel,
+ .nway_reset = igc_ethtool_nway_reset,
+ .get_link = igc_ethtool_get_link,
+ .get_eeprom_len = igc_ethtool_get_eeprom_len,
+ .get_eeprom = igc_ethtool_get_eeprom,
+ .set_eeprom = igc_ethtool_set_eeprom,
+ .get_ringparam = igc_ethtool_get_ringparam,
+ .set_ringparam = igc_ethtool_set_ringparam,
+ .get_pauseparam = igc_ethtool_get_pauseparam,
+ .set_pauseparam = igc_ethtool_set_pauseparam,
+ .get_strings = igc_ethtool_get_strings,
+ .get_sset_count = igc_ethtool_get_sset_count,
+ .get_ethtool_stats = igc_ethtool_get_stats,
+ .get_coalesce = igc_ethtool_get_coalesce,
+ .set_coalesce = igc_ethtool_set_coalesce,
+ .get_rxnfc = igc_ethtool_get_rxnfc,
+ .set_rxnfc = igc_ethtool_set_rxnfc,
+ .get_rxfh_indir_size = igc_ethtool_get_rxfh_indir_size,
+ .get_rxfh = igc_ethtool_get_rxfh,
+ .set_rxfh = igc_ethtool_set_rxfh,
+ .get_ts_info = igc_ethtool_get_ts_info,
+ .get_channels = igc_ethtool_get_channels,
+ .set_channels = igc_ethtool_set_channels,
+ .get_priv_flags = igc_ethtool_get_priv_flags,
+ .set_priv_flags = igc_ethtool_set_priv_flags,
.begin = igc_ethtool_begin,
.complete = igc_ethtool_complete,
- .get_link_ksettings = igc_get_link_ksettings,
- .set_link_ksettings = igc_set_link_ksettings,
- .self_test = igc_diag_test,
+ .get_link_ksettings = igc_ethtool_get_link_ksettings,
+ .set_link_ksettings = igc_ethtool_set_link_ksettings,
+ .self_test = igc_ethtool_diag_test,
};
-void igc_set_ethtool_ops(struct net_device *netdev)
+void igc_ethtool_set_ops(struct net_device *netdev)
{
netdev->ethtool_ops = &igc_ethtool_ops;
}
rd32(IGC_ICTXQMTC);
rd32(IGC_ICRXDMTC);
- rd32(IGC_CBTMPC);
- rd32(IGC_HTDPMC);
- rd32(IGC_CBRMPC);
rd32(IGC_RPTHC);
rd32(IGC_HGPTC);
- rd32(IGC_HTCBDPC);
rd32(IGC_HGORCL);
rd32(IGC_HGORCH);
rd32(IGC_HGOTCL);
* igc_set_mac_filter_hw() - Set MAC address filter in hardware
* @adapter: Pointer to adapter where the filter should be set
* @index: Filter index
- * @addr: Destination MAC address
+ * @type: MAC address filter type (source or destination)
+ * @addr: MAC address
* @queue: If non-negative, queue assignment feature is enabled and frames
* matching the filter are enqueued onto 'queue'. Otherwise, queue
* assignment is disabled.
*/
static void igc_set_mac_filter_hw(struct igc_adapter *adapter, int index,
+ enum igc_mac_filter_type type,
const u8 *addr, int queue)
{
struct net_device *dev = adapter->netdev;
ral = le32_to_cpup((__le32 *)(addr));
rah = le16_to_cpup((__le16 *)(addr + 4));
+ if (type == IGC_MAC_FILTER_TYPE_SRC) {
+ rah &= ~IGC_RAH_ASEL_MASK;
+ rah |= IGC_RAH_ASEL_SRC_ADDR;
+ }
+
if (queue >= 0) {
rah &= ~IGC_RAH_QSEL_MASK;
rah |= (queue << IGC_RAH_QSEL_SHIFT);
/* Set default MAC address for the PF in the first RAR entry */
static void igc_set_default_mac_filter(struct igc_adapter *adapter)
{
- struct igc_mac_addr *mac_table = &adapter->mac_table[0];
struct net_device *dev = adapter->netdev;
u8 *addr = adapter->hw.mac.addr;
netdev_dbg(dev, "Set default MAC address filter: address %pM", addr);
- ether_addr_copy(mac_table->addr, addr);
- mac_table->state = IGC_MAC_STATE_DEFAULT | IGC_MAC_STATE_IN_USE;
- mac_table->queue = -1;
-
- igc_set_mac_filter_hw(adapter, 0, addr, mac_table->queue);
+ igc_set_mac_filter_hw(adapter, 0, IGC_MAC_FILTER_TYPE_DST, addr, -1);
}
/**
return !!budget;
}
-static void igc_nfc_filter_restore(struct igc_adapter *adapter)
-{
- struct igc_nfc_filter *rule;
-
- spin_lock(&adapter->nfc_lock);
-
- hlist_for_each_entry(rule, &adapter->nfc_filter_list, nfc_node)
- igc_add_filter(adapter, rule);
-
- spin_unlock(&adapter->nfc_lock);
-}
-
-static int igc_find_mac_filter(struct igc_adapter *adapter, const u8 *addr,
- u8 flags)
+static int igc_find_mac_filter(struct igc_adapter *adapter,
+ enum igc_mac_filter_type type, const u8 *addr)
{
- int max_entries = adapter->hw.mac.rar_entry_count;
- struct igc_mac_addr *entry;
+ struct igc_hw *hw = &adapter->hw;
+ int max_entries = hw->mac.rar_entry_count;
+ u32 ral, rah;
int i;
for (i = 0; i < max_entries; i++) {
- entry = &adapter->mac_table[i];
+ ral = rd32(IGC_RAL(i));
+ rah = rd32(IGC_RAH(i));
- if (!(entry->state & IGC_MAC_STATE_IN_USE))
+ if (!(rah & IGC_RAH_AV))
continue;
- if (!ether_addr_equal(addr, entry->addr))
+ if (!!(rah & IGC_RAH_ASEL_SRC_ADDR) != type)
continue;
- if ((entry->state & IGC_MAC_STATE_SRC_ADDR) !=
- (flags & IGC_MAC_STATE_SRC_ADDR))
+ if ((rah & IGC_RAH_RAH_MASK) !=
+ le16_to_cpup((__le16 *)(addr + 4)))
+ continue;
+ if (ral != le32_to_cpup((__le32 *)(addr)))
continue;
return i;
static int igc_get_avail_mac_filter_slot(struct igc_adapter *adapter)
{
- int max_entries = adapter->hw.mac.rar_entry_count;
- struct igc_mac_addr *entry;
+ struct igc_hw *hw = &adapter->hw;
+ int max_entries = hw->mac.rar_entry_count;
+ u32 rah;
int i;
for (i = 0; i < max_entries; i++) {
- entry = &adapter->mac_table[i];
+ rah = rd32(IGC_RAH(i));
- if (!(entry->state & IGC_MAC_STATE_IN_USE))
+ if (!(rah & IGC_RAH_AV))
return i;
}
/**
* igc_add_mac_filter() - Add MAC address filter
* @adapter: Pointer to adapter where the filter should be added
+ * @type: MAC address filter type (source or destination)
* @addr: MAC address
* @queue: If non-negative, queue assignment feature is enabled and frames
* matching the filter are enqueued onto 'queue'. Otherwise, queue
* assignment is disabled.
- * @flags: Set IGC_MAC_STATE_SRC_ADDR bit to indicate @address is a source
- * address
*
* Return: 0 in case of success, negative errno code otherwise.
*/
-int igc_add_mac_filter(struct igc_adapter *adapter, const u8 *addr,
- const s8 queue, const u8 flags)
+static int igc_add_mac_filter(struct igc_adapter *adapter,
+ enum igc_mac_filter_type type, const u8 *addr,
+ int queue)
{
struct net_device *dev = adapter->netdev;
int index;
- if (!is_valid_ether_addr(addr))
- return -EINVAL;
- if (flags & IGC_MAC_STATE_SRC_ADDR)
- return -ENOTSUPP;
-
- index = igc_find_mac_filter(adapter, addr, flags);
+ index = igc_find_mac_filter(adapter, type, addr);
if (index >= 0)
- goto update_queue_assignment;
+ goto update_filter;
index = igc_get_avail_mac_filter_slot(adapter);
if (index < 0)
return -ENOSPC;
- netdev_dbg(dev, "Add MAC address filter: index %d address %pM queue %d",
- index, addr, queue);
+ netdev_dbg(dev, "Add MAC address filter: index %d type %s address %pM queue %d\n",
+ index, type == IGC_MAC_FILTER_TYPE_DST ? "dst" : "src",
+ addr, queue);
- ether_addr_copy(adapter->mac_table[index].addr, addr);
- adapter->mac_table[index].state |= IGC_MAC_STATE_IN_USE | flags;
-update_queue_assignment:
- adapter->mac_table[index].queue = queue;
-
- igc_set_mac_filter_hw(adapter, index, addr, queue);
+update_filter:
+ igc_set_mac_filter_hw(adapter, index, type, addr, queue);
return 0;
}
/**
* igc_del_mac_filter() - Delete MAC address filter
* @adapter: Pointer to adapter where the filter should be deleted from
+ * @type: MAC address filter type (source or destination)
* @addr: MAC address
- * @flags: Set IGC_MAC_STATE_SRC_ADDR bit to indicate @address is a source
- * address
- *
- * Return: 0 in case of success, negative errno code otherwise.
*/
-int igc_del_mac_filter(struct igc_adapter *adapter, const u8 *addr,
- const u8 flags)
+static void igc_del_mac_filter(struct igc_adapter *adapter,
+ enum igc_mac_filter_type type, const u8 *addr)
{
struct net_device *dev = adapter->netdev;
- struct igc_mac_addr *entry;
int index;
- if (!is_valid_ether_addr(addr))
- return -EINVAL;
-
- index = igc_find_mac_filter(adapter, addr, flags);
+ index = igc_find_mac_filter(adapter, type, addr);
if (index < 0)
- return -ENOENT;
-
- entry = &adapter->mac_table[index];
+ return;
- if (entry->state & IGC_MAC_STATE_DEFAULT) {
+ if (index == 0) {
/* If this is the default filter, we don't actually delete it.
* We just reset to its default value i.e. disable queue
* assignment.
*/
netdev_dbg(dev, "Disable default MAC filter queue assignment");
- entry->queue = -1;
- igc_set_mac_filter_hw(adapter, 0, addr, entry->queue);
+ igc_set_mac_filter_hw(adapter, 0, type, addr, -1);
} else {
- netdev_dbg(dev, "Delete MAC address filter: index %d address %pM",
- index, addr);
+ netdev_dbg(dev, "Delete MAC address filter: index %d type %s address %pM\n",
+ index,
+ type == IGC_MAC_FILTER_TYPE_DST ? "dst" : "src",
+ addr);
- entry->state = 0;
- entry->queue = -1;
- memset(entry->addr, 0, ETH_ALEN);
igc_clear_mac_filter_hw(adapter, index);
}
-
- return 0;
}
/**
*
* Return: 0 in case of success, negative errno code otherwise.
*/
-int igc_add_vlan_prio_filter(struct igc_adapter *adapter, int prio, int queue)
+static int igc_add_vlan_prio_filter(struct igc_adapter *adapter, int prio,
+ int queue)
{
struct net_device *dev = adapter->netdev;
struct igc_hw *hw = &adapter->hw;
* @adapter: Pointer to adapter where the filter should be deleted from
* @prio: VLAN priority value
*/
-void igc_del_vlan_prio_filter(struct igc_adapter *adapter, int prio)
+static void igc_del_vlan_prio_filter(struct igc_adapter *adapter, int prio)
{
struct igc_hw *hw = &adapter->hw;
u32 vlanpqf;
*
* Return: 0 in case of success, negative errno code otherwise.
*/
-int igc_add_etype_filter(struct igc_adapter *adapter, u16 etype, int queue)
+static int igc_add_etype_filter(struct igc_adapter *adapter, u16 etype,
+ int queue)
{
struct igc_hw *hw = &adapter->hw;
int index;
* igc_del_etype_filter() - Delete ethertype filter
* @adapter: Pointer to adapter where the filter should be deleted from
* @etype: Ethertype value
- *
- * Return: 0 in case of success, negative errno code otherwise.
*/
-int igc_del_etype_filter(struct igc_adapter *adapter, u16 etype)
+static void igc_del_etype_filter(struct igc_adapter *adapter, u16 etype)
{
struct igc_hw *hw = &adapter->hw;
int index;
index = igc_find_etype_filter(adapter, etype);
if (index < 0)
- return -ENOENT;
+ return;
wr32(IGC_ETQF(index), 0);
netdev_dbg(adapter->netdev, "Delete ethertype filter: etype %04x\n",
etype);
+}
+
+static int igc_enable_nfc_rule(struct igc_adapter *adapter,
+ const struct igc_nfc_rule *rule)
+{
+ int err;
+
+ if (rule->filter.match_flags & IGC_FILTER_FLAG_ETHER_TYPE) {
+ err = igc_add_etype_filter(adapter, rule->filter.etype,
+ rule->action);
+ if (err)
+ return err;
+ }
+
+ if (rule->filter.match_flags & IGC_FILTER_FLAG_SRC_MAC_ADDR) {
+ err = igc_add_mac_filter(adapter, IGC_MAC_FILTER_TYPE_SRC,
+ rule->filter.src_addr, rule->action);
+ if (err)
+ return err;
+ }
+
+ if (rule->filter.match_flags & IGC_FILTER_FLAG_DST_MAC_ADDR) {
+ err = igc_add_mac_filter(adapter, IGC_MAC_FILTER_TYPE_DST,
+ rule->filter.dst_addr, rule->action);
+ if (err)
+ return err;
+ }
+
+ if (rule->filter.match_flags & IGC_FILTER_FLAG_VLAN_TCI) {
+ int prio = (rule->filter.vlan_tci & VLAN_PRIO_MASK) >>
+ VLAN_PRIO_SHIFT;
+
+ err = igc_add_vlan_prio_filter(adapter, prio, rule->action);
+ if (err)
+ return err;
+ }
+
return 0;
}
+static void igc_disable_nfc_rule(struct igc_adapter *adapter,
+ const struct igc_nfc_rule *rule)
+{
+ if (rule->filter.match_flags & IGC_FILTER_FLAG_ETHER_TYPE)
+ igc_del_etype_filter(adapter, rule->filter.etype);
+
+ if (rule->filter.match_flags & IGC_FILTER_FLAG_VLAN_TCI) {
+ int prio = (rule->filter.vlan_tci & VLAN_PRIO_MASK) >>
+ VLAN_PRIO_SHIFT;
+
+ igc_del_vlan_prio_filter(adapter, prio);
+ }
+
+ if (rule->filter.match_flags & IGC_FILTER_FLAG_SRC_MAC_ADDR)
+ igc_del_mac_filter(adapter, IGC_MAC_FILTER_TYPE_SRC,
+ rule->filter.src_addr);
+
+ if (rule->filter.match_flags & IGC_FILTER_FLAG_DST_MAC_ADDR)
+ igc_del_mac_filter(adapter, IGC_MAC_FILTER_TYPE_DST,
+ rule->filter.dst_addr);
+}
+
+/**
+ * igc_get_nfc_rule() - Get NFC rule
+ * @adapter: Pointer to adapter
+ * @location: Rule location
+ *
+ * Context: Expects adapter->nfc_rule_lock to be held by caller.
+ *
+ * Return: Pointer to NFC rule at @location. If not found, NULL.
+ */
+struct igc_nfc_rule *igc_get_nfc_rule(struct igc_adapter *adapter,
+ u32 location)
+{
+ struct igc_nfc_rule *rule;
+
+ list_for_each_entry(rule, &adapter->nfc_rule_list, list) {
+ if (rule->location == location)
+ return rule;
+ if (rule->location > location)
+ break;
+ }
+
+ return NULL;
+}
+
+/**
+ * igc_del_nfc_rule() - Delete NFC rule
+ * @adapter: Pointer to adapter
+ * @rule: Pointer to rule to be deleted
+ *
+ * Disable NFC rule in hardware and delete it from adapter.
+ *
+ * Context: Expects adapter->nfc_rule_lock to be held by caller.
+ */
+void igc_del_nfc_rule(struct igc_adapter *adapter, struct igc_nfc_rule *rule)
+{
+ igc_disable_nfc_rule(adapter, rule);
+
+ list_del(&rule->list);
+ adapter->nfc_rule_count--;
+
+ kfree(rule);
+}
+
+static void igc_flush_nfc_rules(struct igc_adapter *adapter)
+{
+ struct igc_nfc_rule *rule, *tmp;
+
+ mutex_lock(&adapter->nfc_rule_lock);
+
+ list_for_each_entry_safe(rule, tmp, &adapter->nfc_rule_list, list)
+ igc_del_nfc_rule(adapter, rule);
+
+ mutex_unlock(&adapter->nfc_rule_lock);
+}
+
+/**
+ * igc_add_nfc_rule() - Add NFC rule
+ * @adapter: Pointer to adapter
+ * @rule: Pointer to rule to be added
+ *
+ * Enable NFC rule in hardware and add it to adapter.
+ *
+ * Context: Expects adapter->nfc_rule_lock to be held by caller.
+ *
+ * Return: 0 on success, negative errno on failure.
+ */
+int igc_add_nfc_rule(struct igc_adapter *adapter, struct igc_nfc_rule *rule)
+{
+ struct igc_nfc_rule *pred, *cur;
+ int err;
+
+ err = igc_enable_nfc_rule(adapter, rule);
+ if (err)
+ return err;
+
+ pred = NULL;
+ list_for_each_entry(cur, &adapter->nfc_rule_list, list) {
+ if (cur->location >= rule->location)
+ break;
+ pred = cur;
+ }
+
+ list_add(&rule->list, pred ? &pred->list : &adapter->nfc_rule_list);
+ adapter->nfc_rule_count++;
+ return 0;
+}
+
+static void igc_restore_nfc_rules(struct igc_adapter *adapter)
+{
+ struct igc_nfc_rule *rule;
+
+ mutex_lock(&adapter->nfc_rule_lock);
+
+ list_for_each_entry_reverse(rule, &adapter->nfc_rule_list, list)
+ igc_enable_nfc_rule(adapter, rule);
+
+ mutex_unlock(&adapter->nfc_rule_lock);
+}
+
static int igc_uc_sync(struct net_device *netdev, const unsigned char *addr)
{
struct igc_adapter *adapter = netdev_priv(netdev);
- return igc_add_mac_filter(adapter, addr, -1, 0);
+ return igc_add_mac_filter(adapter, IGC_MAC_FILTER_TYPE_DST, addr, -1);
}
static int igc_uc_unsync(struct net_device *netdev, const unsigned char *addr)
{
struct igc_adapter *adapter = netdev_priv(netdev);
- return igc_del_mac_filter(adapter, addr, 0);
+ igc_del_mac_filter(adapter, IGC_MAC_FILTER_TYPE_DST, addr);
+ return 0;
}
/**
igc_setup_rctl(adapter);
igc_set_default_mac_filter(adapter);
- igc_nfc_filter_restore(adapter);
+ igc_restore_nfc_rules(adapter);
igc_configure_tx(adapter);
igc_configure_rx(adapter);
unsigned int igc_get_max_rss_queues(struct igc_adapter *adapter)
{
- unsigned int max_rss_queues;
-
- /* Determine the maximum number of RSS queues supported. */
- max_rss_queues = IGC_MAX_RX_QUEUES;
-
- return max_rss_queues;
+ return IGC_MAX_RX_QUEUES;
}
static void igc_init_queue_configuration(struct igc_adapter *adapter)
struct pci_dev *pdev = adapter->pdev;
struct igc_hw *hw = &adapter->hw;
- int size = sizeof(struct igc_mac_addr) * hw->mac.rar_entry_count;
-
pci_read_config_word(pdev, PCI_COMMAND, &hw->bus.pci_cmd_word);
/* set default ring sizes */
VLAN_HLEN;
adapter->min_frame_size = ETH_ZLEN + ETH_FCS_LEN;
- spin_lock_init(&adapter->nfc_lock);
+ mutex_init(&adapter->nfc_rule_lock);
+ INIT_LIST_HEAD(&adapter->nfc_rule_list);
+ adapter->nfc_rule_count = 0;
+
spin_lock_init(&adapter->stats64_lock);
/* Assume MSI-X interrupts, will be checked during IRQ allocation */
adapter->flags |= IGC_FLAG_HAS_MSIX;
- adapter->mac_table = kzalloc(size, GFP_ATOMIC);
- if (!adapter->mac_table)
- return -ENOMEM;
-
igc_init_queue_configuration(adapter);
/* This call may decrease the number of queues */
adapter->stats.mgpdc += rd32(IGC_MGTPDC);
}
-static void igc_nfc_filter_exit(struct igc_adapter *adapter)
-{
- struct igc_nfc_filter *rule;
-
- spin_lock(&adapter->nfc_lock);
-
- hlist_for_each_entry(rule, &adapter->nfc_filter_list, nfc_node)
- igc_erase_filter(adapter, rule);
-
- spin_unlock(&adapter->nfc_lock);
-}
-
/**
* igc_down - Close the interface
* @adapter: board private structure
wr32(IGC_RCTL, rctl & ~IGC_RCTL_EN);
/* flush and sleep below */
- igc_nfc_filter_exit(adapter);
-
/* set trans_start so we don't get spurious watchdogs during reset */
netif_trans_update(netdev);
if (!(changed & (NETIF_F_RXALL | NETIF_F_NTUPLE)))
return 0;
- if (!(features & NETIF_F_NTUPLE)) {
- struct hlist_node *node2;
- struct igc_nfc_filter *rule;
-
- spin_lock(&adapter->nfc_lock);
- hlist_for_each_entry_safe(rule, node2,
- &adapter->nfc_filter_list, nfc_node) {
- igc_erase_filter(adapter, rule);
- hlist_del(&rule->nfc_node);
- kfree(rule);
- }
- spin_unlock(&adapter->nfc_lock);
- adapter->nfc_filter_count = 0;
- }
+ if (!(features & NETIF_F_NTUPLE))
+ igc_flush_nfc_rules(adapter);
netdev->features = features;
hw->hw_addr = adapter->io_addr;
netdev->netdev_ops = &igc_netdev_ops;
- igc_set_ethtool_ops(netdev);
+ igc_ethtool_set_ops(netdev);
netdev->watchdog_timeo = 5 * HZ;
netdev->mem_start = pci_resource_start(pdev, 0);
pm_runtime_get_noresume(&pdev->dev);
+ igc_flush_nfc_rules(adapter);
+
igc_ptp_stop(adapter);
set_bit(__IGC_DOWN, &adapter->state);
pci_iounmap(pdev, adapter->io_addr);
pci_release_mem_regions(pdev);
- kfree(adapter->mac_table);
free_netdev(netdev);
pci_disable_pcie_error_reporting(pdev);
#define IGC_ICRXDMTC 0x04120 /* Rx Descriptor Min Threshold Count */
#define IGC_ICRXOC 0x04124 /* Receiver Overrun Count */
-#define IGC_CBTMPC 0x0402C /* Circuit Breaker TX Packet Count */
-#define IGC_HTDPMC 0x0403C /* Host Transmit Discarded Packets */
-#define IGC_CBRMPC 0x040FC /* Circuit Breaker RX Packet Count */
-#define IGC_RPTHC 0x04104 /* Rx Packets To Host */
-#define IGC_HGPTC 0x04118 /* Host Good Packets TX Count */
-#define IGC_HTCBDPC 0x04124 /* Host TX Circ.Breaker Drop Count */
-
/* MSI-X Table Register Descriptions */
#define IGC_PBACL 0x05B68 /* MSIx PBA Clear - R/W 1 to clear */
#define IGC_MMDAC 13 /* MMD Access Control */
#define IGC_MMDAAD 14 /* MMD Access Address/Data */
-/* Good transmitted packets counter registers */
-#define IGC_PQGPTC(_n) (0x010014 + (0x100 * (_n)))
-
/* Statistics Register Descriptions */
#define IGC_CRCERRS 0x04000 /* CRC Error Count - R/clr */
#define IGC_ALGNERRC 0x04004 /* Alignment Error Count - R/clr */
#define IGC_HGOTCL 0x04130 /* Host Good Octets Transmit Count Low */
#define IGC_HGOTCH 0x04134 /* Host Good Octets Transmit Count High */
#define IGC_LENERRS 0x04138 /* Length Errors Count */
-#define IGC_HRMPC 0x0A018 /* Header Redirection Missed Packet Count */
/* Time sync registers */
#define IGC_TSICR 0x0B66C /* Time Sync Interrupt Cause */
};
struct ixgbe_rx_buffer {
- struct sk_buff *skb;
- dma_addr_t dma;
union {
struct {
+ struct sk_buff *skb;
+ dma_addr_t dma;
struct page *page;
__u32 page_offset;
__u16 pagecnt_bias;
};
struct {
- void *addr;
- u64 handle;
+ bool discard;
+ struct xdp_buff *xdp;
};
};
};
};
struct xdp_rxq_info xdp_rxq;
struct xdp_umem *xsk_umem;
- struct zero_copy_allocator zca; /* ZC allocator anchor */
u16 ring_idx; /* {rx,tx,xdp}_ring back reference idx */
u16 rx_buf_len;
} ____cacheline_internodealigned_in_smp;
#include <net/tc_act/tc_mirred.h>
#include <net/vxlan.h>
#include <net/mpls.h>
-#include <net/xdp_sock.h>
+#include <net/xdp_sock_drv.h>
#include <net/xfrm.h>
#include "ixgbe.h"
/* configure the packet buffer length */
if (rx_ring->xsk_umem) {
- u32 xsk_buf_len = rx_ring->xsk_umem->chunk_size_nohr -
- XDP_PACKET_HEADROOM;
+ u32 xsk_buf_len = xsk_umem_get_rx_frame_size(rx_ring->xsk_umem);
/* If the MAC support setting RXDCTL.RLPML, the
* SRRCTL[n].BSIZEPKT is set to PAGE_SIZE and
xdp_rxq_info_unreg_mem_model(&ring->xdp_rxq);
ring->xsk_umem = ixgbe_xsk_umem(adapter, ring);
if (ring->xsk_umem) {
- ring->zca.free = ixgbe_zca_free;
WARN_ON(xdp_rxq_info_reg_mem_model(&ring->xdp_rxq,
- MEM_TYPE_ZERO_COPY,
- &ring->zca));
-
+ MEM_TYPE_XSK_BUFF_POOL,
+ NULL));
+ xsk_buff_set_rxq_info(ring->xsk_umem, &ring->xdp_rxq);
} else {
WARN_ON(xdp_rxq_info_reg_mem_model(&ring->xdp_rxq,
MEM_TYPE_PAGE_SHARED, NULL));
}
if (ring->xsk_umem && hw->mac.type != ixgbe_mac_82599EB) {
- u32 xsk_buf_len = ring->xsk_umem->chunk_size_nohr -
- XDP_PACKET_HEADROOM;
+ u32 xsk_buf_len = xsk_umem_get_rx_frame_size(ring->xsk_umem);
rxdctl &= ~(IXGBE_RXDCTL_RLPMLMASK |
IXGBE_RXDCTL_RLPML_EN);
void ixgbe_zca_free(struct zero_copy_allocator *alloc, unsigned long handle);
-void ixgbe_alloc_rx_buffers_zc(struct ixgbe_ring *rx_ring, u16 cleaned_count);
+bool ixgbe_alloc_rx_buffers_zc(struct ixgbe_ring *rx_ring, u16 cleaned_count);
int ixgbe_clean_rx_irq_zc(struct ixgbe_q_vector *q_vector,
struct ixgbe_ring *rx_ring,
const int budget);
/* Copyright(c) 2018 Intel Corporation. */
#include <linux/bpf_trace.h>
-#include <net/xdp_sock.h>
+#include <net/xdp_sock_drv.h>
#include <net/xdp.h>
#include "ixgbe.h"
return xdp_get_umem_from_qid(adapter->netdev, qid);
}
-static int ixgbe_xsk_umem_dma_map(struct ixgbe_adapter *adapter,
- struct xdp_umem *umem)
-{
- struct device *dev = &adapter->pdev->dev;
- unsigned int i, j;
- dma_addr_t dma;
-
- for (i = 0; i < umem->npgs; i++) {
- dma = dma_map_page_attrs(dev, umem->pgs[i], 0, PAGE_SIZE,
- DMA_BIDIRECTIONAL, IXGBE_RX_DMA_ATTR);
- if (dma_mapping_error(dev, dma))
- goto out_unmap;
-
- umem->pages[i].dma = dma;
- }
-
- return 0;
-
-out_unmap:
- for (j = 0; j < i; j++) {
- dma_unmap_page_attrs(dev, umem->pages[i].dma, PAGE_SIZE,
- DMA_BIDIRECTIONAL, IXGBE_RX_DMA_ATTR);
- umem->pages[i].dma = 0;
- }
-
- return -1;
-}
-
-static void ixgbe_xsk_umem_dma_unmap(struct ixgbe_adapter *adapter,
- struct xdp_umem *umem)
-{
- struct device *dev = &adapter->pdev->dev;
- unsigned int i;
-
- for (i = 0; i < umem->npgs; i++) {
- dma_unmap_page_attrs(dev, umem->pages[i].dma, PAGE_SIZE,
- DMA_BIDIRECTIONAL, IXGBE_RX_DMA_ATTR);
-
- umem->pages[i].dma = 0;
- }
-}
-
static int ixgbe_xsk_umem_enable(struct ixgbe_adapter *adapter,
struct xdp_umem *umem,
u16 qid)
{
struct net_device *netdev = adapter->netdev;
- struct xdp_umem_fq_reuse *reuseq;
bool if_running;
int err;
qid >= netdev->real_num_tx_queues)
return -EINVAL;
- reuseq = xsk_reuseq_prepare(adapter->rx_ring[0]->count);
- if (!reuseq)
- return -ENOMEM;
-
- xsk_reuseq_free(xsk_reuseq_swap(umem, reuseq));
-
- err = ixgbe_xsk_umem_dma_map(adapter, umem);
+ err = xsk_buff_dma_map(umem, &adapter->pdev->dev, IXGBE_RX_DMA_ATTR);
if (err)
return err;
ixgbe_txrx_ring_disable(adapter, qid);
clear_bit(qid, adapter->af_xdp_zc_qps);
- ixgbe_xsk_umem_dma_unmap(adapter, umem);
+ xsk_buff_dma_unmap(umem, IXGBE_RX_DMA_ATTR);
if (if_running)
ixgbe_txrx_ring_enable(adapter, qid);
struct ixgbe_ring *rx_ring,
struct xdp_buff *xdp)
{
- struct xdp_umem *umem = rx_ring->xsk_umem;
int err, result = IXGBE_XDP_PASS;
struct bpf_prog *xdp_prog;
struct xdp_frame *xdpf;
- u64 offset;
u32 act;
rcu_read_lock();
xdp_prog = READ_ONCE(rx_ring->xdp_prog);
act = bpf_prog_run_xdp(xdp_prog, xdp);
- offset = xdp->data - xdp->data_hard_start;
-
- xdp->handle = xsk_umem_adjust_offset(umem, xdp->handle, offset);
switch (act) {
case XDP_PASS:
return result;
}
-static struct
-ixgbe_rx_buffer *ixgbe_get_rx_buffer_zc(struct ixgbe_ring *rx_ring,
- unsigned int size)
-{
- struct ixgbe_rx_buffer *bi;
-
- bi = &rx_ring->rx_buffer_info[rx_ring->next_to_clean];
-
- /* we are reusing so sync this buffer for CPU use */
- dma_sync_single_range_for_cpu(rx_ring->dev,
- bi->dma, 0,
- size,
- DMA_BIDIRECTIONAL);
-
- return bi;
-}
-
-static void ixgbe_reuse_rx_buffer_zc(struct ixgbe_ring *rx_ring,
- struct ixgbe_rx_buffer *obi)
-{
- u16 nta = rx_ring->next_to_alloc;
- struct ixgbe_rx_buffer *nbi;
-
- nbi = &rx_ring->rx_buffer_info[rx_ring->next_to_alloc];
- /* update, and store next to alloc */
- nta++;
- rx_ring->next_to_alloc = (nta < rx_ring->count) ? nta : 0;
-
- /* transfer page from old buffer to new buffer */
- nbi->dma = obi->dma;
- nbi->addr = obi->addr;
- nbi->handle = obi->handle;
-
- obi->addr = NULL;
- obi->skb = NULL;
-}
-
-void ixgbe_zca_free(struct zero_copy_allocator *alloc, unsigned long handle)
-{
- struct ixgbe_rx_buffer *bi;
- struct ixgbe_ring *rx_ring;
- u64 hr, mask;
- u16 nta;
-
- rx_ring = container_of(alloc, struct ixgbe_ring, zca);
- hr = rx_ring->xsk_umem->headroom + XDP_PACKET_HEADROOM;
- mask = rx_ring->xsk_umem->chunk_mask;
-
- nta = rx_ring->next_to_alloc;
- bi = rx_ring->rx_buffer_info;
-
- nta++;
- rx_ring->next_to_alloc = (nta < rx_ring->count) ? nta : 0;
-
- handle &= mask;
-
- bi->dma = xdp_umem_get_dma(rx_ring->xsk_umem, handle);
- bi->dma += hr;
-
- bi->addr = xdp_umem_get_data(rx_ring->xsk_umem, handle);
- bi->addr += hr;
-
- bi->handle = xsk_umem_adjust_offset(rx_ring->xsk_umem, (u64)handle,
- rx_ring->xsk_umem->headroom);
-}
-
-static bool ixgbe_alloc_buffer_zc(struct ixgbe_ring *rx_ring,
- struct ixgbe_rx_buffer *bi)
-{
- struct xdp_umem *umem = rx_ring->xsk_umem;
- void *addr = bi->addr;
- u64 handle, hr;
-
- if (addr)
- return true;
-
- if (!xsk_umem_peek_addr(umem, &handle)) {
- rx_ring->rx_stats.alloc_rx_page_failed++;
- return false;
- }
-
- hr = umem->headroom + XDP_PACKET_HEADROOM;
-
- bi->dma = xdp_umem_get_dma(umem, handle);
- bi->dma += hr;
-
- bi->addr = xdp_umem_get_data(umem, handle);
- bi->addr += hr;
-
- bi->handle = xsk_umem_adjust_offset(umem, handle, umem->headroom);
-
- xsk_umem_release_addr(umem);
- return true;
-}
-
-static bool ixgbe_alloc_buffer_slow_zc(struct ixgbe_ring *rx_ring,
- struct ixgbe_rx_buffer *bi)
-{
- struct xdp_umem *umem = rx_ring->xsk_umem;
- u64 handle, hr;
-
- if (!xsk_umem_peek_addr_rq(umem, &handle)) {
- rx_ring->rx_stats.alloc_rx_page_failed++;
- return false;
- }
-
- handle &= rx_ring->xsk_umem->chunk_mask;
-
- hr = umem->headroom + XDP_PACKET_HEADROOM;
-
- bi->dma = xdp_umem_get_dma(umem, handle);
- bi->dma += hr;
-
- bi->addr = xdp_umem_get_data(umem, handle);
- bi->addr += hr;
-
- bi->handle = xsk_umem_adjust_offset(umem, handle, umem->headroom);
-
- xsk_umem_release_addr_rq(umem);
- return true;
-}
-
-static __always_inline bool
-__ixgbe_alloc_rx_buffers_zc(struct ixgbe_ring *rx_ring, u16 cleaned_count,
- bool alloc(struct ixgbe_ring *rx_ring,
- struct ixgbe_rx_buffer *bi))
+bool ixgbe_alloc_rx_buffers_zc(struct ixgbe_ring *rx_ring, u16 count)
{
union ixgbe_adv_rx_desc *rx_desc;
struct ixgbe_rx_buffer *bi;
u16 i = rx_ring->next_to_use;
+ dma_addr_t dma;
bool ok = true;
/* nothing to do */
- if (!cleaned_count)
+ if (!count)
return true;
rx_desc = IXGBE_RX_DESC(rx_ring, i);
i -= rx_ring->count;
do {
- if (!alloc(rx_ring, bi)) {
+ bi->xdp = xsk_buff_alloc(rx_ring->xsk_umem);
+ if (!bi->xdp) {
ok = false;
break;
}
- /* sync the buffer for use by the device */
- dma_sync_single_range_for_device(rx_ring->dev, bi->dma,
- bi->page_offset,
- rx_ring->rx_buf_len,
- DMA_BIDIRECTIONAL);
+ dma = xsk_buff_xdp_get_dma(bi->xdp);
/* Refresh the desc even if buffer_addrs didn't change
* because each write-back erases this info.
*/
- rx_desc->read.pkt_addr = cpu_to_le64(bi->dma);
+ rx_desc->read.pkt_addr = cpu_to_le64(dma);
rx_desc++;
bi++;
/* clear the length for the next_to_use descriptor */
rx_desc->wb.upper.length = 0;
- cleaned_count--;
- } while (cleaned_count);
+ count--;
+ } while (count);
i += rx_ring->count;
if (rx_ring->next_to_use != i) {
rx_ring->next_to_use = i;
- /* update next to alloc since we have filled the ring */
- rx_ring->next_to_alloc = i;
-
/* Force memory writes to complete before letting h/w
* know there are new descriptors to fetch. (Only
* applicable for weak-ordered memory model archs,
return ok;
}
-void ixgbe_alloc_rx_buffers_zc(struct ixgbe_ring *rx_ring, u16 count)
-{
- __ixgbe_alloc_rx_buffers_zc(rx_ring, count,
- ixgbe_alloc_buffer_slow_zc);
-}
-
-static bool ixgbe_alloc_rx_buffers_fast_zc(struct ixgbe_ring *rx_ring,
- u16 count)
-{
- return __ixgbe_alloc_rx_buffers_zc(rx_ring, count,
- ixgbe_alloc_buffer_zc);
-}
-
static struct sk_buff *ixgbe_construct_skb_zc(struct ixgbe_ring *rx_ring,
- struct ixgbe_rx_buffer *bi,
- struct xdp_buff *xdp)
+ struct ixgbe_rx_buffer *bi)
{
- unsigned int metasize = xdp->data - xdp->data_meta;
- unsigned int datasize = xdp->data_end - xdp->data;
+ unsigned int metasize = bi->xdp->data - bi->xdp->data_meta;
+ unsigned int datasize = bi->xdp->data_end - bi->xdp->data;
struct sk_buff *skb;
/* allocate a skb to store the frags */
skb = __napi_alloc_skb(&rx_ring->q_vector->napi,
- xdp->data_end - xdp->data_hard_start,
+ bi->xdp->data_end - bi->xdp->data_hard_start,
GFP_ATOMIC | __GFP_NOWARN);
if (unlikely(!skb))
return NULL;
- skb_reserve(skb, xdp->data - xdp->data_hard_start);
- memcpy(__skb_put(skb, datasize), xdp->data, datasize);
+ skb_reserve(skb, bi->xdp->data - bi->xdp->data_hard_start);
+ memcpy(__skb_put(skb, datasize), bi->xdp->data, datasize);
if (metasize)
skb_metadata_set(skb, metasize);
- ixgbe_reuse_rx_buffer_zc(rx_ring, bi);
+ xsk_buff_free(bi->xdp);
+ bi->xdp = NULL;
return skb;
}
unsigned int total_rx_bytes = 0, total_rx_packets = 0;
struct ixgbe_adapter *adapter = q_vector->adapter;
u16 cleaned_count = ixgbe_desc_unused(rx_ring);
- struct xdp_umem *umem = rx_ring->xsk_umem;
unsigned int xdp_res, xdp_xmit = 0;
bool failure = false;
struct sk_buff *skb;
- struct xdp_buff xdp;
-
- xdp.rxq = &rx_ring->xdp_rxq;
- xdp.frame_sz = xsk_umem_xdp_frame_sz(umem);
while (likely(total_rx_packets < budget)) {
union ixgbe_adv_rx_desc *rx_desc;
/* return some buffers to hardware, one at a time is too slow */
if (cleaned_count >= IXGBE_RX_BUFFER_WRITE) {
failure = failure ||
- !ixgbe_alloc_rx_buffers_fast_zc(rx_ring,
- cleaned_count);
+ !ixgbe_alloc_rx_buffers_zc(rx_ring,
+ cleaned_count);
cleaned_count = 0;
}
*/
dma_rmb();
- bi = ixgbe_get_rx_buffer_zc(rx_ring, size);
+ bi = &rx_ring->rx_buffer_info[rx_ring->next_to_clean];
if (unlikely(!ixgbe_test_staterr(rx_desc,
IXGBE_RXD_STAT_EOP))) {
struct ixgbe_rx_buffer *next_bi;
- ixgbe_reuse_rx_buffer_zc(rx_ring, bi);
+ xsk_buff_free(bi->xdp);
+ bi->xdp = NULL;
ixgbe_inc_ntc(rx_ring);
next_bi =
&rx_ring->rx_buffer_info[rx_ring->next_to_clean];
- next_bi->skb = ERR_PTR(-EINVAL);
+ next_bi->discard = true;
continue;
}
- if (unlikely(bi->skb)) {
- ixgbe_reuse_rx_buffer_zc(rx_ring, bi);
+ if (unlikely(bi->discard)) {
+ xsk_buff_free(bi->xdp);
+ bi->xdp = NULL;
+ bi->discard = false;
ixgbe_inc_ntc(rx_ring);
continue;
}
- xdp.data = bi->addr;
- xdp.data_meta = xdp.data;
- xdp.data_hard_start = xdp.data - XDP_PACKET_HEADROOM;
- xdp.data_end = xdp.data + size;
- xdp.handle = bi->handle;
-
- xdp_res = ixgbe_run_xdp_zc(adapter, rx_ring, &xdp);
+ bi->xdp->data_end = bi->xdp->data + size;
+ xsk_buff_dma_sync_for_cpu(bi->xdp);
+ xdp_res = ixgbe_run_xdp_zc(adapter, rx_ring, bi->xdp);
if (xdp_res) {
- if (xdp_res & (IXGBE_XDP_TX | IXGBE_XDP_REDIR)) {
+ if (xdp_res & (IXGBE_XDP_TX | IXGBE_XDP_REDIR))
xdp_xmit |= xdp_res;
- bi->addr = NULL;
- bi->skb = NULL;
- } else {
- ixgbe_reuse_rx_buffer_zc(rx_ring, bi);
- }
+ else
+ xsk_buff_free(bi->xdp);
+
+ bi->xdp = NULL;
total_rx_packets++;
total_rx_bytes += size;
}
/* XDP_PASS path */
- skb = ixgbe_construct_skb_zc(rx_ring, bi, &xdp);
+ skb = ixgbe_construct_skb_zc(rx_ring, bi);
if (!skb) {
rx_ring->rx_stats.alloc_rx_buff_failed++;
break;
void ixgbe_xsk_clean_rx_ring(struct ixgbe_ring *rx_ring)
{
- u16 i = rx_ring->next_to_clean;
- struct ixgbe_rx_buffer *bi = &rx_ring->rx_buffer_info[i];
+ struct ixgbe_rx_buffer *bi;
+ u16 i;
- while (i != rx_ring->next_to_alloc) {
- xsk_umem_fq_reuse(rx_ring->xsk_umem, bi->handle);
- i++;
- bi++;
- if (i == rx_ring->count) {
- i = 0;
- bi = rx_ring->rx_buffer_info;
- }
+ for (i = 0; i < rx_ring->count; i++) {
+ bi = &rx_ring->rx_buffer_info[i];
+
+ if (!bi->xdp)
+ continue;
+
+ xsk_buff_free(bi->xdp);
+ bi->xdp = NULL;
}
}
if (!xsk_umem_consume_tx(xdp_ring->xsk_umem, &desc))
break;
- dma = xdp_umem_get_dma(xdp_ring->xsk_umem, desc.addr);
-
- dma_sync_single_for_device(xdp_ring->dev, dma, desc.len,
- DMA_BIDIRECTIONAL);
+ dma = xsk_buff_raw_get_dma(xdp_ring->xsk_umem, desc.addr);
+ xsk_buff_raw_dma_sync_for_device(xdp_ring->xsk_umem, dma,
+ desc.len);
tx_bi = &xdp_ring->tx_buffer_info[xdp_ring->next_to_use];
tx_bi->bytecount = desc.len;
MVNETA_CAUSE_LINK_CHANGE);
phylink_start(pp->phylink);
-
- /* We may have called phy_speed_down before */
- phy_speed_up(pp->dev->phydev);
-
netif_tx_start_all_queues(pp->dev);
}
{
unsigned int cpu;
- if (device_may_wakeup(&pp->dev->dev))
- phy_speed_down(pp->dev->phydev, false);
-
phylink_stop(pp->phylink);
if (!pp->neta_armada3700) {
phylink_ethtool_get_wol(pp->phylink, &wol);
device_set_wakeup_capable(&pp->dev->dev, !!wol.supported);
- /* PHY WoL may be enabled but device wakeup disabled */
- if (wol.supported)
- device_set_wakeup_enable(&pp->dev->dev, !!wol.wolopts);
-
return err;
}
# SPDX-License-Identifier: GPL-2.0-only
config NET_VENDOR_MEDIATEK
- bool "MediaTek ethernet driver"
+ bool "MediaTek devices"
depends on ARCH_MEDIATEK || SOC_MT7621 || SOC_MT7620
---help---
If you have a Mediatek SoC with ethernet, say Y.
This driver supports the gigabit ethernet MACs in the
MediaTek SoC family.
+config NET_MEDIATEK_STAR_EMAC
+ tristate "MediaTek STAR Ethernet MAC support"
+ select PHYLIB
+ help
+ This driver supports the ethernet MAC IP first used on
+ MediaTek MT85** SoCs.
+
endif #NET_VENDOR_MEDIATEK
# Makefile for the Mediatek SoCs built-in ethernet macs
#
-obj-$(CONFIG_NET_MEDIATEK_SOC) += mtk_eth.o
+obj-$(CONFIG_NET_MEDIATEK_SOC) += mtk_eth.o
mtk_eth-y := mtk_eth_soc.o mtk_sgmii.o mtk_eth_path.o
+obj-$(CONFIG_NET_MEDIATEK_STAR_EMAC) += mtk_star_emac.o
--- /dev/null
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2020 MediaTek Corporation
+ * Copyright (c) 2020 BayLibre SAS
+ *
+ * Author: Bartosz Golaszewski <bgolaszewski@baylibre.com>
+ */
+
+#include <linux/bits.h>
+#include <linux/clk.h>
+#include <linux/compiler.h>
+#include <linux/dma-mapping.h>
+#include <linux/etherdevice.h>
+#include <linux/kernel.h>
+#include <linux/mfd/syscon.h>
+#include <linux/mii.h>
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/of.h>
+#include <linux/of_mdio.h>
+#include <linux/of_net.h>
+#include <linux/platform_device.h>
+#include <linux/pm.h>
+#include <linux/regmap.h>
+#include <linux/skbuff.h>
+#include <linux/spinlock.h>
+#include <linux/workqueue.h>
+
+#define MTK_STAR_DRVNAME "mtk_star_emac"
+
+#define MTK_STAR_WAIT_TIMEOUT 300
+#define MTK_STAR_MAX_FRAME_SIZE 1514
+#define MTK_STAR_SKB_ALIGNMENT 16
+#define MTK_STAR_NAPI_WEIGHT 64
+#define MTK_STAR_HASHTABLE_MC_LIMIT 256
+#define MTK_STAR_HASHTABLE_SIZE_MAX 512
+
+/* Normally we'd use NET_IP_ALIGN but on arm64 its value is 0 and it doesn't
+ * work for this controller.
+ */
+#define MTK_STAR_IP_ALIGN 2
+
+static const char *const mtk_star_clk_names[] = { "core", "reg", "trans" };
+#define MTK_STAR_NCLKS ARRAY_SIZE(mtk_star_clk_names)
+
+/* PHY Control Register 0 */
+#define MTK_STAR_REG_PHY_CTRL0 0x0000
+#define MTK_STAR_BIT_PHY_CTRL0_WTCMD BIT(13)
+#define MTK_STAR_BIT_PHY_CTRL0_RDCMD BIT(14)
+#define MTK_STAR_BIT_PHY_CTRL0_RWOK BIT(15)
+#define MTK_STAR_MSK_PHY_CTRL0_PREG GENMASK(12, 8)
+#define MTK_STAR_OFF_PHY_CTRL0_PREG 8
+#define MTK_STAR_MSK_PHY_CTRL0_RWDATA GENMASK(31, 16)
+#define MTK_STAR_OFF_PHY_CTRL0_RWDATA 16
+
+/* PHY Control Register 1 */
+#define MTK_STAR_REG_PHY_CTRL1 0x0004
+#define MTK_STAR_BIT_PHY_CTRL1_LINK_ST BIT(0)
+#define MTK_STAR_BIT_PHY_CTRL1_AN_EN BIT(8)
+#define MTK_STAR_OFF_PHY_CTRL1_FORCE_SPD 9
+#define MTK_STAR_VAL_PHY_CTRL1_FORCE_SPD_10M 0x00
+#define MTK_STAR_VAL_PHY_CTRL1_FORCE_SPD_100M 0x01
+#define MTK_STAR_VAL_PHY_CTRL1_FORCE_SPD_1000M 0x02
+#define MTK_STAR_BIT_PHY_CTRL1_FORCE_DPX BIT(11)
+#define MTK_STAR_BIT_PHY_CTRL1_FORCE_FC_RX BIT(12)
+#define MTK_STAR_BIT_PHY_CTRL1_FORCE_FC_TX BIT(13)
+
+/* MAC Configuration Register */
+#define MTK_STAR_REG_MAC_CFG 0x0008
+#define MTK_STAR_OFF_MAC_CFG_IPG 10
+#define MTK_STAR_VAL_MAC_CFG_IPG_96BIT GENMASK(4, 0)
+#define MTK_STAR_BIT_MAC_CFG_MAXLEN_1522 BIT(16)
+#define MTK_STAR_BIT_MAC_CFG_AUTO_PAD BIT(19)
+#define MTK_STAR_BIT_MAC_CFG_CRC_STRIP BIT(20)
+#define MTK_STAR_BIT_MAC_CFG_VLAN_STRIP BIT(22)
+#define MTK_STAR_BIT_MAC_CFG_NIC_PD BIT(31)
+
+/* Flow-Control Configuration Register */
+#define MTK_STAR_REG_FC_CFG 0x000c
+#define MTK_STAR_BIT_FC_CFG_BP_EN BIT(7)
+#define MTK_STAR_BIT_FC_CFG_UC_PAUSE_DIR BIT(8)
+#define MTK_STAR_OFF_FC_CFG_SEND_PAUSE_TH 16
+#define MTK_STAR_MSK_FC_CFG_SEND_PAUSE_TH GENMASK(27, 16)
+#define MTK_STAR_VAL_FC_CFG_SEND_PAUSE_TH_2K 0x800
+
+/* ARL Configuration Register */
+#define MTK_STAR_REG_ARL_CFG 0x0010
+#define MTK_STAR_BIT_ARL_CFG_HASH_ALG BIT(0)
+#define MTK_STAR_BIT_ARL_CFG_MISC_MODE BIT(4)
+
+/* MAC High and Low Bytes Registers */
+#define MTK_STAR_REG_MY_MAC_H 0x0014
+#define MTK_STAR_REG_MY_MAC_L 0x0018
+
+/* Hash Table Control Register */
+#define MTK_STAR_REG_HASH_CTRL 0x001c
+#define MTK_STAR_MSK_HASH_CTRL_HASH_BIT_ADDR GENMASK(8, 0)
+#define MTK_STAR_BIT_HASH_CTRL_HASH_BIT_DATA BIT(12)
+#define MTK_STAR_BIT_HASH_CTRL_ACC_CMD BIT(13)
+#define MTK_STAR_BIT_HASH_CTRL_CMD_START BIT(14)
+#define MTK_STAR_BIT_HASH_CTRL_BIST_OK BIT(16)
+#define MTK_STAR_BIT_HASH_CTRL_BIST_DONE BIT(17)
+#define MTK_STAR_BIT_HASH_CTRL_BIST_EN BIT(31)
+
+/* TX DMA Control Register */
+#define MTK_STAR_REG_TX_DMA_CTRL 0x0034
+#define MTK_STAR_BIT_TX_DMA_CTRL_START BIT(0)
+#define MTK_STAR_BIT_TX_DMA_CTRL_STOP BIT(1)
+#define MTK_STAR_BIT_TX_DMA_CTRL_RESUME BIT(2)
+
+/* RX DMA Control Register */
+#define MTK_STAR_REG_RX_DMA_CTRL 0x0038
+#define MTK_STAR_BIT_RX_DMA_CTRL_START BIT(0)
+#define MTK_STAR_BIT_RX_DMA_CTRL_STOP BIT(1)
+#define MTK_STAR_BIT_RX_DMA_CTRL_RESUME BIT(2)
+
+/* DMA Address Registers */
+#define MTK_STAR_REG_TX_DPTR 0x003c
+#define MTK_STAR_REG_RX_DPTR 0x0040
+#define MTK_STAR_REG_TX_BASE_ADDR 0x0044
+#define MTK_STAR_REG_RX_BASE_ADDR 0x0048
+
+/* Interrupt Status Register */
+#define MTK_STAR_REG_INT_STS 0x0050
+#define MTK_STAR_REG_INT_STS_PORT_STS_CHG BIT(2)
+#define MTK_STAR_REG_INT_STS_MIB_CNT_TH BIT(3)
+#define MTK_STAR_BIT_INT_STS_FNRC BIT(6)
+#define MTK_STAR_BIT_INT_STS_TNTC BIT(8)
+
+/* Interrupt Mask Register */
+#define MTK_STAR_REG_INT_MASK 0x0054
+#define MTK_STAR_BIT_INT_MASK_FNRC BIT(6)
+
+/* Misc. Config Register */
+#define MTK_STAR_REG_TEST1 0x005c
+#define MTK_STAR_BIT_TEST1_RST_HASH_MBIST BIT(31)
+
+/* Extended Configuration Register */
+#define MTK_STAR_REG_EXT_CFG 0x0060
+#define MTK_STAR_OFF_EXT_CFG_SND_PAUSE_RLS 16
+#define MTK_STAR_MSK_EXT_CFG_SND_PAUSE_RLS GENMASK(26, 16)
+#define MTK_STAR_VAL_EXT_CFG_SND_PAUSE_RLS_1K 0x400
+
+/* EthSys Configuration Register */
+#define MTK_STAR_REG_SYS_CONF 0x0094
+#define MTK_STAR_BIT_MII_PAD_OUT_ENABLE BIT(0)
+#define MTK_STAR_BIT_EXT_MDC_MODE BIT(1)
+#define MTK_STAR_BIT_SWC_MII_MODE BIT(2)
+
+/* MAC Clock Configuration Register */
+#define MTK_STAR_REG_MAC_CLK_CONF 0x00ac
+#define MTK_STAR_MSK_MAC_CLK_CONF GENMASK(7, 0)
+#define MTK_STAR_BIT_CLK_DIV_10 0x0a
+
+/* Counter registers. */
+#define MTK_STAR_REG_C_RXOKPKT 0x0100
+#define MTK_STAR_REG_C_RXOKBYTE 0x0104
+#define MTK_STAR_REG_C_RXRUNT 0x0108
+#define MTK_STAR_REG_C_RXLONG 0x010c
+#define MTK_STAR_REG_C_RXDROP 0x0110
+#define MTK_STAR_REG_C_RXCRC 0x0114
+#define MTK_STAR_REG_C_RXARLDROP 0x0118
+#define MTK_STAR_REG_C_RXVLANDROP 0x011c
+#define MTK_STAR_REG_C_RXCSERR 0x0120
+#define MTK_STAR_REG_C_RXPAUSE 0x0124
+#define MTK_STAR_REG_C_TXOKPKT 0x0128
+#define MTK_STAR_REG_C_TXOKBYTE 0x012c
+#define MTK_STAR_REG_C_TXPAUSECOL 0x0130
+#define MTK_STAR_REG_C_TXRTY 0x0134
+#define MTK_STAR_REG_C_TXSKIP 0x0138
+#define MTK_STAR_REG_C_TX_ARP 0x013c
+#define MTK_STAR_REG_C_RX_RERR 0x01d8
+#define MTK_STAR_REG_C_RX_UNI 0x01dc
+#define MTK_STAR_REG_C_RX_MULTI 0x01e0
+#define MTK_STAR_REG_C_RX_BROAD 0x01e4
+#define MTK_STAR_REG_C_RX_ALIGNERR 0x01e8
+#define MTK_STAR_REG_C_TX_UNI 0x01ec
+#define MTK_STAR_REG_C_TX_MULTI 0x01f0
+#define MTK_STAR_REG_C_TX_BROAD 0x01f4
+#define MTK_STAR_REG_C_TX_TIMEOUT 0x01f8
+#define MTK_STAR_REG_C_TX_LATECOL 0x01fc
+#define MTK_STAR_REG_C_RX_LENGTHERR 0x0214
+#define MTK_STAR_REG_C_RX_TWIST 0x0218
+
+/* Ethernet CFG Control */
+#define MTK_PERICFG_REG_NIC_CFG_CON 0x03c4
+#define MTK_PERICFG_MSK_NIC_CFG_CON_CFG_MII GENMASK(3, 0)
+#define MTK_PERICFG_BIT_NIC_CFG_CON_RMII BIT(0)
+
+/* Represents the actual structure of descriptors used by the MAC. We can
+ * reuse the same structure for both TX and RX - the layout is the same, only
+ * the flags differ slightly.
+ */
+struct mtk_star_ring_desc {
+ /* Contains both the status flags as well as packet length. */
+ u32 status;
+ u32 data_ptr;
+ u32 vtag;
+ u32 reserved;
+};
+
+#define MTK_STAR_DESC_MSK_LEN GENMASK(15, 0)
+#define MTK_STAR_DESC_BIT_RX_CRCE BIT(24)
+#define MTK_STAR_DESC_BIT_RX_OSIZE BIT(25)
+#define MTK_STAR_DESC_BIT_INT BIT(27)
+#define MTK_STAR_DESC_BIT_LS BIT(28)
+#define MTK_STAR_DESC_BIT_FS BIT(29)
+#define MTK_STAR_DESC_BIT_EOR BIT(30)
+#define MTK_STAR_DESC_BIT_COWN BIT(31)
+
+/* Helper structure for storing data read from/written to descriptors in order
+ * to limit reads from/writes to DMA memory.
+ */
+struct mtk_star_ring_desc_data {
+ unsigned int len;
+ unsigned int flags;
+ dma_addr_t dma_addr;
+ struct sk_buff *skb;
+};
+
+#define MTK_STAR_RING_NUM_DESCS 128
+#define MTK_STAR_NUM_TX_DESCS MTK_STAR_RING_NUM_DESCS
+#define MTK_STAR_NUM_RX_DESCS MTK_STAR_RING_NUM_DESCS
+#define MTK_STAR_NUM_DESCS_TOTAL (MTK_STAR_RING_NUM_DESCS * 2)
+#define MTK_STAR_DMA_SIZE \
+ (MTK_STAR_NUM_DESCS_TOTAL * sizeof(struct mtk_star_ring_desc))
+
+struct mtk_star_ring {
+ struct mtk_star_ring_desc *descs;
+ struct sk_buff *skbs[MTK_STAR_RING_NUM_DESCS];
+ dma_addr_t dma_addrs[MTK_STAR_RING_NUM_DESCS];
+ unsigned int head;
+ unsigned int tail;
+};
+
+struct mtk_star_priv {
+ struct net_device *ndev;
+
+ struct regmap *regs;
+ struct regmap *pericfg;
+
+ struct clk_bulk_data clks[MTK_STAR_NCLKS];
+
+ void *ring_base;
+ struct mtk_star_ring_desc *descs_base;
+ dma_addr_t dma_addr;
+ struct mtk_star_ring tx_ring;
+ struct mtk_star_ring rx_ring;
+
+ struct mii_bus *mii;
+ struct napi_struct napi;
+
+ struct device_node *phy_node;
+ phy_interface_t phy_intf;
+ struct phy_device *phydev;
+ unsigned int link;
+ int speed;
+ int duplex;
+ int pause;
+
+ /* Protects against concurrent descriptor access. */
+ spinlock_t lock;
+
+ struct rtnl_link_stats64 stats;
+ struct work_struct stats_work;
+};
+
+static struct device *mtk_star_get_dev(struct mtk_star_priv *priv)
+{
+ return priv->ndev->dev.parent;
+}
+
+static const struct regmap_config mtk_star_regmap_config = {
+ .reg_bits = 32,
+ .val_bits = 32,
+ .reg_stride = 4,
+ .disable_locking = true,
+};
+
+static void mtk_star_ring_init(struct mtk_star_ring *ring,
+ struct mtk_star_ring_desc *descs)
+{
+ memset(ring, 0, sizeof(*ring));
+ ring->descs = descs;
+ ring->head = 0;
+ ring->tail = 0;
+}
+
+static int mtk_star_ring_pop_tail(struct mtk_star_ring *ring,
+ struct mtk_star_ring_desc_data *desc_data)
+{
+ struct mtk_star_ring_desc *desc = &ring->descs[ring->tail];
+ unsigned int status;
+
+ status = READ_ONCE(desc->status);
+ dma_rmb(); /* Make sure we read the status bits before checking it. */
+
+ if (!(status & MTK_STAR_DESC_BIT_COWN))
+ return -1;
+
+ desc_data->len = status & MTK_STAR_DESC_MSK_LEN;
+ desc_data->flags = status & ~MTK_STAR_DESC_MSK_LEN;
+ desc_data->dma_addr = ring->dma_addrs[ring->tail];
+ desc_data->skb = ring->skbs[ring->tail];
+
+ ring->dma_addrs[ring->tail] = 0;
+ ring->skbs[ring->tail] = NULL;
+
+ status &= MTK_STAR_DESC_BIT_COWN | MTK_STAR_DESC_BIT_EOR;
+
+ WRITE_ONCE(desc->data_ptr, 0);
+ WRITE_ONCE(desc->status, status);
+
+ ring->tail = (ring->tail + 1) % MTK_STAR_RING_NUM_DESCS;
+
+ return 0;
+}
+
+static void mtk_star_ring_push_head(struct mtk_star_ring *ring,
+ struct mtk_star_ring_desc_data *desc_data,
+ unsigned int flags)
+{
+ struct mtk_star_ring_desc *desc = &ring->descs[ring->head];
+ unsigned int status;
+
+ status = READ_ONCE(desc->status);
+
+ ring->skbs[ring->head] = desc_data->skb;
+ ring->dma_addrs[ring->head] = desc_data->dma_addr;
+
+ status |= desc_data->len;
+ if (flags)
+ status |= flags;
+
+ WRITE_ONCE(desc->data_ptr, desc_data->dma_addr);
+ WRITE_ONCE(desc->status, status);
+ status &= ~MTK_STAR_DESC_BIT_COWN;
+ /* Flush previous modifications before ownership change. */
+ dma_wmb();
+ WRITE_ONCE(desc->status, status);
+
+ ring->head = (ring->head + 1) % MTK_STAR_RING_NUM_DESCS;
+}
+
+static void
+mtk_star_ring_push_head_rx(struct mtk_star_ring *ring,
+ struct mtk_star_ring_desc_data *desc_data)
+{
+ mtk_star_ring_push_head(ring, desc_data, 0);
+}
+
+static void
+mtk_star_ring_push_head_tx(struct mtk_star_ring *ring,
+ struct mtk_star_ring_desc_data *desc_data)
+{
+ static const unsigned int flags = MTK_STAR_DESC_BIT_FS |
+ MTK_STAR_DESC_BIT_LS |
+ MTK_STAR_DESC_BIT_INT;
+
+ mtk_star_ring_push_head(ring, desc_data, flags);
+}
+
+static unsigned int mtk_star_ring_num_used_descs(struct mtk_star_ring *ring)
+{
+ return abs(ring->head - ring->tail);
+}
+
+static bool mtk_star_ring_full(struct mtk_star_ring *ring)
+{
+ return mtk_star_ring_num_used_descs(ring) == MTK_STAR_RING_NUM_DESCS;
+}
+
+static bool mtk_star_ring_descs_available(struct mtk_star_ring *ring)
+{
+ return mtk_star_ring_num_used_descs(ring) > 0;
+}
+
+static dma_addr_t mtk_star_dma_map_rx(struct mtk_star_priv *priv,
+ struct sk_buff *skb)
+{
+ struct device *dev = mtk_star_get_dev(priv);
+
+ /* Data pointer for the RX DMA descriptor must be aligned to 4N + 2. */
+ return dma_map_single(dev, skb_tail_pointer(skb) - 2,
+ skb_tailroom(skb), DMA_FROM_DEVICE);
+}
+
+static void mtk_star_dma_unmap_rx(struct mtk_star_priv *priv,
+ struct mtk_star_ring_desc_data *desc_data)
+{
+ struct device *dev = mtk_star_get_dev(priv);
+
+ dma_unmap_single(dev, desc_data->dma_addr,
+ skb_tailroom(desc_data->skb), DMA_FROM_DEVICE);
+}
+
+static dma_addr_t mtk_star_dma_map_tx(struct mtk_star_priv *priv,
+ struct sk_buff *skb)
+{
+ struct device *dev = mtk_star_get_dev(priv);
+
+ return dma_map_single(dev, skb->data, skb_headlen(skb), DMA_TO_DEVICE);
+}
+
+static void mtk_star_dma_unmap_tx(struct mtk_star_priv *priv,
+ struct mtk_star_ring_desc_data *desc_data)
+{
+ struct device *dev = mtk_star_get_dev(priv);
+
+ return dma_unmap_single(dev, desc_data->dma_addr,
+ skb_headlen(desc_data->skb), DMA_TO_DEVICE);
+}
+
+static void mtk_star_nic_disable_pd(struct mtk_star_priv *priv)
+{
+ regmap_update_bits(priv->regs, MTK_STAR_REG_MAC_CFG,
+ MTK_STAR_BIT_MAC_CFG_NIC_PD, 0);
+}
+
+/* Unmask the three interrupts we care about, mask all others. */
+static void mtk_star_intr_enable(struct mtk_star_priv *priv)
+{
+ unsigned int val = MTK_STAR_BIT_INT_STS_TNTC |
+ MTK_STAR_BIT_INT_STS_FNRC |
+ MTK_STAR_REG_INT_STS_MIB_CNT_TH;
+
+ regmap_write(priv->regs, MTK_STAR_REG_INT_MASK, ~val);
+}
+
+static void mtk_star_intr_disable(struct mtk_star_priv *priv)
+{
+ regmap_write(priv->regs, MTK_STAR_REG_INT_MASK, ~0);
+}
+
+static void mtk_star_intr_enable_tx(struct mtk_star_priv *priv)
+{
+ regmap_update_bits(priv->regs, MTK_STAR_REG_INT_MASK,
+ MTK_STAR_BIT_INT_STS_TNTC, 0);
+}
+
+static void mtk_star_intr_enable_rx(struct mtk_star_priv *priv)
+{
+ regmap_update_bits(priv->regs, MTK_STAR_REG_INT_MASK,
+ MTK_STAR_BIT_INT_STS_FNRC, 0);
+}
+
+static void mtk_star_intr_enable_stats(struct mtk_star_priv *priv)
+{
+ regmap_update_bits(priv->regs, MTK_STAR_REG_INT_MASK,
+ MTK_STAR_REG_INT_STS_MIB_CNT_TH, 0);
+}
+
+static void mtk_star_intr_disable_tx(struct mtk_star_priv *priv)
+{
+ regmap_update_bits(priv->regs, MTK_STAR_REG_INT_MASK,
+ MTK_STAR_BIT_INT_STS_TNTC,
+ MTK_STAR_BIT_INT_STS_TNTC);
+}
+
+static void mtk_star_intr_disable_rx(struct mtk_star_priv *priv)
+{
+ regmap_update_bits(priv->regs, MTK_STAR_REG_INT_MASK,
+ MTK_STAR_BIT_INT_STS_FNRC,
+ MTK_STAR_BIT_INT_STS_FNRC);
+}
+
+static void mtk_star_intr_disable_stats(struct mtk_star_priv *priv)
+{
+ regmap_update_bits(priv->regs, MTK_STAR_REG_INT_MASK,
+ MTK_STAR_REG_INT_STS_MIB_CNT_TH,
+ MTK_STAR_REG_INT_STS_MIB_CNT_TH);
+}
+
+static unsigned int mtk_star_intr_read(struct mtk_star_priv *priv)
+{
+ unsigned int val;
+
+ regmap_read(priv->regs, MTK_STAR_REG_INT_STS, &val);
+
+ return val;
+}
+
+static unsigned int mtk_star_intr_ack_all(struct mtk_star_priv *priv)
+{
+ unsigned int val;
+
+ val = mtk_star_intr_read(priv);
+ regmap_write(priv->regs, MTK_STAR_REG_INT_STS, val);
+
+ return val;
+}
+
+static void mtk_star_dma_init(struct mtk_star_priv *priv)
+{
+ struct mtk_star_ring_desc *desc;
+ unsigned int val;
+ int i;
+
+ priv->descs_base = (struct mtk_star_ring_desc *)priv->ring_base;
+
+ for (i = 0; i < MTK_STAR_NUM_DESCS_TOTAL; i++) {
+ desc = &priv->descs_base[i];
+
+ memset(desc, 0, sizeof(*desc));
+ desc->status = MTK_STAR_DESC_BIT_COWN;
+ if ((i == MTK_STAR_NUM_TX_DESCS - 1) ||
+ (i == MTK_STAR_NUM_DESCS_TOTAL - 1))
+ desc->status |= MTK_STAR_DESC_BIT_EOR;
+ }
+
+ mtk_star_ring_init(&priv->tx_ring, priv->descs_base);
+ mtk_star_ring_init(&priv->rx_ring,
+ priv->descs_base + MTK_STAR_NUM_TX_DESCS);
+
+ /* Set DMA pointers. */
+ val = (unsigned int)priv->dma_addr;
+ regmap_write(priv->regs, MTK_STAR_REG_TX_BASE_ADDR, val);
+ regmap_write(priv->regs, MTK_STAR_REG_TX_DPTR, val);
+
+ val += sizeof(struct mtk_star_ring_desc) * MTK_STAR_NUM_TX_DESCS;
+ regmap_write(priv->regs, MTK_STAR_REG_RX_BASE_ADDR, val);
+ regmap_write(priv->regs, MTK_STAR_REG_RX_DPTR, val);
+}
+
+static void mtk_star_dma_start(struct mtk_star_priv *priv)
+{
+ regmap_update_bits(priv->regs, MTK_STAR_REG_TX_DMA_CTRL,
+ MTK_STAR_BIT_TX_DMA_CTRL_START,
+ MTK_STAR_BIT_TX_DMA_CTRL_START);
+ regmap_update_bits(priv->regs, MTK_STAR_REG_RX_DMA_CTRL,
+ MTK_STAR_BIT_RX_DMA_CTRL_START,
+ MTK_STAR_BIT_RX_DMA_CTRL_START);
+}
+
+static void mtk_star_dma_stop(struct mtk_star_priv *priv)
+{
+ regmap_write(priv->regs, MTK_STAR_REG_TX_DMA_CTRL,
+ MTK_STAR_BIT_TX_DMA_CTRL_STOP);
+ regmap_write(priv->regs, MTK_STAR_REG_RX_DMA_CTRL,
+ MTK_STAR_BIT_RX_DMA_CTRL_STOP);
+}
+
+static void mtk_star_dma_disable(struct mtk_star_priv *priv)
+{
+ int i;
+
+ mtk_star_dma_stop(priv);
+
+ /* Take back all descriptors. */
+ for (i = 0; i < MTK_STAR_NUM_DESCS_TOTAL; i++)
+ priv->descs_base[i].status |= MTK_STAR_DESC_BIT_COWN;
+}
+
+static void mtk_star_dma_resume_rx(struct mtk_star_priv *priv)
+{
+ regmap_update_bits(priv->regs, MTK_STAR_REG_RX_DMA_CTRL,
+ MTK_STAR_BIT_RX_DMA_CTRL_RESUME,
+ MTK_STAR_BIT_RX_DMA_CTRL_RESUME);
+}
+
+static void mtk_star_dma_resume_tx(struct mtk_star_priv *priv)
+{
+ regmap_update_bits(priv->regs, MTK_STAR_REG_TX_DMA_CTRL,
+ MTK_STAR_BIT_TX_DMA_CTRL_RESUME,
+ MTK_STAR_BIT_TX_DMA_CTRL_RESUME);
+}
+
+static void mtk_star_set_mac_addr(struct net_device *ndev)
+{
+ struct mtk_star_priv *priv = netdev_priv(ndev);
+ u8 *mac_addr = ndev->dev_addr;
+ unsigned int high, low;
+
+ high = mac_addr[0] << 8 | mac_addr[1] << 0;
+ low = mac_addr[2] << 24 | mac_addr[3] << 16 |
+ mac_addr[4] << 8 | mac_addr[5];
+
+ regmap_write(priv->regs, MTK_STAR_REG_MY_MAC_H, high);
+ regmap_write(priv->regs, MTK_STAR_REG_MY_MAC_L, low);
+}
+
+static void mtk_star_reset_counters(struct mtk_star_priv *priv)
+{
+ static const unsigned int counter_regs[] = {
+ MTK_STAR_REG_C_RXOKPKT,
+ MTK_STAR_REG_C_RXOKBYTE,
+ MTK_STAR_REG_C_RXRUNT,
+ MTK_STAR_REG_C_RXLONG,
+ MTK_STAR_REG_C_RXDROP,
+ MTK_STAR_REG_C_RXCRC,
+ MTK_STAR_REG_C_RXARLDROP,
+ MTK_STAR_REG_C_RXVLANDROP,
+ MTK_STAR_REG_C_RXCSERR,
+ MTK_STAR_REG_C_RXPAUSE,
+ MTK_STAR_REG_C_TXOKPKT,
+ MTK_STAR_REG_C_TXOKBYTE,
+ MTK_STAR_REG_C_TXPAUSECOL,
+ MTK_STAR_REG_C_TXRTY,
+ MTK_STAR_REG_C_TXSKIP,
+ MTK_STAR_REG_C_TX_ARP,
+ MTK_STAR_REG_C_RX_RERR,
+ MTK_STAR_REG_C_RX_UNI,
+ MTK_STAR_REG_C_RX_MULTI,
+ MTK_STAR_REG_C_RX_BROAD,
+ MTK_STAR_REG_C_RX_ALIGNERR,
+ MTK_STAR_REG_C_TX_UNI,
+ MTK_STAR_REG_C_TX_MULTI,
+ MTK_STAR_REG_C_TX_BROAD,
+ MTK_STAR_REG_C_TX_TIMEOUT,
+ MTK_STAR_REG_C_TX_LATECOL,
+ MTK_STAR_REG_C_RX_LENGTHERR,
+ MTK_STAR_REG_C_RX_TWIST,
+ };
+
+ unsigned int i, val;
+
+ for (i = 0; i < ARRAY_SIZE(counter_regs); i++)
+ regmap_read(priv->regs, counter_regs[i], &val);
+}
+
+static void mtk_star_update_stat(struct mtk_star_priv *priv,
+ unsigned int reg, u64 *stat)
+{
+ unsigned int val;
+
+ regmap_read(priv->regs, reg, &val);
+ *stat += val;
+}
+
+/* Try to get as many stats as possible from the internal registers instead
+ * of tracking them ourselves.
+ */
+static void mtk_star_update_stats(struct mtk_star_priv *priv)
+{
+ struct rtnl_link_stats64 *stats = &priv->stats;
+
+ /* OK packets and bytes. */
+ mtk_star_update_stat(priv, MTK_STAR_REG_C_RXOKPKT, &stats->rx_packets);
+ mtk_star_update_stat(priv, MTK_STAR_REG_C_TXOKPKT, &stats->tx_packets);
+ mtk_star_update_stat(priv, MTK_STAR_REG_C_RXOKBYTE, &stats->rx_bytes);
+ mtk_star_update_stat(priv, MTK_STAR_REG_C_TXOKBYTE, &stats->tx_bytes);
+
+ /* RX & TX multicast. */
+ mtk_star_update_stat(priv, MTK_STAR_REG_C_RX_MULTI, &stats->multicast);
+ mtk_star_update_stat(priv, MTK_STAR_REG_C_TX_MULTI, &stats->multicast);
+
+ /* Collisions. */
+ mtk_star_update_stat(priv, MTK_STAR_REG_C_TXPAUSECOL,
+ &stats->collisions);
+ mtk_star_update_stat(priv, MTK_STAR_REG_C_TX_LATECOL,
+ &stats->collisions);
+ mtk_star_update_stat(priv, MTK_STAR_REG_C_RXRUNT, &stats->collisions);
+
+ /* RX Errors. */
+ mtk_star_update_stat(priv, MTK_STAR_REG_C_RX_LENGTHERR,
+ &stats->rx_length_errors);
+ mtk_star_update_stat(priv, MTK_STAR_REG_C_RXLONG,
+ &stats->rx_over_errors);
+ mtk_star_update_stat(priv, MTK_STAR_REG_C_RXCRC, &stats->rx_crc_errors);
+ mtk_star_update_stat(priv, MTK_STAR_REG_C_RX_ALIGNERR,
+ &stats->rx_frame_errors);
+ mtk_star_update_stat(priv, MTK_STAR_REG_C_RXDROP,
+ &stats->rx_fifo_errors);
+ /* Sum of the general RX error counter + all of the above. */
+ mtk_star_update_stat(priv, MTK_STAR_REG_C_RX_RERR, &stats->rx_errors);
+ stats->rx_errors += stats->rx_length_errors;
+ stats->rx_errors += stats->rx_over_errors;
+ stats->rx_errors += stats->rx_crc_errors;
+ stats->rx_errors += stats->rx_frame_errors;
+ stats->rx_errors += stats->rx_fifo_errors;
+}
+
+/* This runs in process context and parallel TX and RX paths executing in
+ * napi context may result in losing some stats data but this should happen
+ * seldom enough to be acceptable.
+ */
+static void mtk_star_update_stats_work(struct work_struct *work)
+{
+ struct mtk_star_priv *priv = container_of(work, struct mtk_star_priv,
+ stats_work);
+
+ mtk_star_update_stats(priv);
+ mtk_star_reset_counters(priv);
+ mtk_star_intr_enable_stats(priv);
+}
+
+static struct sk_buff *mtk_star_alloc_skb(struct net_device *ndev)
+{
+ uintptr_t tail, offset;
+ struct sk_buff *skb;
+
+ skb = dev_alloc_skb(MTK_STAR_MAX_FRAME_SIZE);
+ if (!skb)
+ return NULL;
+
+ /* Align to 16 bytes. */
+ tail = (uintptr_t)skb_tail_pointer(skb);
+ if (tail & (MTK_STAR_SKB_ALIGNMENT - 1)) {
+ offset = tail & (MTK_STAR_SKB_ALIGNMENT - 1);
+ skb_reserve(skb, MTK_STAR_SKB_ALIGNMENT - offset);
+ }
+
+ /* Ensure 16-byte alignment of the skb pointer: eth_type_trans() will
+ * extract the Ethernet header (14 bytes) so we need two more bytes.
+ */
+ skb_reserve(skb, MTK_STAR_IP_ALIGN);
+
+ return skb;
+}
+
+static int mtk_star_prepare_rx_skbs(struct net_device *ndev)
+{
+ struct mtk_star_priv *priv = netdev_priv(ndev);
+ struct mtk_star_ring *ring = &priv->rx_ring;
+ struct device *dev = mtk_star_get_dev(priv);
+ struct mtk_star_ring_desc *desc;
+ struct sk_buff *skb;
+ dma_addr_t dma_addr;
+ int i;
+
+ for (i = 0; i < MTK_STAR_NUM_RX_DESCS; i++) {
+ skb = mtk_star_alloc_skb(ndev);
+ if (!skb)
+ return -ENOMEM;
+
+ dma_addr = mtk_star_dma_map_rx(priv, skb);
+ if (dma_mapping_error(dev, dma_addr)) {
+ dev_kfree_skb(skb);
+ return -ENOMEM;
+ }
+
+ desc = &ring->descs[i];
+ desc->data_ptr = dma_addr;
+ desc->status |= skb_tailroom(skb) & MTK_STAR_DESC_MSK_LEN;
+ desc->status &= ~MTK_STAR_DESC_BIT_COWN;
+ ring->skbs[i] = skb;
+ ring->dma_addrs[i] = dma_addr;
+ }
+
+ return 0;
+}
+
+static void
+mtk_star_ring_free_skbs(struct mtk_star_priv *priv, struct mtk_star_ring *ring,
+ void (*unmap_func)(struct mtk_star_priv *,
+ struct mtk_star_ring_desc_data *))
+{
+ struct mtk_star_ring_desc_data desc_data;
+ struct mtk_star_ring_desc *desc;
+ int i;
+
+ for (i = 0; i < MTK_STAR_RING_NUM_DESCS; i++) {
+ if (!ring->dma_addrs[i])
+ continue;
+
+ desc = &ring->descs[i];
+
+ desc_data.dma_addr = ring->dma_addrs[i];
+ desc_data.skb = ring->skbs[i];
+
+ unmap_func(priv, &desc_data);
+ dev_kfree_skb(desc_data.skb);
+ }
+}
+
+static void mtk_star_free_rx_skbs(struct mtk_star_priv *priv)
+{
+ struct mtk_star_ring *ring = &priv->rx_ring;
+
+ mtk_star_ring_free_skbs(priv, ring, mtk_star_dma_unmap_rx);
+}
+
+static void mtk_star_free_tx_skbs(struct mtk_star_priv *priv)
+{
+ struct mtk_star_ring *ring = &priv->tx_ring;
+
+ mtk_star_ring_free_skbs(priv, ring, mtk_star_dma_unmap_tx);
+}
+
+/* All processing for TX and RX happens in the napi poll callback. */
+static irqreturn_t mtk_star_handle_irq(int irq, void *data)
+{
+ struct mtk_star_priv *priv;
+ struct net_device *ndev;
+ bool need_napi = false;
+ unsigned int status;
+
+ ndev = data;
+ priv = netdev_priv(ndev);
+
+ if (netif_running(ndev)) {
+ status = mtk_star_intr_read(priv);
+
+ if (status & MTK_STAR_BIT_INT_STS_TNTC) {
+ mtk_star_intr_disable_tx(priv);
+ need_napi = true;
+ }
+
+ if (status & MTK_STAR_BIT_INT_STS_FNRC) {
+ mtk_star_intr_disable_rx(priv);
+ need_napi = true;
+ }
+
+ if (need_napi)
+ napi_schedule(&priv->napi);
+
+ /* One of the counters reached 0x8000000 - update stats and
+ * reset all counters.
+ */
+ if (unlikely(status & MTK_STAR_REG_INT_STS_MIB_CNT_TH)) {
+ mtk_star_intr_disable_stats(priv);
+ schedule_work(&priv->stats_work);
+ }
+
+ mtk_star_intr_ack_all(priv);
+ }
+
+ return IRQ_HANDLED;
+}
+
+/* Wait for the completion of any previous command - CMD_START bit must be
+ * cleared by hardware.
+ */
+static int mtk_star_hash_wait_cmd_start(struct mtk_star_priv *priv)
+{
+ unsigned int val;
+
+ return regmap_read_poll_timeout_atomic(priv->regs,
+ MTK_STAR_REG_HASH_CTRL, val,
+ !(val & MTK_STAR_BIT_HASH_CTRL_CMD_START),
+ 10, MTK_STAR_WAIT_TIMEOUT);
+}
+
+static int mtk_star_hash_wait_ok(struct mtk_star_priv *priv)
+{
+ unsigned int val;
+ int ret;
+
+ /* Wait for BIST_DONE bit. */
+ ret = regmap_read_poll_timeout_atomic(priv->regs,
+ MTK_STAR_REG_HASH_CTRL, val,
+ val & MTK_STAR_BIT_HASH_CTRL_BIST_DONE,
+ 10, MTK_STAR_WAIT_TIMEOUT);
+ if (ret)
+ return ret;
+
+ /* Check the BIST_OK bit. */
+ regmap_read(priv->regs, MTK_STAR_REG_HASH_CTRL, &val);
+ if (!(val & MTK_STAR_BIT_HASH_CTRL_BIST_OK))
+ return -EIO;
+
+ return 0;
+}
+
+static int mtk_star_set_hashbit(struct mtk_star_priv *priv,
+ unsigned int hash_addr)
+{
+ unsigned int val;
+ int ret;
+
+ ret = mtk_star_hash_wait_cmd_start(priv);
+ if (ret)
+ return ret;
+
+ val = hash_addr & MTK_STAR_MSK_HASH_CTRL_HASH_BIT_ADDR;
+ val |= MTK_STAR_BIT_HASH_CTRL_ACC_CMD;
+ val |= MTK_STAR_BIT_HASH_CTRL_CMD_START;
+ val |= MTK_STAR_BIT_HASH_CTRL_BIST_EN;
+ val |= MTK_STAR_BIT_HASH_CTRL_HASH_BIT_DATA;
+ regmap_write(priv->regs, MTK_STAR_REG_HASH_CTRL, val);
+
+ return mtk_star_hash_wait_ok(priv);
+}
+
+static int mtk_star_reset_hash_table(struct mtk_star_priv *priv)
+{
+ int ret;
+
+ ret = mtk_star_hash_wait_cmd_start(priv);
+ if (ret)
+ return ret;
+
+ regmap_update_bits(priv->regs, MTK_STAR_REG_HASH_CTRL,
+ MTK_STAR_BIT_HASH_CTRL_BIST_EN,
+ MTK_STAR_BIT_HASH_CTRL_BIST_EN);
+ regmap_update_bits(priv->regs, MTK_STAR_REG_TEST1,
+ MTK_STAR_BIT_TEST1_RST_HASH_MBIST,
+ MTK_STAR_BIT_TEST1_RST_HASH_MBIST);
+
+ return mtk_star_hash_wait_ok(priv);
+}
+
+static void mtk_star_phy_config(struct mtk_star_priv *priv)
+{
+ unsigned int val;
+
+ if (priv->speed == SPEED_1000)
+ val = MTK_STAR_VAL_PHY_CTRL1_FORCE_SPD_1000M;
+ else if (priv->speed == SPEED_100)
+ val = MTK_STAR_VAL_PHY_CTRL1_FORCE_SPD_100M;
+ else
+ val = MTK_STAR_VAL_PHY_CTRL1_FORCE_SPD_10M;
+ val <<= MTK_STAR_OFF_PHY_CTRL1_FORCE_SPD;
+
+ val |= MTK_STAR_BIT_PHY_CTRL1_AN_EN;
+ val |= MTK_STAR_BIT_PHY_CTRL1_FORCE_FC_RX;
+ val |= MTK_STAR_BIT_PHY_CTRL1_FORCE_FC_TX;
+ /* Only full-duplex supported for now. */
+ val |= MTK_STAR_BIT_PHY_CTRL1_FORCE_DPX;
+
+ regmap_write(priv->regs, MTK_STAR_REG_PHY_CTRL1, val);
+
+ if (priv->pause) {
+ val = MTK_STAR_VAL_FC_CFG_SEND_PAUSE_TH_2K;
+ val <<= MTK_STAR_OFF_FC_CFG_SEND_PAUSE_TH;
+ val |= MTK_STAR_BIT_FC_CFG_UC_PAUSE_DIR;
+ } else {
+ val = 0;
+ }
+
+ regmap_update_bits(priv->regs, MTK_STAR_REG_FC_CFG,
+ MTK_STAR_MSK_FC_CFG_SEND_PAUSE_TH |
+ MTK_STAR_BIT_FC_CFG_UC_PAUSE_DIR, val);
+
+ if (priv->pause) {
+ val = MTK_STAR_VAL_EXT_CFG_SND_PAUSE_RLS_1K;
+ val <<= MTK_STAR_OFF_EXT_CFG_SND_PAUSE_RLS;
+ } else {
+ val = 0;
+ }
+
+ regmap_update_bits(priv->regs, MTK_STAR_REG_EXT_CFG,
+ MTK_STAR_MSK_EXT_CFG_SND_PAUSE_RLS, val);
+}
+
+static void mtk_star_adjust_link(struct net_device *ndev)
+{
+ struct mtk_star_priv *priv = netdev_priv(ndev);
+ struct phy_device *phydev = priv->phydev;
+ bool new_state = false;
+
+ if (phydev->link) {
+ if (!priv->link) {
+ priv->link = phydev->link;
+ new_state = true;
+ }
+
+ if (priv->speed != phydev->speed) {
+ priv->speed = phydev->speed;
+ new_state = true;
+ }
+
+ if (priv->pause != phydev->pause) {
+ priv->pause = phydev->pause;
+ new_state = true;
+ }
+ } else {
+ if (priv->link) {
+ priv->link = phydev->link;
+ new_state = true;
+ }
+ }
+
+ if (new_state) {
+ if (phydev->link)
+ mtk_star_phy_config(priv);
+
+ phy_print_status(ndev->phydev);
+ }
+}
+
+static void mtk_star_init_config(struct mtk_star_priv *priv)
+{
+ unsigned int val;
+
+ val = (MTK_STAR_BIT_MII_PAD_OUT_ENABLE |
+ MTK_STAR_BIT_EXT_MDC_MODE |
+ MTK_STAR_BIT_SWC_MII_MODE);
+
+ regmap_write(priv->regs, MTK_STAR_REG_SYS_CONF, val);
+ regmap_update_bits(priv->regs, MTK_STAR_REG_MAC_CLK_CONF,
+ MTK_STAR_MSK_MAC_CLK_CONF,
+ MTK_STAR_BIT_CLK_DIV_10);
+}
+
+static void mtk_star_set_mode_rmii(struct mtk_star_priv *priv)
+{
+ regmap_update_bits(priv->pericfg, MTK_PERICFG_REG_NIC_CFG_CON,
+ MTK_PERICFG_MSK_NIC_CFG_CON_CFG_MII,
+ MTK_PERICFG_BIT_NIC_CFG_CON_RMII);
+}
+
+static int mtk_star_enable(struct net_device *ndev)
+{
+ struct mtk_star_priv *priv = netdev_priv(ndev);
+ unsigned int val;
+ int ret;
+
+ mtk_star_nic_disable_pd(priv);
+ mtk_star_intr_disable(priv);
+ mtk_star_dma_stop(priv);
+
+ mtk_star_set_mac_addr(ndev);
+
+ /* Configure the MAC */
+ val = MTK_STAR_VAL_MAC_CFG_IPG_96BIT;
+ val <<= MTK_STAR_OFF_MAC_CFG_IPG;
+ val |= MTK_STAR_BIT_MAC_CFG_MAXLEN_1522;
+ val |= MTK_STAR_BIT_MAC_CFG_AUTO_PAD;
+ val |= MTK_STAR_BIT_MAC_CFG_CRC_STRIP;
+ regmap_write(priv->regs, MTK_STAR_REG_MAC_CFG, val);
+
+ /* Enable Hash Table BIST and reset it */
+ ret = mtk_star_reset_hash_table(priv);
+ if (ret)
+ return ret;
+
+ /* Setup the hashing algorithm */
+ regmap_update_bits(priv->regs, MTK_STAR_REG_ARL_CFG,
+ MTK_STAR_BIT_ARL_CFG_HASH_ALG |
+ MTK_STAR_BIT_ARL_CFG_MISC_MODE, 0);
+
+ /* Don't strip VLAN tags */
+ regmap_update_bits(priv->regs, MTK_STAR_REG_MAC_CFG,
+ MTK_STAR_BIT_MAC_CFG_VLAN_STRIP, 0);
+
+ /* Setup DMA */
+ mtk_star_dma_init(priv);
+
+ ret = mtk_star_prepare_rx_skbs(ndev);
+ if (ret)
+ goto err_out;
+
+ /* Request the interrupt */
+ ret = request_irq(ndev->irq, mtk_star_handle_irq,
+ IRQF_TRIGGER_FALLING, ndev->name, ndev);
+ if (ret)
+ goto err_free_skbs;
+
+ napi_enable(&priv->napi);
+
+ mtk_star_intr_ack_all(priv);
+ mtk_star_intr_enable(priv);
+
+ /* Connect to and start PHY */
+ priv->phydev = of_phy_connect(ndev, priv->phy_node,
+ mtk_star_adjust_link, 0, priv->phy_intf);
+ if (!priv->phydev) {
+ netdev_err(ndev, "failed to connect to PHY\n");
+ goto err_free_irq;
+ }
+
+ mtk_star_dma_start(priv);
+ phy_start(priv->phydev);
+ netif_start_queue(ndev);
+
+ return 0;
+
+err_free_irq:
+ free_irq(ndev->irq, ndev);
+err_free_skbs:
+ mtk_star_free_rx_skbs(priv);
+err_out:
+ return ret;
+}
+
+static void mtk_star_disable(struct net_device *ndev)
+{
+ struct mtk_star_priv *priv = netdev_priv(ndev);
+
+ netif_stop_queue(ndev);
+ napi_disable(&priv->napi);
+ mtk_star_intr_disable(priv);
+ mtk_star_dma_disable(priv);
+ mtk_star_intr_ack_all(priv);
+ phy_stop(priv->phydev);
+ phy_disconnect(priv->phydev);
+ free_irq(ndev->irq, ndev);
+ mtk_star_free_rx_skbs(priv);
+ mtk_star_free_tx_skbs(priv);
+}
+
+static int mtk_star_netdev_open(struct net_device *ndev)
+{
+ return mtk_star_enable(ndev);
+}
+
+static int mtk_star_netdev_stop(struct net_device *ndev)
+{
+ mtk_star_disable(ndev);
+
+ return 0;
+}
+
+static int mtk_star_netdev_ioctl(struct net_device *ndev,
+ struct ifreq *req, int cmd)
+{
+ if (!netif_running(ndev))
+ return -EINVAL;
+
+ return phy_mii_ioctl(ndev->phydev, req, cmd);
+}
+
+static int mtk_star_netdev_start_xmit(struct sk_buff *skb,
+ struct net_device *ndev)
+{
+ struct mtk_star_priv *priv = netdev_priv(ndev);
+ struct mtk_star_ring *ring = &priv->tx_ring;
+ struct device *dev = mtk_star_get_dev(priv);
+ struct mtk_star_ring_desc_data desc_data;
+
+ desc_data.dma_addr = mtk_star_dma_map_tx(priv, skb);
+ if (dma_mapping_error(dev, desc_data.dma_addr))
+ goto err_drop_packet;
+
+ desc_data.skb = skb;
+ desc_data.len = skb->len;
+
+ spin_lock_bh(&priv->lock);
+
+ mtk_star_ring_push_head_tx(ring, &desc_data);
+
+ netdev_sent_queue(ndev, skb->len);
+
+ if (mtk_star_ring_full(ring))
+ netif_stop_queue(ndev);
+
+ spin_unlock_bh(&priv->lock);
+
+ mtk_star_dma_resume_tx(priv);
+
+ return NETDEV_TX_OK;
+
+err_drop_packet:
+ dev_kfree_skb(skb);
+ ndev->stats.tx_dropped++;
+ return NETDEV_TX_BUSY;
+}
+
+/* Returns the number of bytes sent or a negative number on the first
+ * descriptor owned by DMA.
+ */
+static int mtk_star_tx_complete_one(struct mtk_star_priv *priv)
+{
+ struct mtk_star_ring *ring = &priv->tx_ring;
+ struct mtk_star_ring_desc_data desc_data;
+ int ret;
+
+ ret = mtk_star_ring_pop_tail(ring, &desc_data);
+ if (ret)
+ return ret;
+
+ mtk_star_dma_unmap_tx(priv, &desc_data);
+ ret = desc_data.skb->len;
+ dev_kfree_skb_irq(desc_data.skb);
+
+ return ret;
+}
+
+static void mtk_star_tx_complete_all(struct mtk_star_priv *priv)
+{
+ struct mtk_star_ring *ring = &priv->tx_ring;
+ struct net_device *ndev = priv->ndev;
+ int ret, pkts_compl, bytes_compl;
+ bool wake = false;
+
+ spin_lock(&priv->lock);
+
+ for (pkts_compl = 0, bytes_compl = 0;;
+ pkts_compl++, bytes_compl += ret, wake = true) {
+ if (!mtk_star_ring_descs_available(ring))
+ break;
+
+ ret = mtk_star_tx_complete_one(priv);
+ if (ret < 0)
+ break;
+ }
+
+ netdev_completed_queue(ndev, pkts_compl, bytes_compl);
+
+ if (wake && netif_queue_stopped(ndev))
+ netif_wake_queue(ndev);
+
+ mtk_star_intr_enable_tx(priv);
+
+ spin_unlock(&priv->lock);
+}
+
+static void mtk_star_netdev_get_stats64(struct net_device *ndev,
+ struct rtnl_link_stats64 *stats)
+{
+ struct mtk_star_priv *priv = netdev_priv(ndev);
+
+ mtk_star_update_stats(priv);
+
+ memcpy(stats, &priv->stats, sizeof(*stats));
+}
+
+static void mtk_star_set_rx_mode(struct net_device *ndev)
+{
+ struct mtk_star_priv *priv = netdev_priv(ndev);
+ struct netdev_hw_addr *hw_addr;
+ unsigned int hash_addr, i;
+ int ret;
+
+ if (ndev->flags & IFF_PROMISC) {
+ regmap_update_bits(priv->regs, MTK_STAR_REG_ARL_CFG,
+ MTK_STAR_BIT_ARL_CFG_MISC_MODE,
+ MTK_STAR_BIT_ARL_CFG_MISC_MODE);
+ } else if (netdev_mc_count(ndev) > MTK_STAR_HASHTABLE_MC_LIMIT ||
+ ndev->flags & IFF_ALLMULTI) {
+ for (i = 0; i < MTK_STAR_HASHTABLE_SIZE_MAX; i++) {
+ ret = mtk_star_set_hashbit(priv, i);
+ if (ret)
+ goto hash_fail;
+ }
+ } else {
+ /* Clear previous settings. */
+ ret = mtk_star_reset_hash_table(priv);
+ if (ret)
+ goto hash_fail;
+
+ netdev_for_each_mc_addr(hw_addr, ndev) {
+ hash_addr = (hw_addr->addr[0] & 0x01) << 8;
+ hash_addr += hw_addr->addr[5];
+ ret = mtk_star_set_hashbit(priv, hash_addr);
+ if (ret)
+ goto hash_fail;
+ }
+ }
+
+ return;
+
+hash_fail:
+ if (ret == -ETIMEDOUT)
+ netdev_err(ndev, "setting hash bit timed out\n");
+ else
+ /* Should be -EIO */
+ netdev_err(ndev, "unable to set hash bit");
+}
+
+static const struct net_device_ops mtk_star_netdev_ops = {
+ .ndo_open = mtk_star_netdev_open,
+ .ndo_stop = mtk_star_netdev_stop,
+ .ndo_start_xmit = mtk_star_netdev_start_xmit,
+ .ndo_get_stats64 = mtk_star_netdev_get_stats64,
+ .ndo_set_rx_mode = mtk_star_set_rx_mode,
+ .ndo_do_ioctl = mtk_star_netdev_ioctl,
+ .ndo_set_mac_address = eth_mac_addr,
+ .ndo_validate_addr = eth_validate_addr,
+};
+
+static void mtk_star_get_drvinfo(struct net_device *dev,
+ struct ethtool_drvinfo *info)
+{
+ strlcpy(info->driver, MTK_STAR_DRVNAME, sizeof(info->driver));
+}
+
+/* TODO Add ethtool stats. */
+static const struct ethtool_ops mtk_star_ethtool_ops = {
+ .get_drvinfo = mtk_star_get_drvinfo,
+ .get_link = ethtool_op_get_link,
+ .get_link_ksettings = phy_ethtool_get_link_ksettings,
+ .set_link_ksettings = phy_ethtool_set_link_ksettings,
+};
+
+static int mtk_star_receive_packet(struct mtk_star_priv *priv)
+{
+ struct mtk_star_ring *ring = &priv->rx_ring;
+ struct device *dev = mtk_star_get_dev(priv);
+ struct mtk_star_ring_desc_data desc_data;
+ struct net_device *ndev = priv->ndev;
+ struct sk_buff *curr_skb, *new_skb;
+ dma_addr_t new_dma_addr;
+ int ret;
+
+ spin_lock(&priv->lock);
+ ret = mtk_star_ring_pop_tail(ring, &desc_data);
+ spin_unlock(&priv->lock);
+ if (ret)
+ return -1;
+
+ curr_skb = desc_data.skb;
+
+ if ((desc_data.flags & MTK_STAR_DESC_BIT_RX_CRCE) ||
+ (desc_data.flags & MTK_STAR_DESC_BIT_RX_OSIZE)) {
+ /* Error packet -> drop and reuse skb. */
+ new_skb = curr_skb;
+ goto push_new_skb;
+ }
+
+ /* Prepare new skb before receiving the current one. Reuse the current
+ * skb if we fail at any point.
+ */
+ new_skb = mtk_star_alloc_skb(ndev);
+ if (!new_skb) {
+ ndev->stats.rx_dropped++;
+ new_skb = curr_skb;
+ goto push_new_skb;
+ }
+
+ new_dma_addr = mtk_star_dma_map_rx(priv, new_skb);
+ if (dma_mapping_error(dev, new_dma_addr)) {
+ ndev->stats.rx_dropped++;
+ dev_kfree_skb(new_skb);
+ new_skb = curr_skb;
+ netdev_err(ndev, "DMA mapping error of RX descriptor\n");
+ goto push_new_skb;
+ }
+
+ /* We can't fail anymore at this point: it's safe to unmap the skb. */
+ mtk_star_dma_unmap_rx(priv, &desc_data);
+
+ skb_put(desc_data.skb, desc_data.len);
+ desc_data.skb->ip_summed = CHECKSUM_NONE;
+ desc_data.skb->protocol = eth_type_trans(desc_data.skb, ndev);
+ desc_data.skb->dev = ndev;
+ netif_receive_skb(desc_data.skb);
+
+push_new_skb:
+ desc_data.dma_addr = new_dma_addr;
+ desc_data.len = skb_tailroom(new_skb);
+ desc_data.skb = new_skb;
+
+ spin_lock(&priv->lock);
+ mtk_star_ring_push_head_rx(ring, &desc_data);
+ spin_unlock(&priv->lock);
+
+ return 0;
+}
+
+static int mtk_star_process_rx(struct mtk_star_priv *priv, int budget)
+{
+ int received, ret;
+
+ for (received = 0, ret = 0; received < budget && ret == 0; received++)
+ ret = mtk_star_receive_packet(priv);
+
+ mtk_star_dma_resume_rx(priv);
+
+ return received;
+}
+
+static int mtk_star_poll(struct napi_struct *napi, int budget)
+{
+ struct mtk_star_priv *priv;
+ int received = 0;
+
+ priv = container_of(napi, struct mtk_star_priv, napi);
+
+ /* Clean-up all TX descriptors. */
+ mtk_star_tx_complete_all(priv);
+ /* Receive up to $budget packets. */
+ received = mtk_star_process_rx(priv, budget);
+
+ if (received < budget) {
+ napi_complete_done(napi, received);
+ mtk_star_intr_enable_rx(priv);
+ }
+
+ return received;
+}
+
+static void mtk_star_mdio_rwok_clear(struct mtk_star_priv *priv)
+{
+ regmap_write(priv->regs, MTK_STAR_REG_PHY_CTRL0,
+ MTK_STAR_BIT_PHY_CTRL0_RWOK);
+}
+
+static int mtk_star_mdio_rwok_wait(struct mtk_star_priv *priv)
+{
+ unsigned int val;
+
+ return regmap_read_poll_timeout(priv->regs, MTK_STAR_REG_PHY_CTRL0,
+ val, val & MTK_STAR_BIT_PHY_CTRL0_RWOK,
+ 10, MTK_STAR_WAIT_TIMEOUT);
+}
+
+static int mtk_star_mdio_read(struct mii_bus *mii, int phy_id, int regnum)
+{
+ struct mtk_star_priv *priv = mii->priv;
+ unsigned int val, data;
+ int ret;
+
+ if (regnum & MII_ADDR_C45)
+ return -EOPNOTSUPP;
+
+ mtk_star_mdio_rwok_clear(priv);
+
+ val = (regnum << MTK_STAR_OFF_PHY_CTRL0_PREG);
+ val &= MTK_STAR_MSK_PHY_CTRL0_PREG;
+ val |= MTK_STAR_BIT_PHY_CTRL0_RDCMD;
+
+ regmap_write(priv->regs, MTK_STAR_REG_PHY_CTRL0, val);
+
+ ret = mtk_star_mdio_rwok_wait(priv);
+ if (ret)
+ return ret;
+
+ regmap_read(priv->regs, MTK_STAR_REG_PHY_CTRL0, &data);
+
+ data &= MTK_STAR_MSK_PHY_CTRL0_RWDATA;
+ data >>= MTK_STAR_OFF_PHY_CTRL0_RWDATA;
+
+ return data;
+}
+
+static int mtk_star_mdio_write(struct mii_bus *mii, int phy_id,
+ int regnum, u16 data)
+{
+ struct mtk_star_priv *priv = mii->priv;
+ unsigned int val;
+
+ if (regnum & MII_ADDR_C45)
+ return -EOPNOTSUPP;
+
+ mtk_star_mdio_rwok_clear(priv);
+
+ val = data;
+ val <<= MTK_STAR_OFF_PHY_CTRL0_RWDATA;
+ val &= MTK_STAR_MSK_PHY_CTRL0_RWDATA;
+ regnum <<= MTK_STAR_OFF_PHY_CTRL0_PREG;
+ regnum &= MTK_STAR_MSK_PHY_CTRL0_PREG;
+ val |= regnum;
+ val |= MTK_STAR_BIT_PHY_CTRL0_WTCMD;
+
+ regmap_write(priv->regs, MTK_STAR_REG_PHY_CTRL0, val);
+
+ return mtk_star_mdio_rwok_wait(priv);
+}
+
+static int mtk_star_mdio_init(struct net_device *ndev)
+{
+ struct mtk_star_priv *priv = netdev_priv(ndev);
+ struct device *dev = mtk_star_get_dev(priv);
+ struct device_node *of_node, *mdio_node;
+ int ret;
+
+ of_node = dev->of_node;
+
+ mdio_node = of_get_child_by_name(of_node, "mdio");
+ if (!mdio_node)
+ return -ENODEV;
+
+ if (!of_device_is_available(mdio_node)) {
+ ret = -ENODEV;
+ goto out_put_node;
+ }
+
+ priv->mii = devm_mdiobus_alloc(dev);
+ if (!priv->mii) {
+ ret = -ENOMEM;
+ goto out_put_node;
+ }
+
+ snprintf(priv->mii->id, MII_BUS_ID_SIZE, "%s", dev_name(dev));
+ priv->mii->name = "mtk-mac-mdio";
+ priv->mii->parent = dev;
+ priv->mii->read = mtk_star_mdio_read;
+ priv->mii->write = mtk_star_mdio_write;
+ priv->mii->priv = priv;
+
+ ret = of_mdiobus_register(priv->mii, mdio_node);
+
+out_put_node:
+ of_node_put(mdio_node);
+ return ret;
+}
+
+static int mtk_star_suspend(struct device *dev)
+{
+ struct mtk_star_priv *priv;
+ struct net_device *ndev;
+
+ ndev = dev_get_drvdata(dev);
+ priv = netdev_priv(ndev);
+
+ if (netif_running(ndev))
+ mtk_star_disable(ndev);
+
+ clk_bulk_disable_unprepare(MTK_STAR_NCLKS, priv->clks);
+
+ return 0;
+}
+
+static int mtk_star_resume(struct device *dev)
+{
+ struct mtk_star_priv *priv;
+ struct net_device *ndev;
+ int ret;
+
+ ndev = dev_get_drvdata(dev);
+ priv = netdev_priv(ndev);
+
+ ret = clk_bulk_prepare_enable(MTK_STAR_NCLKS, priv->clks);
+ if (ret)
+ return ret;
+
+ if (netif_running(ndev)) {
+ ret = mtk_star_enable(ndev);
+ if (ret)
+ clk_bulk_disable_unprepare(MTK_STAR_NCLKS, priv->clks);
+ }
+
+ return ret;
+}
+
+static void mtk_star_clk_disable_unprepare(void *data)
+{
+ struct mtk_star_priv *priv = data;
+
+ clk_bulk_disable_unprepare(MTK_STAR_NCLKS, priv->clks);
+}
+
+static void mtk_star_mdiobus_unregister(void *data)
+{
+ struct mtk_star_priv *priv = data;
+
+ mdiobus_unregister(priv->mii);
+}
+
+static void mtk_star_unregister_netdev(void *data)
+{
+ struct net_device *ndev = data;
+
+ unregister_netdev(ndev);
+}
+
+static int mtk_star_probe(struct platform_device *pdev)
+{
+ struct device_node *of_node;
+ struct mtk_star_priv *priv;
+ struct net_device *ndev;
+ struct device *dev;
+ void __iomem *base;
+ int ret, i;
+
+ dev = &pdev->dev;
+ of_node = dev->of_node;
+
+ ndev = devm_alloc_etherdev(dev, sizeof(*priv));
+ if (!ndev)
+ return -ENOMEM;
+
+ priv = netdev_priv(ndev);
+ priv->ndev = ndev;
+ SET_NETDEV_DEV(ndev, dev);
+ platform_set_drvdata(pdev, ndev);
+
+ ndev->min_mtu = ETH_ZLEN;
+ ndev->max_mtu = MTK_STAR_MAX_FRAME_SIZE;
+
+ spin_lock_init(&priv->lock);
+ INIT_WORK(&priv->stats_work, mtk_star_update_stats_work);
+
+ base = devm_platform_ioremap_resource(pdev, 0);
+ if (IS_ERR(base))
+ return PTR_ERR(base);
+
+ /* We won't be checking the return values of regmap read & write
+ * functions. They can only fail for mmio if there's a clock attached
+ * to regmap which is not the case here.
+ */
+ priv->regs = devm_regmap_init_mmio(dev, base,
+ &mtk_star_regmap_config);
+ if (IS_ERR(priv->regs))
+ return PTR_ERR(priv->regs);
+
+ priv->pericfg = syscon_regmap_lookup_by_phandle(of_node,
+ "mediatek,pericfg");
+ if (IS_ERR(priv->pericfg)) {
+ dev_err(dev, "Failed to lookup the PERICFG syscon\n");
+ return PTR_ERR(priv->pericfg);
+ }
+
+ ndev->irq = platform_get_irq(pdev, 0);
+ if (ndev->irq < 0)
+ return ndev->irq;
+
+ for (i = 0; i < MTK_STAR_NCLKS; i++)
+ priv->clks[i].id = mtk_star_clk_names[i];
+ ret = devm_clk_bulk_get(dev, MTK_STAR_NCLKS, priv->clks);
+ if (ret)
+ return ret;
+
+ ret = clk_bulk_prepare_enable(MTK_STAR_NCLKS, priv->clks);
+ if (ret)
+ return ret;
+
+ ret = devm_add_action_or_reset(dev,
+ mtk_star_clk_disable_unprepare, priv);
+ if (ret)
+ return ret;
+
+ ret = of_get_phy_mode(of_node, &priv->phy_intf);
+ if (ret) {
+ return ret;
+ } else if (priv->phy_intf != PHY_INTERFACE_MODE_RMII) {
+ dev_err(dev, "unsupported phy mode: %s\n",
+ phy_modes(priv->phy_intf));
+ return -EINVAL;
+ }
+
+ priv->phy_node = of_parse_phandle(of_node, "phy-handle", 0);
+ if (!priv->phy_node) {
+ dev_err(dev, "failed to retrieve the phy handle from device tree\n");
+ return -ENODEV;
+ }
+
+ mtk_star_set_mode_rmii(priv);
+
+ ret = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(32));
+ if (ret) {
+ dev_err(dev, "unsupported DMA mask\n");
+ return ret;
+ }
+
+ priv->ring_base = dmam_alloc_coherent(dev, MTK_STAR_DMA_SIZE,
+ &priv->dma_addr,
+ GFP_KERNEL | GFP_DMA);
+ if (!priv->ring_base)
+ return -ENOMEM;
+
+ mtk_star_nic_disable_pd(priv);
+ mtk_star_init_config(priv);
+
+ ret = mtk_star_mdio_init(ndev);
+ if (ret)
+ return ret;
+
+ ret = devm_add_action_or_reset(dev, mtk_star_mdiobus_unregister, priv);
+ if (ret)
+ return ret;
+
+ ret = eth_platform_get_mac_address(dev, ndev->dev_addr);
+ if (ret || !is_valid_ether_addr(ndev->dev_addr))
+ eth_hw_addr_random(ndev);
+
+ ndev->netdev_ops = &mtk_star_netdev_ops;
+ ndev->ethtool_ops = &mtk_star_ethtool_ops;
+
+ netif_napi_add(ndev, &priv->napi, mtk_star_poll, MTK_STAR_NAPI_WEIGHT);
+
+ ret = register_netdev(ndev);
+ if (ret)
+ return ret;
+
+ ret = devm_add_action_or_reset(dev, mtk_star_unregister_netdev, ndev);
+ if (ret)
+ return ret;
+
+ return 0;
+}
+
+static const struct of_device_id mtk_star_of_match[] = {
+ { .compatible = "mediatek,mt8516-eth", },
+ { .compatible = "mediatek,mt8518-eth", },
+ { .compatible = "mediatek,mt8175-eth", },
+ { }
+};
+MODULE_DEVICE_TABLE(of, mtk_star_of_match);
+
+static SIMPLE_DEV_PM_OPS(mtk_star_pm_ops,
+ mtk_star_suspend, mtk_star_resume);
+
+static struct platform_driver mtk_star_driver = {
+ .driver = {
+ .name = MTK_STAR_DRVNAME,
+ .pm = &mtk_star_pm_ops,
+ .of_match_table = of_match_ptr(mtk_star_of_match),
+ },
+ .probe = mtk_star_probe,
+};
+module_platform_driver(mtk_star_driver);
+
+MODULE_AUTHOR("Bartosz Golaszewski <bgolaszewski@baylibre.com>");
+MODULE_DESCRIPTION("Mediatek STAR Ethernet MAC Driver");
+MODULE_LICENSE("GPL");
Legacy SRIOV mode (L2 mac vlan steering based).
Switchdev mode (eswitch offloads).
+config MLX5_CLS_ACT
+ bool "MLX5 TC classifier action support"
+ depends on MLX5_ESWITCH && NET_CLS_ACT
+ default y
+ help
+ mlx5 ConnectX offloads support for TC classifier action (NET_CLS_ACT),
+ works in both native NIC mdoe and Switchdev SRIOV mode.
+ Actions get attached to a Hardware offloaded classifiers and are
+ invoked after a successful classification. Actions are used to
+ overwrite the classification result, instantly drop or redirect and/or
+ reformat packets in wire speeds without involving the host cpu.
+
+ If set to N, TC offloads in both NIC and switchdev modes will be disabled.
+ If unsure, set to Y
+
config MLX5_TC_CT
bool "MLX5 TC connection tracking offload support"
- depends on MLX5_CORE_EN && NET_SWITCHDEV && NF_FLOW_TABLE && NET_ACT_CT && NET_TC_SKB_EXT
+ depends on MLX5_CLS_ACT && NF_FLOW_TABLE && NET_ACT_CT && NET_TC_SKB_EXT
default y
help
Say Y here if you want to support offloading connection tracking rules
mlx5_core-$(CONFIG_MLX5_EN_ARFS) += en_arfs.o
mlx5_core-$(CONFIG_MLX5_EN_RXNFC) += en_fs_ethtool.o
mlx5_core-$(CONFIG_MLX5_CORE_EN_DCB) += en_dcbnl.o en/port_buffer.o
-mlx5_core-$(CONFIG_MLX5_ESWITCH) += en_rep.o en_tc.o en/tc_tun.o lib/port_tun.o lag_mp.o \
- lib/geneve.o en/mapping.o en/tc_tun_vxlan.o en/tc_tun_gre.o \
- en/tc_tun_geneve.o diag/en_tc_tracepoint.o
mlx5_core-$(CONFIG_PCI_HYPERV_INTERFACE) += en/hv_vhca_stats.o
+mlx5_core-$(CONFIG_MLX5_ESWITCH) += en_rep.o lib/geneve.o lib/port_tun.o lag_mp.o
+mlx5_core-$(CONFIG_MLX5_CLS_ACT) += en_tc.o en/rep/tc.o en/rep/neigh.o \
+ en/mapping.o esw/chains.o en/tc_tun.o \
+ en/tc_tun_vxlan.o en/tc_tun_gre.o en/tc_tun_geneve.o \
+ en/tc_tun_mplsoudp.o diag/en_tc_tracepoint.o
mlx5_core-$(CONFIG_MLX5_TC_CT) += en/tc_ct.o
#
# Core extra
#
mlx5_core-$(CONFIG_MLX5_ESWITCH) += eswitch.o eswitch_offloads.o eswitch_offloads_termtbl.o \
- ecpf.o rdma.o esw/chains.o
+ ecpf.o rdma.o
mlx5_core-$(CONFIG_MLX5_MPFS) += lib/mpfs.o
mlx5_core-$(CONFIG_VXLAN) += lib/vxlan.o
mlx5_core-$(CONFIG_PTP_1588_CLOCK) += lib/clock.o
dma_addr_t addr;
union {
struct page *page;
- struct {
- u64 handle;
- void *data;
- } xsk;
+ struct xdp_buff *xsk;
};
};
} mpwqe;
};
struct {
- u16 umem_headroom;
u16 headroom;
u32 frame0_sz;
u8 map_dir; /* dma map direction */
struct page_pool *page_pool;
/* AF_XDP zero-copy */
- struct zero_copy_allocator zca;
struct xdp_umem *umem;
struct work_struct recover_work;
u16 mlx5e_get_linear_rq_headroom(struct mlx5e_params *params,
struct mlx5e_xsk_param *xsk)
{
- u16 headroom = NET_IP_ALIGN;
+ u16 headroom;
- if (mlx5e_rx_is_xdp(params, xsk)) {
+ if (xsk)
+ return xsk->headroom;
+
+ headroom = NET_IP_ALIGN;
+ if (mlx5e_rx_is_xdp(params, xsk))
headroom += XDP_PACKET_HEADROOM;
- if (xsk)
- headroom += xsk->headroom;
- } else {
+ else
headroom += MLX5_RX_HEADROOM;
- }
return headroom;
}
--- /dev/null
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/* Copyright (c) 2020 Mellanox Technologies. */
+
+#include <linux/refcount.h>
+#include <linux/list.h>
+#include <linux/rculist.h>
+#include <linux/rtnetlink.h>
+#include <linux/workqueue.h>
+#include <linux/rwlock.h>
+#include <linux/spinlock.h>
+#include <linux/notifier.h>
+#include <net/netevent.h>
+#include "neigh.h"
+#include "tc.h"
+#include "en_rep.h"
+#include "fs_core.h"
+#include "diag/en_rep_tracepoint.h"
+
+static unsigned long mlx5e_rep_ipv6_interval(void)
+{
+ if (IS_ENABLED(CONFIG_IPV6) && ipv6_stub->nd_tbl)
+ return NEIGH_VAR(&ipv6_stub->nd_tbl->parms, DELAY_PROBE_TIME);
+
+ return ~0UL;
+}
+
+static void mlx5e_rep_neigh_update_init_interval(struct mlx5e_rep_priv *rpriv)
+{
+ unsigned long ipv4_interval = NEIGH_VAR(&arp_tbl.parms, DELAY_PROBE_TIME);
+ unsigned long ipv6_interval = mlx5e_rep_ipv6_interval();
+ struct net_device *netdev = rpriv->netdev;
+ struct mlx5e_priv *priv = netdev_priv(netdev);
+
+ rpriv->neigh_update.min_interval = min_t(unsigned long, ipv6_interval, ipv4_interval);
+ mlx5_fc_update_sampling_interval(priv->mdev, rpriv->neigh_update.min_interval);
+}
+
+void mlx5e_rep_queue_neigh_stats_work(struct mlx5e_priv *priv)
+{
+ struct mlx5e_rep_priv *rpriv = priv->ppriv;
+ struct mlx5e_neigh_update_table *neigh_update = &rpriv->neigh_update;
+
+ mlx5_fc_queue_stats_work(priv->mdev,
+ &neigh_update->neigh_stats_work,
+ neigh_update->min_interval);
+}
+
+static bool mlx5e_rep_neigh_entry_hold(struct mlx5e_neigh_hash_entry *nhe)
+{
+ return refcount_inc_not_zero(&nhe->refcnt);
+}
+
+static void mlx5e_rep_neigh_entry_remove(struct mlx5e_neigh_hash_entry *nhe);
+
+void mlx5e_rep_neigh_entry_release(struct mlx5e_neigh_hash_entry *nhe)
+{
+ if (refcount_dec_and_test(&nhe->refcnt)) {
+ mlx5e_rep_neigh_entry_remove(nhe);
+ kfree_rcu(nhe, rcu);
+ }
+}
+
+static struct mlx5e_neigh_hash_entry *
+mlx5e_get_next_nhe(struct mlx5e_rep_priv *rpriv,
+ struct mlx5e_neigh_hash_entry *nhe)
+{
+ struct mlx5e_neigh_hash_entry *next = NULL;
+
+ rcu_read_lock();
+
+ for (next = nhe ?
+ list_next_or_null_rcu(&rpriv->neigh_update.neigh_list,
+ &nhe->neigh_list,
+ struct mlx5e_neigh_hash_entry,
+ neigh_list) :
+ list_first_or_null_rcu(&rpriv->neigh_update.neigh_list,
+ struct mlx5e_neigh_hash_entry,
+ neigh_list);
+ next;
+ next = list_next_or_null_rcu(&rpriv->neigh_update.neigh_list,
+ &next->neigh_list,
+ struct mlx5e_neigh_hash_entry,
+ neigh_list))
+ if (mlx5e_rep_neigh_entry_hold(next))
+ break;
+
+ rcu_read_unlock();
+
+ if (nhe)
+ mlx5e_rep_neigh_entry_release(nhe);
+
+ return next;
+}
+
+static void mlx5e_rep_neigh_stats_work(struct work_struct *work)
+{
+ struct mlx5e_rep_priv *rpriv = container_of(work, struct mlx5e_rep_priv,
+ neigh_update.neigh_stats_work.work);
+ struct net_device *netdev = rpriv->netdev;
+ struct mlx5e_priv *priv = netdev_priv(netdev);
+ struct mlx5e_neigh_hash_entry *nhe = NULL;
+
+ rtnl_lock();
+ if (!list_empty(&rpriv->neigh_update.neigh_list))
+ mlx5e_rep_queue_neigh_stats_work(priv);
+
+ while ((nhe = mlx5e_get_next_nhe(rpriv, nhe)) != NULL)
+ mlx5e_tc_update_neigh_used_value(nhe);
+
+ rtnl_unlock();
+}
+
+static void mlx5e_rep_neigh_update(struct work_struct *work)
+{
+ struct mlx5e_neigh_hash_entry *nhe =
+ container_of(work, struct mlx5e_neigh_hash_entry, neigh_update_work);
+ struct neighbour *n = nhe->n;
+ struct mlx5e_encap_entry *e;
+ unsigned char ha[ETH_ALEN];
+ struct mlx5e_priv *priv;
+ bool neigh_connected;
+ u8 nud_state, dead;
+
+ rtnl_lock();
+
+ /* If these parameters are changed after we release the lock,
+ * we'll receive another event letting us know about it.
+ * We use this lock to avoid inconsistency between the neigh validity
+ * and it's hw address.
+ */
+ read_lock_bh(&n->lock);
+ memcpy(ha, n->ha, ETH_ALEN);
+ nud_state = n->nud_state;
+ dead = n->dead;
+ read_unlock_bh(&n->lock);
+
+ neigh_connected = (nud_state & NUD_VALID) && !dead;
+
+ trace_mlx5e_rep_neigh_update(nhe, ha, neigh_connected);
+
+ list_for_each_entry(e, &nhe->encap_list, encap_list) {
+ if (!mlx5e_encap_take(e))
+ continue;
+
+ priv = netdev_priv(e->out_dev);
+ mlx5e_rep_update_flows(priv, e, neigh_connected, ha);
+ mlx5e_encap_put(priv, e);
+ }
+ mlx5e_rep_neigh_entry_release(nhe);
+ rtnl_unlock();
+ neigh_release(n);
+}
+
+static void mlx5e_rep_queue_neigh_update_work(struct mlx5e_priv *priv,
+ struct mlx5e_neigh_hash_entry *nhe,
+ struct neighbour *n)
+{
+ /* Take a reference to ensure the neighbour and mlx5 encap
+ * entry won't be destructed until we drop the reference in
+ * delayed work.
+ */
+ neigh_hold(n);
+
+ /* This assignment is valid as long as the the neigh reference
+ * is taken
+ */
+ nhe->n = n;
+
+ if (!queue_work(priv->wq, &nhe->neigh_update_work)) {
+ mlx5e_rep_neigh_entry_release(nhe);
+ neigh_release(n);
+ }
+}
+
+static int mlx5e_rep_netevent_event(struct notifier_block *nb,
+ unsigned long event, void *ptr)
+{
+ struct mlx5e_rep_priv *rpriv = container_of(nb, struct mlx5e_rep_priv,
+ neigh_update.netevent_nb);
+ struct mlx5e_neigh_update_table *neigh_update = &rpriv->neigh_update;
+ struct net_device *netdev = rpriv->netdev;
+ struct mlx5e_priv *priv = netdev_priv(netdev);
+ struct mlx5e_neigh_hash_entry *nhe = NULL;
+ struct mlx5e_neigh m_neigh = {};
+ struct neigh_parms *p;
+ struct neighbour *n;
+ bool found = false;
+
+ switch (event) {
+ case NETEVENT_NEIGH_UPDATE:
+ n = ptr;
+#if IS_ENABLED(CONFIG_IPV6)
+ if (n->tbl != ipv6_stub->nd_tbl && n->tbl != &arp_tbl)
+#else
+ if (n->tbl != &arp_tbl)
+#endif
+ return NOTIFY_DONE;
+
+ m_neigh.dev = n->dev;
+ m_neigh.family = n->ops->family;
+ memcpy(&m_neigh.dst_ip, n->primary_key, n->tbl->key_len);
+
+ rcu_read_lock();
+ nhe = mlx5e_rep_neigh_entry_lookup(priv, &m_neigh);
+ rcu_read_unlock();
+ if (!nhe)
+ return NOTIFY_DONE;
+
+ mlx5e_rep_queue_neigh_update_work(priv, nhe, n);
+ break;
+
+ case NETEVENT_DELAY_PROBE_TIME_UPDATE:
+ p = ptr;
+
+ /* We check the device is present since we don't care about
+ * changes in the default table, we only care about changes
+ * done per device delay prob time parameter.
+ */
+#if IS_ENABLED(CONFIG_IPV6)
+ if (!p->dev || (p->tbl != ipv6_stub->nd_tbl && p->tbl != &arp_tbl))
+#else
+ if (!p->dev || p->tbl != &arp_tbl)
+#endif
+ return NOTIFY_DONE;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(nhe, &neigh_update->neigh_list,
+ neigh_list) {
+ if (p->dev == nhe->m_neigh.dev) {
+ found = true;
+ break;
+ }
+ }
+ rcu_read_unlock();
+ if (!found)
+ return NOTIFY_DONE;
+
+ neigh_update->min_interval = min_t(unsigned long,
+ NEIGH_VAR(p, DELAY_PROBE_TIME),
+ neigh_update->min_interval);
+ mlx5_fc_update_sampling_interval(priv->mdev,
+ neigh_update->min_interval);
+ break;
+ }
+ return NOTIFY_DONE;
+}
+
+static const struct rhashtable_params mlx5e_neigh_ht_params = {
+ .head_offset = offsetof(struct mlx5e_neigh_hash_entry, rhash_node),
+ .key_offset = offsetof(struct mlx5e_neigh_hash_entry, m_neigh),
+ .key_len = sizeof(struct mlx5e_neigh),
+ .automatic_shrinking = true,
+};
+
+int mlx5e_rep_neigh_init(struct mlx5e_rep_priv *rpriv)
+{
+ struct mlx5e_neigh_update_table *neigh_update = &rpriv->neigh_update;
+ int err;
+
+ err = rhashtable_init(&neigh_update->neigh_ht, &mlx5e_neigh_ht_params);
+ if (err)
+ return err;
+
+ INIT_LIST_HEAD(&neigh_update->neigh_list);
+ mutex_init(&neigh_update->encap_lock);
+ INIT_DELAYED_WORK(&neigh_update->neigh_stats_work,
+ mlx5e_rep_neigh_stats_work);
+ mlx5e_rep_neigh_update_init_interval(rpriv);
+
+ rpriv->neigh_update.netevent_nb.notifier_call = mlx5e_rep_netevent_event;
+ err = register_netevent_notifier(&rpriv->neigh_update.netevent_nb);
+ if (err)
+ goto out_err;
+ return 0;
+
+out_err:
+ rhashtable_destroy(&neigh_update->neigh_ht);
+ return err;
+}
+
+void mlx5e_rep_neigh_cleanup(struct mlx5e_rep_priv *rpriv)
+{
+ struct mlx5e_neigh_update_table *neigh_update = &rpriv->neigh_update;
+ struct mlx5e_priv *priv = netdev_priv(rpriv->netdev);
+
+ unregister_netevent_notifier(&neigh_update->netevent_nb);
+
+ flush_workqueue(priv->wq); /* flush neigh update works */
+
+ cancel_delayed_work_sync(&rpriv->neigh_update.neigh_stats_work);
+
+ mutex_destroy(&neigh_update->encap_lock);
+ rhashtable_destroy(&neigh_update->neigh_ht);
+}
+
+static int mlx5e_rep_neigh_entry_insert(struct mlx5e_priv *priv,
+ struct mlx5e_neigh_hash_entry *nhe)
+{
+ struct mlx5e_rep_priv *rpriv = priv->ppriv;
+ int err;
+
+ err = rhashtable_insert_fast(&rpriv->neigh_update.neigh_ht,
+ &nhe->rhash_node,
+ mlx5e_neigh_ht_params);
+ if (err)
+ return err;
+
+ list_add_rcu(&nhe->neigh_list, &rpriv->neigh_update.neigh_list);
+
+ return err;
+}
+
+static void mlx5e_rep_neigh_entry_remove(struct mlx5e_neigh_hash_entry *nhe)
+{
+ struct mlx5e_rep_priv *rpriv = nhe->priv->ppriv;
+
+ mutex_lock(&rpriv->neigh_update.encap_lock);
+
+ list_del_rcu(&nhe->neigh_list);
+
+ rhashtable_remove_fast(&rpriv->neigh_update.neigh_ht,
+ &nhe->rhash_node,
+ mlx5e_neigh_ht_params);
+ mutex_unlock(&rpriv->neigh_update.encap_lock);
+}
+
+/* This function must only be called under the representor's encap_lock or
+ * inside rcu read lock section.
+ */
+struct mlx5e_neigh_hash_entry *
+mlx5e_rep_neigh_entry_lookup(struct mlx5e_priv *priv,
+ struct mlx5e_neigh *m_neigh)
+{
+ struct mlx5e_rep_priv *rpriv = priv->ppriv;
+ struct mlx5e_neigh_update_table *neigh_update = &rpriv->neigh_update;
+ struct mlx5e_neigh_hash_entry *nhe;
+
+ nhe = rhashtable_lookup_fast(&neigh_update->neigh_ht, m_neigh,
+ mlx5e_neigh_ht_params);
+ return nhe && mlx5e_rep_neigh_entry_hold(nhe) ? nhe : NULL;
+}
+
+int mlx5e_rep_neigh_entry_create(struct mlx5e_priv *priv,
+ struct mlx5e_encap_entry *e,
+ struct mlx5e_neigh_hash_entry **nhe)
+{
+ int err;
+
+ *nhe = kzalloc(sizeof(**nhe), GFP_KERNEL);
+ if (!*nhe)
+ return -ENOMEM;
+
+ (*nhe)->priv = priv;
+ memcpy(&(*nhe)->m_neigh, &e->m_neigh, sizeof(e->m_neigh));
+ INIT_WORK(&(*nhe)->neigh_update_work, mlx5e_rep_neigh_update);
+ spin_lock_init(&(*nhe)->encap_list_lock);
+ INIT_LIST_HEAD(&(*nhe)->encap_list);
+ refcount_set(&(*nhe)->refcnt, 1);
+
+ err = mlx5e_rep_neigh_entry_insert(priv, *nhe);
+ if (err)
+ goto out_free;
+ return 0;
+
+out_free:
+ kfree(*nhe);
+ return err;
+}
--- /dev/null
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/* Copyright (c) 2020 Mellanox Technologies. */
+
+#ifndef __MLX5_EN_REP_NEIGH__
+#define __MLX5_EN_REP_NEIGH__
+
+#include "en.h"
+#include "en_rep.h"
+
+#if IS_ENABLED(CONFIG_MLX5_CLS_ACT)
+
+int mlx5e_rep_neigh_init(struct mlx5e_rep_priv *rpriv);
+void mlx5e_rep_neigh_cleanup(struct mlx5e_rep_priv *rpriv);
+
+struct mlx5e_neigh_hash_entry *
+mlx5e_rep_neigh_entry_lookup(struct mlx5e_priv *priv,
+ struct mlx5e_neigh *m_neigh);
+int mlx5e_rep_neigh_entry_create(struct mlx5e_priv *priv,
+ struct mlx5e_encap_entry *e,
+ struct mlx5e_neigh_hash_entry **nhe);
+void mlx5e_rep_neigh_entry_release(struct mlx5e_neigh_hash_entry *nhe);
+
+void mlx5e_rep_queue_neigh_stats_work(struct mlx5e_priv *priv);
+
+#else /* CONFIG_MLX5_CLS_ACT */
+
+static inline int
+mlx5e_rep_neigh_init(struct mlx5e_rep_priv *rpriv) { return 0; }
+static inline void
+mlx5e_rep_neigh_cleanup(struct mlx5e_rep_priv *rpriv) {}
+
+#endif /* CONFIG_MLX5_CLS_ACT */
+
+#endif /* __MLX5_EN_REP_NEIGH__ */
--- /dev/null
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/* Copyright (c) 2020 Mellanox Technologies. */
+
+#include <net/dst_metadata.h>
+#include <linux/netdevice.h>
+#include <linux/list.h>
+#include <linux/rculist.h>
+#include <linux/rtnetlink.h>
+#include <linux/workqueue.h>
+#include <linux/spinlock.h>
+#include "tc.h"
+#include "neigh.h"
+#include "en_rep.h"
+#include "eswitch.h"
+#include "esw/chains.h"
+#include "en/tc_ct.h"
+#include "en/mapping.h"
+#include "en/tc_tun.h"
+#include "lib/port_tun.h"
+
+struct mlx5e_rep_indr_block_priv {
+ struct net_device *netdev;
+ struct mlx5e_rep_priv *rpriv;
+
+ struct list_head list;
+};
+
+int mlx5e_rep_encap_entry_attach(struct mlx5e_priv *priv,
+ struct mlx5e_encap_entry *e)
+{
+ struct mlx5e_rep_priv *rpriv = priv->ppriv;
+ struct mlx5_rep_uplink_priv *uplink_priv = &rpriv->uplink_priv;
+ struct mlx5_tun_entropy *tun_entropy = &uplink_priv->tun_entropy;
+ struct mlx5e_neigh_hash_entry *nhe;
+ int err;
+
+ err = mlx5_tun_entropy_refcount_inc(tun_entropy, e->reformat_type);
+ if (err)
+ return err;
+
+ mutex_lock(&rpriv->neigh_update.encap_lock);
+ nhe = mlx5e_rep_neigh_entry_lookup(priv, &e->m_neigh);
+ if (!nhe) {
+ err = mlx5e_rep_neigh_entry_create(priv, e, &nhe);
+ if (err) {
+ mutex_unlock(&rpriv->neigh_update.encap_lock);
+ mlx5_tun_entropy_refcount_dec(tun_entropy,
+ e->reformat_type);
+ return err;
+ }
+ }
+
+ e->nhe = nhe;
+ spin_lock(&nhe->encap_list_lock);
+ list_add_rcu(&e->encap_list, &nhe->encap_list);
+ spin_unlock(&nhe->encap_list_lock);
+
+ mutex_unlock(&rpriv->neigh_update.encap_lock);
+
+ return 0;
+}
+
+void mlx5e_rep_encap_entry_detach(struct mlx5e_priv *priv,
+ struct mlx5e_encap_entry *e)
+{
+ struct mlx5e_rep_priv *rpriv = priv->ppriv;
+ struct mlx5_rep_uplink_priv *uplink_priv = &rpriv->uplink_priv;
+ struct mlx5_tun_entropy *tun_entropy = &uplink_priv->tun_entropy;
+
+ if (!e->nhe)
+ return;
+
+ spin_lock(&e->nhe->encap_list_lock);
+ list_del_rcu(&e->encap_list);
+ spin_unlock(&e->nhe->encap_list_lock);
+
+ mlx5e_rep_neigh_entry_release(e->nhe);
+ e->nhe = NULL;
+ mlx5_tun_entropy_refcount_dec(tun_entropy, e->reformat_type);
+}
+
+void mlx5e_rep_update_flows(struct mlx5e_priv *priv,
+ struct mlx5e_encap_entry *e,
+ bool neigh_connected,
+ unsigned char ha[ETH_ALEN])
+{
+ struct ethhdr *eth = (struct ethhdr *)e->encap_header;
+ struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
+ bool encap_connected;
+ LIST_HEAD(flow_list);
+
+ ASSERT_RTNL();
+
+ /* wait for encap to be fully initialized */
+ wait_for_completion(&e->res_ready);
+
+ mutex_lock(&esw->offloads.encap_tbl_lock);
+ encap_connected = !!(e->flags & MLX5_ENCAP_ENTRY_VALID);
+ if (e->compl_result < 0 || (encap_connected == neigh_connected &&
+ ether_addr_equal(e->h_dest, ha)))
+ goto unlock;
+
+ mlx5e_take_all_encap_flows(e, &flow_list);
+
+ if ((e->flags & MLX5_ENCAP_ENTRY_VALID) &&
+ (!neigh_connected || !ether_addr_equal(e->h_dest, ha)))
+ mlx5e_tc_encap_flows_del(priv, e, &flow_list);
+
+ if (neigh_connected && !(e->flags & MLX5_ENCAP_ENTRY_VALID)) {
+ ether_addr_copy(e->h_dest, ha);
+ ether_addr_copy(eth->h_dest, ha);
+ /* Update the encap source mac, in case that we delete
+ * the flows when encap source mac changed.
+ */
+ ether_addr_copy(eth->h_source, e->route_dev->dev_addr);
+
+ mlx5e_tc_encap_flows_add(priv, e, &flow_list);
+ }
+unlock:
+ mutex_unlock(&esw->offloads.encap_tbl_lock);
+ mlx5e_put_encap_flow_list(priv, &flow_list);
+}
+
+static int
+mlx5e_rep_setup_tc_cls_flower(struct mlx5e_priv *priv,
+ struct flow_cls_offload *cls_flower, int flags)
+{
+ switch (cls_flower->command) {
+ case FLOW_CLS_REPLACE:
+ return mlx5e_configure_flower(priv->netdev, priv, cls_flower,
+ flags);
+ case FLOW_CLS_DESTROY:
+ return mlx5e_delete_flower(priv->netdev, priv, cls_flower,
+ flags);
+ case FLOW_CLS_STATS:
+ return mlx5e_stats_flower(priv->netdev, priv, cls_flower,
+ flags);
+ default:
+ return -EOPNOTSUPP;
+ }
+}
+
+static
+int mlx5e_rep_setup_tc_cls_matchall(struct mlx5e_priv *priv,
+ struct tc_cls_matchall_offload *ma)
+{
+ switch (ma->command) {
+ case TC_CLSMATCHALL_REPLACE:
+ return mlx5e_tc_configure_matchall(priv, ma);
+ case TC_CLSMATCHALL_DESTROY:
+ return mlx5e_tc_delete_matchall(priv, ma);
+ case TC_CLSMATCHALL_STATS:
+ mlx5e_tc_stats_matchall(priv, ma);
+ return 0;
+ default:
+ return -EOPNOTSUPP;
+ }
+}
+
+static int mlx5e_rep_setup_tc_cb(enum tc_setup_type type, void *type_data,
+ void *cb_priv)
+{
+ unsigned long flags = MLX5_TC_FLAG(INGRESS) | MLX5_TC_FLAG(ESW_OFFLOAD);
+ struct mlx5e_priv *priv = cb_priv;
+
+ switch (type) {
+ case TC_SETUP_CLSFLOWER:
+ return mlx5e_rep_setup_tc_cls_flower(priv, type_data, flags);
+ case TC_SETUP_CLSMATCHALL:
+ return mlx5e_rep_setup_tc_cls_matchall(priv, type_data);
+ default:
+ return -EOPNOTSUPP;
+ }
+}
+
+static int mlx5e_rep_setup_ft_cb(enum tc_setup_type type, void *type_data,
+ void *cb_priv)
+{
+ struct flow_cls_offload tmp, *f = type_data;
+ struct mlx5e_priv *priv = cb_priv;
+ struct mlx5_eswitch *esw;
+ unsigned long flags;
+ int err;
+
+ flags = MLX5_TC_FLAG(INGRESS) |
+ MLX5_TC_FLAG(ESW_OFFLOAD) |
+ MLX5_TC_FLAG(FT_OFFLOAD);
+ esw = priv->mdev->priv.eswitch;
+
+ switch (type) {
+ case TC_SETUP_CLSFLOWER:
+ memcpy(&tmp, f, sizeof(*f));
+
+ if (!mlx5_esw_chains_prios_supported(esw))
+ return -EOPNOTSUPP;
+
+ /* Re-use tc offload path by moving the ft flow to the
+ * reserved ft chain.
+ *
+ * FT offload can use prio range [0, INT_MAX], so we normalize
+ * it to range [1, mlx5_esw_chains_get_prio_range(esw)]
+ * as with tc, where prio 0 isn't supported.
+ *
+ * We only support chain 0 of FT offload.
+ */
+ if (tmp.common.prio >= mlx5_esw_chains_get_prio_range(esw))
+ return -EOPNOTSUPP;
+ if (tmp.common.chain_index != 0)
+ return -EOPNOTSUPP;
+
+ tmp.common.chain_index = mlx5_esw_chains_get_ft_chain(esw);
+ tmp.common.prio++;
+ err = mlx5e_rep_setup_tc_cls_flower(priv, &tmp, flags);
+ memcpy(&f->stats, &tmp.stats, sizeof(f->stats));
+ return err;
+ default:
+ return -EOPNOTSUPP;
+ }
+}
+
+static LIST_HEAD(mlx5e_rep_block_tc_cb_list);
+static LIST_HEAD(mlx5e_rep_block_ft_cb_list);
+int mlx5e_rep_setup_tc(struct net_device *dev, enum tc_setup_type type,
+ void *type_data)
+{
+ struct mlx5e_priv *priv = netdev_priv(dev);
+ struct flow_block_offload *f = type_data;
+
+ f->unlocked_driver_cb = true;
+
+ switch (type) {
+ case TC_SETUP_BLOCK:
+ return flow_block_cb_setup_simple(type_data,
+ &mlx5e_rep_block_tc_cb_list,
+ mlx5e_rep_setup_tc_cb,
+ priv, priv, true);
+ case TC_SETUP_FT:
+ return flow_block_cb_setup_simple(type_data,
+ &mlx5e_rep_block_ft_cb_list,
+ mlx5e_rep_setup_ft_cb,
+ priv, priv, true);
+ default:
+ return -EOPNOTSUPP;
+ }
+}
+
+int mlx5e_rep_tc_init(struct mlx5e_rep_priv *rpriv)
+{
+ struct mlx5_rep_uplink_priv *uplink_priv = &rpriv->uplink_priv;
+ int err;
+
+ mutex_init(&uplink_priv->unready_flows_lock);
+ INIT_LIST_HEAD(&uplink_priv->unready_flows);
+
+ /* init shared tc flow table */
+ err = mlx5e_tc_esw_init(&uplink_priv->tc_ht);
+ return err;
+}
+
+void mlx5e_rep_tc_cleanup(struct mlx5e_rep_priv *rpriv)
+{
+ /* delete shared tc flow table */
+ mlx5e_tc_esw_cleanup(&rpriv->uplink_priv.tc_ht);
+ mutex_destroy(&rpriv->uplink_priv.unready_flows_lock);
+}
+
+void mlx5e_rep_tc_enable(struct mlx5e_priv *priv)
+{
+ struct mlx5e_rep_priv *rpriv = priv->ppriv;
+
+ INIT_WORK(&rpriv->uplink_priv.reoffload_flows_work,
+ mlx5e_tc_reoffload_flows_work);
+}
+
+void mlx5e_rep_tc_disable(struct mlx5e_priv *priv)
+{
+ struct mlx5e_rep_priv *rpriv = priv->ppriv;
+
+ cancel_work_sync(&rpriv->uplink_priv.reoffload_flows_work);
+}
+
+int mlx5e_rep_tc_event_port_affinity(struct mlx5e_priv *priv)
+{
+ struct mlx5e_rep_priv *rpriv = priv->ppriv;
+
+ queue_work(priv->wq, &rpriv->uplink_priv.reoffload_flows_work);
+
+ return NOTIFY_OK;
+}
+
+static struct mlx5e_rep_indr_block_priv *
+mlx5e_rep_indr_block_priv_lookup(struct mlx5e_rep_priv *rpriv,
+ struct net_device *netdev)
+{
+ struct mlx5e_rep_indr_block_priv *cb_priv;
+
+ /* All callback list access should be protected by RTNL. */
+ ASSERT_RTNL();
+
+ list_for_each_entry(cb_priv,
+ &rpriv->uplink_priv.tc_indr_block_priv_list,
+ list)
+ if (cb_priv->netdev == netdev)
+ return cb_priv;
+
+ return NULL;
+}
+
+static void mlx5e_rep_indr_unregister_block(struct mlx5e_rep_priv *rpriv,
+ struct net_device *netdev);
+
+void mlx5e_rep_indr_clean_block_privs(struct mlx5e_rep_priv *rpriv)
+{
+ struct mlx5e_rep_indr_block_priv *cb_priv, *temp;
+ struct list_head *head = &rpriv->uplink_priv.tc_indr_block_priv_list;
+
+ list_for_each_entry_safe(cb_priv, temp, head, list) {
+ mlx5e_rep_indr_unregister_block(rpriv, cb_priv->netdev);
+ kfree(cb_priv);
+ }
+}
+
+static int
+mlx5e_rep_indr_offload(struct net_device *netdev,
+ struct flow_cls_offload *flower,
+ struct mlx5e_rep_indr_block_priv *indr_priv,
+ unsigned long flags)
+{
+ struct mlx5e_priv *priv = netdev_priv(indr_priv->rpriv->netdev);
+ int err = 0;
+
+ switch (flower->command) {
+ case FLOW_CLS_REPLACE:
+ err = mlx5e_configure_flower(netdev, priv, flower, flags);
+ break;
+ case FLOW_CLS_DESTROY:
+ err = mlx5e_delete_flower(netdev, priv, flower, flags);
+ break;
+ case FLOW_CLS_STATS:
+ err = mlx5e_stats_flower(netdev, priv, flower, flags);
+ break;
+ default:
+ err = -EOPNOTSUPP;
+ }
+
+ return err;
+}
+
+static int mlx5e_rep_indr_setup_tc_cb(enum tc_setup_type type,
+ void *type_data, void *indr_priv)
+{
+ unsigned long flags = MLX5_TC_FLAG(EGRESS) | MLX5_TC_FLAG(ESW_OFFLOAD);
+ struct mlx5e_rep_indr_block_priv *priv = indr_priv;
+
+ switch (type) {
+ case TC_SETUP_CLSFLOWER:
+ return mlx5e_rep_indr_offload(priv->netdev, type_data, priv,
+ flags);
+ default:
+ return -EOPNOTSUPP;
+ }
+}
+
+static int mlx5e_rep_indr_setup_ft_cb(enum tc_setup_type type,
+ void *type_data, void *indr_priv)
+{
+ struct mlx5e_rep_indr_block_priv *priv = indr_priv;
+ struct flow_cls_offload *f = type_data;
+ struct flow_cls_offload tmp;
+ struct mlx5e_priv *mpriv;
+ struct mlx5_eswitch *esw;
+ unsigned long flags;
+ int err;
+
+ mpriv = netdev_priv(priv->rpriv->netdev);
+ esw = mpriv->mdev->priv.eswitch;
+
+ flags = MLX5_TC_FLAG(EGRESS) |
+ MLX5_TC_FLAG(ESW_OFFLOAD) |
+ MLX5_TC_FLAG(FT_OFFLOAD);
+
+ switch (type) {
+ case TC_SETUP_CLSFLOWER:
+ memcpy(&tmp, f, sizeof(*f));
+
+ /* Re-use tc offload path by moving the ft flow to the
+ * reserved ft chain.
+ *
+ * FT offload can use prio range [0, INT_MAX], so we normalize
+ * it to range [1, mlx5_esw_chains_get_prio_range(esw)]
+ * as with tc, where prio 0 isn't supported.
+ *
+ * We only support chain 0 of FT offload.
+ */
+ if (!mlx5_esw_chains_prios_supported(esw) ||
+ tmp.common.prio >= mlx5_esw_chains_get_prio_range(esw) ||
+ tmp.common.chain_index)
+ return -EOPNOTSUPP;
+
+ tmp.common.chain_index = mlx5_esw_chains_get_ft_chain(esw);
+ tmp.common.prio++;
+ err = mlx5e_rep_indr_offload(priv->netdev, &tmp, priv, flags);
+ memcpy(&f->stats, &tmp.stats, sizeof(f->stats));
+ return err;
+ default:
+ return -EOPNOTSUPP;
+ }
+}
+
+static void mlx5e_rep_indr_block_unbind(void *cb_priv)
+{
+ struct mlx5e_rep_indr_block_priv *indr_priv = cb_priv;
+
+ list_del(&indr_priv->list);
+ kfree(indr_priv);
+}
+
+static LIST_HEAD(mlx5e_block_cb_list);
+
+static int
+mlx5e_rep_indr_setup_block(struct net_device *netdev,
+ struct mlx5e_rep_priv *rpriv,
+ struct flow_block_offload *f,
+ flow_setup_cb_t *setup_cb)
+{
+ struct mlx5e_rep_indr_block_priv *indr_priv;
+ struct flow_block_cb *block_cb;
+
+ if (f->binder_type != FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS)
+ return -EOPNOTSUPP;
+
+ f->unlocked_driver_cb = true;
+ f->driver_block_list = &mlx5e_block_cb_list;
+
+ switch (f->command) {
+ case FLOW_BLOCK_BIND:
+ indr_priv = mlx5e_rep_indr_block_priv_lookup(rpriv, netdev);
+ if (indr_priv)
+ return -EEXIST;
+
+ indr_priv = kmalloc(sizeof(*indr_priv), GFP_KERNEL);
+ if (!indr_priv)
+ return -ENOMEM;
+
+ indr_priv->netdev = netdev;
+ indr_priv->rpriv = rpriv;
+ list_add(&indr_priv->list,
+ &rpriv->uplink_priv.tc_indr_block_priv_list);
+
+ block_cb = flow_block_cb_alloc(setup_cb, indr_priv, indr_priv,
+ mlx5e_rep_indr_block_unbind);
+ if (IS_ERR(block_cb)) {
+ list_del(&indr_priv->list);
+ kfree(indr_priv);
+ return PTR_ERR(block_cb);
+ }
+ flow_block_cb_add(block_cb, f);
+ list_add_tail(&block_cb->driver_list, &mlx5e_block_cb_list);
+
+ return 0;
+ case FLOW_BLOCK_UNBIND:
+ indr_priv = mlx5e_rep_indr_block_priv_lookup(rpriv, netdev);
+ if (!indr_priv)
+ return -ENOENT;
+
+ block_cb = flow_block_cb_lookup(f->block, setup_cb, indr_priv);
+ if (!block_cb)
+ return -ENOENT;
+
+ flow_block_cb_remove(block_cb, f);
+ list_del(&block_cb->driver_list);
+ return 0;
+ default:
+ return -EOPNOTSUPP;
+ }
+ return 0;
+}
+
+static
+int mlx5e_rep_indr_setup_cb(struct net_device *netdev, void *cb_priv,
+ enum tc_setup_type type, void *type_data)
+{
+ switch (type) {
+ case TC_SETUP_BLOCK:
+ return mlx5e_rep_indr_setup_block(netdev, cb_priv, type_data,
+ mlx5e_rep_indr_setup_tc_cb);
+ case TC_SETUP_FT:
+ return mlx5e_rep_indr_setup_block(netdev, cb_priv, type_data,
+ mlx5e_rep_indr_setup_ft_cb);
+ default:
+ return -EOPNOTSUPP;
+ }
+}
+
+static int mlx5e_rep_indr_register_block(struct mlx5e_rep_priv *rpriv,
+ struct net_device *netdev)
+{
+ int err;
+
+ err = __flow_indr_block_cb_register(netdev, rpriv,
+ mlx5e_rep_indr_setup_cb,
+ rpriv);
+ if (err) {
+ struct mlx5e_priv *priv = netdev_priv(rpriv->netdev);
+
+ mlx5_core_err(priv->mdev, "Failed to register remote block notifier for %s err=%d\n",
+ netdev_name(netdev), err);
+ }
+ return err;
+}
+
+static void mlx5e_rep_indr_unregister_block(struct mlx5e_rep_priv *rpriv,
+ struct net_device *netdev)
+{
+ __flow_indr_block_cb_unregister(netdev, mlx5e_rep_indr_setup_cb,
+ rpriv);
+}
+
+static int mlx5e_nic_rep_netdevice_event(struct notifier_block *nb,
+ unsigned long event, void *ptr)
+{
+ struct mlx5e_rep_priv *rpriv = container_of(nb, struct mlx5e_rep_priv,
+ uplink_priv.netdevice_nb);
+ struct mlx5e_priv *priv = netdev_priv(rpriv->netdev);
+ struct net_device *netdev = netdev_notifier_info_to_dev(ptr);
+
+ if (!mlx5e_tc_tun_device_to_offload(priv, netdev) &&
+ !(is_vlan_dev(netdev) && vlan_dev_real_dev(netdev) == rpriv->netdev))
+ return NOTIFY_OK;
+
+ switch (event) {
+ case NETDEV_REGISTER:
+ mlx5e_rep_indr_register_block(rpriv, netdev);
+ break;
+ case NETDEV_UNREGISTER:
+ mlx5e_rep_indr_unregister_block(rpriv, netdev);
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+int mlx5e_rep_tc_netdevice_event_register(struct mlx5e_rep_priv *rpriv)
+{
+ struct mlx5_rep_uplink_priv *uplink_priv = &rpriv->uplink_priv;
+ int err;
+
+ /* init indirect block notifications */
+ INIT_LIST_HEAD(&uplink_priv->tc_indr_block_priv_list);
+
+ uplink_priv->netdevice_nb.notifier_call = mlx5e_nic_rep_netdevice_event;
+ err = register_netdevice_notifier_dev_net(rpriv->netdev,
+ &uplink_priv->netdevice_nb,
+ &uplink_priv->netdevice_nn);
+ return err;
+}
+
+void mlx5e_rep_tc_netdevice_event_unregister(struct mlx5e_rep_priv *rpriv)
+{
+ struct mlx5_rep_uplink_priv *uplink_priv = &rpriv->uplink_priv;
+
+ /* clean indirect TC block notifications */
+ unregister_netdevice_notifier_dev_net(rpriv->netdev,
+ &uplink_priv->netdevice_nb,
+ &uplink_priv->netdevice_nn);
+}
+
+#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
+static bool mlx5e_restore_tunnel(struct mlx5e_priv *priv, struct sk_buff *skb,
+ struct mlx5e_tc_update_priv *tc_priv,
+ u32 tunnel_id)
+{
+ struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
+ struct tunnel_match_enc_opts enc_opts = {};
+ struct mlx5_rep_uplink_priv *uplink_priv;
+ struct mlx5e_rep_priv *uplink_rpriv;
+ struct metadata_dst *tun_dst;
+ struct tunnel_match_key key;
+ u32 tun_id, enc_opts_id;
+ struct net_device *dev;
+ int err;
+
+ enc_opts_id = tunnel_id & ENC_OPTS_BITS_MASK;
+ tun_id = tunnel_id >> ENC_OPTS_BITS;
+
+ if (!tun_id)
+ return true;
+
+ uplink_rpriv = mlx5_eswitch_get_uplink_priv(esw, REP_ETH);
+ uplink_priv = &uplink_rpriv->uplink_priv;
+
+ err = mapping_find(uplink_priv->tunnel_mapping, tun_id, &key);
+ if (err) {
+ WARN_ON_ONCE(true);
+ netdev_dbg(priv->netdev,
+ "Couldn't find tunnel for tun_id: %d, err: %d\n",
+ tun_id, err);
+ return false;
+ }
+
+ if (enc_opts_id) {
+ err = mapping_find(uplink_priv->tunnel_enc_opts_mapping,
+ enc_opts_id, &enc_opts);
+ if (err) {
+ netdev_dbg(priv->netdev,
+ "Couldn't find tunnel (opts) for tun_id: %d, err: %d\n",
+ enc_opts_id, err);
+ return false;
+ }
+ }
+
+ tun_dst = tun_rx_dst(enc_opts.key.len);
+ if (!tun_dst) {
+ WARN_ON_ONCE(true);
+ return false;
+ }
+
+ ip_tunnel_key_init(&tun_dst->u.tun_info.key,
+ key.enc_ipv4.src, key.enc_ipv4.dst,
+ key.enc_ip.tos, key.enc_ip.ttl,
+ 0, /* label */
+ key.enc_tp.src, key.enc_tp.dst,
+ key32_to_tunnel_id(key.enc_key_id.keyid),
+ TUNNEL_KEY);
+
+ if (enc_opts.key.len)
+ ip_tunnel_info_opts_set(&tun_dst->u.tun_info,
+ enc_opts.key.data,
+ enc_opts.key.len,
+ enc_opts.key.dst_opt_type);
+
+ skb_dst_set(skb, (struct dst_entry *)tun_dst);
+ dev = dev_get_by_index(&init_net, key.filter_ifindex);
+ if (!dev) {
+ netdev_dbg(priv->netdev,
+ "Couldn't find tunnel device with ifindex: %d\n",
+ key.filter_ifindex);
+ return false;
+ }
+
+ /* Set tun_dev so we do dev_put() after datapath */
+ tc_priv->tun_dev = dev;
+
+ skb->dev = dev;
+
+ return true;
+}
+#endif /* CONFIG_NET_TC_SKB_EXT */
+
+bool mlx5e_rep_tc_update_skb(struct mlx5_cqe64 *cqe,
+ struct sk_buff *skb,
+ struct mlx5e_tc_update_priv *tc_priv)
+{
+#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
+ u32 chain = 0, reg_c0, reg_c1, tunnel_id, tuple_id;
+ struct mlx5_rep_uplink_priv *uplink_priv;
+ struct mlx5e_rep_priv *uplink_rpriv;
+ struct tc_skb_ext *tc_skb_ext;
+ struct mlx5_eswitch *esw;
+ struct mlx5e_priv *priv;
+ int tunnel_moffset;
+ int err;
+
+ reg_c0 = (be32_to_cpu(cqe->sop_drop_qpn) & MLX5E_TC_FLOW_ID_MASK);
+ if (reg_c0 == MLX5_FS_DEFAULT_FLOW_TAG)
+ reg_c0 = 0;
+ reg_c1 = be32_to_cpu(cqe->ft_metadata);
+
+ if (!reg_c0)
+ return true;
+
+ priv = netdev_priv(skb->dev);
+ esw = priv->mdev->priv.eswitch;
+
+ err = mlx5_eswitch_get_chain_for_tag(esw, reg_c0, &chain);
+ if (err) {
+ netdev_dbg(priv->netdev,
+ "Couldn't find chain for chain tag: %d, err: %d\n",
+ reg_c0, err);
+ return false;
+ }
+
+ if (chain) {
+ tc_skb_ext = skb_ext_add(skb, TC_SKB_EXT);
+ if (!tc_skb_ext) {
+ WARN_ON(1);
+ return false;
+ }
+
+ tc_skb_ext->chain = chain;
+
+ tuple_id = reg_c1 & TUPLE_ID_MAX;
+
+ uplink_rpriv = mlx5_eswitch_get_uplink_priv(esw, REP_ETH);
+ uplink_priv = &uplink_rpriv->uplink_priv;
+ if (!mlx5e_tc_ct_restore_flow(uplink_priv, skb, tuple_id))
+ return false;
+ }
+
+ tunnel_moffset = mlx5e_tc_attr_to_reg_mappings[TUNNEL_TO_REG].moffset;
+ tunnel_id = reg_c1 >> (8 * tunnel_moffset);
+ return mlx5e_restore_tunnel(priv, skb, tc_priv, tunnel_id);
+#endif /* CONFIG_NET_TC_SKB_EXT */
+
+ return true;
+}
+
+void mlx5_rep_tc_post_napi_receive(struct mlx5e_tc_update_priv *tc_priv)
+{
+ if (tc_priv->tun_dev)
+ dev_put(tc_priv->tun_dev);
+}
--- /dev/null
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/* Copyright (c) 2020 Mellanox Technologies. */
+
+#ifndef __MLX5_EN_REP_TC_H__
+#define __MLX5_EN_REP_TC_H__
+
+#include <linux/skbuff.h>
+#include "en_tc.h"
+#include "en_rep.h"
+
+#if IS_ENABLED(CONFIG_MLX5_CLS_ACT)
+
+int mlx5e_rep_tc_init(struct mlx5e_rep_priv *rpriv);
+void mlx5e_rep_tc_cleanup(struct mlx5e_rep_priv *rpriv);
+
+int mlx5e_rep_tc_netdevice_event_register(struct mlx5e_rep_priv *rpriv);
+void mlx5e_rep_tc_netdevice_event_unregister(struct mlx5e_rep_priv *rpriv);
+
+void mlx5e_rep_tc_enable(struct mlx5e_priv *priv);
+void mlx5e_rep_tc_disable(struct mlx5e_priv *priv);
+
+int mlx5e_rep_tc_event_port_affinity(struct mlx5e_priv *priv);
+
+void mlx5e_rep_update_flows(struct mlx5e_priv *priv,
+ struct mlx5e_encap_entry *e,
+ bool neigh_connected,
+ unsigned char ha[ETH_ALEN]);
+
+int mlx5e_rep_encap_entry_attach(struct mlx5e_priv *priv,
+ struct mlx5e_encap_entry *e);
+void mlx5e_rep_encap_entry_detach(struct mlx5e_priv *priv,
+ struct mlx5e_encap_entry *e);
+
+int mlx5e_rep_setup_tc(struct net_device *dev, enum tc_setup_type type,
+ void *type_data);
+void mlx5e_rep_indr_clean_block_privs(struct mlx5e_rep_priv *rpriv);
+
+bool mlx5e_rep_tc_update_skb(struct mlx5_cqe64 *cqe,
+ struct sk_buff *skb,
+ struct mlx5e_tc_update_priv *tc_priv);
+void mlx5_rep_tc_post_napi_receive(struct mlx5e_tc_update_priv *tc_priv);
+
+#else /* CONFIG_MLX5_CLS_ACT */
+
+struct mlx5e_rep_priv;
+static inline int
+mlx5e_rep_tc_init(struct mlx5e_rep_priv *rpriv) { return 0; }
+static inline void
+mlx5e_rep_tc_cleanup(struct mlx5e_rep_priv *rpriv) {}
+
+static inline int
+mlx5e_rep_tc_netdevice_event_register(struct mlx5e_rep_priv *rpriv) { return 0; }
+static inline void
+mlx5e_rep_tc_netdevice_event_unregister(struct mlx5e_rep_priv *rpriv) {}
+
+static inline void
+mlx5e_rep_tc_enable(struct mlx5e_priv *priv) {}
+static inline void
+mlx5e_rep_tc_disable(struct mlx5e_priv *priv) {}
+
+static inline int
+mlx5e_rep_tc_event_port_affinity(struct mlx5e_priv *priv) { return NOTIFY_DONE; }
+
+static inline int
+mlx5e_rep_setup_tc(struct net_device *dev, enum tc_setup_type type,
+ void *type_data) { return -EOPNOTSUPP; }
+
+static inline void
+mlx5e_rep_indr_clean_block_privs(struct mlx5e_rep_priv *rpriv) {}
+
+struct mlx5e_tc_update_priv;
+static inline bool
+mlx5e_rep_tc_update_skb(struct mlx5_cqe64 *cqe,
+ struct sk_buff *skb,
+ struct mlx5e_tc_update_priv *tc_priv) { return true; }
+static inline void
+mlx5_rep_tc_post_napi_receive(struct mlx5e_tc_update_priv *tc_priv) {}
+
+#endif /* CONFIG_MLX5_CLS_ACT */
+
+#endif /* __MLX5_EN_REP_TC_H__ */
#include <net/vxlan.h>
#include <net/gre.h>
#include <net/geneve.h>
+#include <net/bareudp.h>
#include "en/tc_tun.h"
#include "en_tc.h"
+#include "rep/tc.h"
+#include "rep/neigh.h"
struct mlx5e_tc_tunnel *mlx5e_get_tc_tun(struct net_device *tunnel_dev)
{
else if (netif_is_gretap(tunnel_dev) ||
netif_is_ip6gretap(tunnel_dev))
return &gre_tunnel;
+ else if (netif_is_bareudp(tunnel_dev))
+ return &mplsoudp_tunnel;
else
return NULL;
}
}
rt = ip_route_output_key(dev_net(mirred_dev), fl4);
- ret = PTR_ERR_OR_ZERO(rt);
- if (ret)
- return ret;
+ if (IS_ERR(rt))
+ return PTR_ERR(rt);
if (mlx5_lag_is_multipath(mdev) && rt->rt_gw_family != AF_INET) {
ip_rt_put(rt);
MLX5E_TC_TUNNEL_TYPE_VXLAN,
MLX5E_TC_TUNNEL_TYPE_GENEVE,
MLX5E_TC_TUNNEL_TYPE_GRETAP,
+ MLX5E_TC_TUNNEL_TYPE_MPLSOUDP,
};
struct mlx5e_tc_tunnel {
extern struct mlx5e_tc_tunnel vxlan_tunnel;
extern struct mlx5e_tc_tunnel geneve_tunnel;
extern struct mlx5e_tc_tunnel gre_tunnel;
+extern struct mlx5e_tc_tunnel mplsoudp_tunnel;
struct mlx5e_tc_tunnel *mlx5e_get_tc_tun(struct net_device *tunnel_dev);
--- /dev/null
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/* Copyright (c) 2018 Mellanox Technologies. */
+
+#include <net/bareudp.h>
+#include <net/mpls.h>
+#include "en/tc_tun.h"
+
+static bool can_offload(struct mlx5e_priv *priv)
+{
+ return MLX5_CAP_ESW_FLOWTABLE_FDB(priv->mdev, reformat_l3_tunnel_to_l2);
+}
+
+static int calc_hlen(struct mlx5e_encap_entry *e)
+{
+ return sizeof(struct udphdr) + MPLS_HLEN;
+}
+
+static int init_encap_attr(struct net_device *tunnel_dev,
+ struct mlx5e_priv *priv,
+ struct mlx5e_encap_entry *e,
+ struct netlink_ext_ack *extack)
+{
+ e->tunnel = &mplsoudp_tunnel;
+ e->reformat_type = MLX5_REFORMAT_TYPE_L2_TO_L3_TUNNEL;
+ return 0;
+}
+
+static inline __be32 mpls_label_id_field(__be32 label, u8 tos, u8 ttl)
+{
+ u32 res;
+
+ /* mpls label is 32 bits long and construction as follows:
+ * 20 bits label
+ * 3 bits tos
+ * 1 bit bottom of stack. Since we support only one label, this bit is
+ * always set.
+ * 8 bits TTL
+ */
+ res = be32_to_cpu(label) << 12 | 1 << 8 | (tos & 7) << 9 | ttl;
+ return cpu_to_be32(res);
+}
+
+static int generate_ip_tun_hdr(char buf[],
+ __u8 *ip_proto,
+ struct mlx5e_encap_entry *r)
+{
+ const struct ip_tunnel_key *tun_key = &r->tun_info->key;
+ __be32 tun_id = tunnel_id_to_key32(tun_key->tun_id);
+ struct udphdr *udp = (struct udphdr *)(buf);
+ struct mpls_shim_hdr *mpls;
+
+ mpls = (struct mpls_shim_hdr *)(udp + 1);
+ *ip_proto = IPPROTO_UDP;
+
+ udp->dest = tun_key->tp_dst;
+ mpls->label_stack_entry = mpls_label_id_field(tun_id, tun_key->tos, tun_key->ttl);
+
+ return 0;
+}
+
+static int parse_udp_ports(struct mlx5e_priv *priv,
+ struct mlx5_flow_spec *spec,
+ struct flow_cls_offload *f,
+ void *headers_c,
+ void *headers_v)
+{
+ return mlx5e_tc_tun_parse_udp_ports(priv, spec, f, headers_c, headers_v);
+}
+
+static int parse_tunnel(struct mlx5e_priv *priv,
+ struct mlx5_flow_spec *spec,
+ struct flow_cls_offload *f,
+ void *headers_c,
+ void *headers_v)
+{
+ struct flow_rule *rule = flow_cls_offload_flow_rule(f);
+ struct flow_match_enc_keyid enc_keyid;
+ struct flow_match_mpls match;
+ void *misc2_c;
+ void *misc2_v;
+
+ misc2_c = MLX5_ADDR_OF(fte_match_param, spec->match_criteria,
+ misc_parameters_2);
+ misc2_v = MLX5_ADDR_OF(fte_match_param, spec->match_value,
+ misc_parameters_2);
+
+ if (!flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_MPLS))
+ return 0;
+
+ if (!flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_ENC_KEYID))
+ return 0;
+
+ flow_rule_match_enc_keyid(rule, &enc_keyid);
+
+ if (!enc_keyid.mask->keyid)
+ return 0;
+
+ if (!(MLX5_CAP_GEN(priv->mdev, flex_parser_protocols) &
+ MLX5_FLEX_PROTO_CW_MPLS_UDP))
+ return -EOPNOTSUPP;
+
+ flow_rule_match_mpls(rule, &match);
+
+ MLX5_SET(fte_match_set_misc2, misc2_c,
+ outer_first_mpls_over_udp.mpls_label, match.mask->mpls_label);
+ MLX5_SET(fte_match_set_misc2, misc2_v,
+ outer_first_mpls_over_udp.mpls_label, match.key->mpls_label);
+
+ MLX5_SET(fte_match_set_misc2, misc2_c,
+ outer_first_mpls_over_udp.mpls_exp, match.mask->mpls_tc);
+ MLX5_SET(fte_match_set_misc2, misc2_v,
+ outer_first_mpls_over_udp.mpls_exp, match.key->mpls_tc);
+
+ MLX5_SET(fte_match_set_misc2, misc2_c,
+ outer_first_mpls_over_udp.mpls_s_bos, match.mask->mpls_bos);
+ MLX5_SET(fte_match_set_misc2, misc2_v,
+ outer_first_mpls_over_udp.mpls_s_bos, match.key->mpls_bos);
+
+ MLX5_SET(fte_match_set_misc2, misc2_c,
+ outer_first_mpls_over_udp.mpls_ttl, match.mask->mpls_ttl);
+ MLX5_SET(fte_match_set_misc2, misc2_v,
+ outer_first_mpls_over_udp.mpls_ttl, match.key->mpls_ttl);
+ spec->match_criteria_enable |= MLX5_MATCH_MISC_PARAMETERS_2;
+
+ return 0;
+}
+
+struct mlx5e_tc_tunnel mplsoudp_tunnel = {
+ .tunnel_type = MLX5E_TC_TUNNEL_TYPE_MPLSOUDP,
+ .match_level = MLX5_MATCH_L4,
+ .can_offload = can_offload,
+ .calc_hlen = calc_hlen,
+ .init_encap_attr = init_encap_attr,
+ .generate_ip_tun_hdr = generate_ip_tun_hdr,
+ .parse_udp_ports = parse_udp_ports,
+ .parse_tunnel = parse_tunnel,
+};
*/
#include <linux/bpf_trace.h>
-#include <net/xdp_sock.h>
+#include <net/xdp_sock_drv.h>
#include "en/xdp.h"
#include "en/params.h"
xdptxd.data = xdpf->data;
xdptxd.len = xdpf->len;
- if (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY) {
+ if (xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL) {
/* The xdp_buff was in the UMEM and was copied into a newly
* allocated page. The UMEM page was returned via the ZCA, and
* this new page has to be mapped at this point and has to be
/* returns true if packet was consumed by xdp */
bool mlx5e_xdp_handle(struct mlx5e_rq *rq, struct mlx5e_dma_info *di,
- void *va, u16 *rx_headroom, u32 *len, bool xsk)
+ u32 *len, struct xdp_buff *xdp)
{
struct bpf_prog *prog = READ_ONCE(rq->xdp_prog);
- struct xdp_umem *umem = rq->umem;
- struct xdp_buff xdp;
u32 act;
int err;
if (!prog)
return false;
- xdp.data = va + *rx_headroom;
- xdp_set_data_meta_invalid(&xdp);
- xdp.data_end = xdp.data + *len;
- xdp.data_hard_start = va;
- if (xsk)
- xdp.handle = di->xsk.handle;
- xdp.rxq = &rq->xdp_rxq;
- xdp.frame_sz = rq->buff.frame0_sz;
-
- act = bpf_prog_run_xdp(prog, &xdp);
- if (xsk) {
- u64 off = xdp.data - xdp.data_hard_start;
-
- xdp.handle = xsk_umem_adjust_offset(umem, xdp.handle, off);
- }
+ act = bpf_prog_run_xdp(prog, xdp);
switch (act) {
case XDP_PASS:
- *rx_headroom = xdp.data - xdp.data_hard_start;
- *len = xdp.data_end - xdp.data;
+ *len = xdp->data_end - xdp->data;
return false;
case XDP_TX:
- if (unlikely(!mlx5e_xmit_xdp_buff(rq->xdpsq, rq, di, &xdp)))
+ if (unlikely(!mlx5e_xmit_xdp_buff(rq->xdpsq, rq, di, xdp)))
goto xdp_abort;
__set_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags); /* non-atomic */
return true;
case XDP_REDIRECT:
/* When XDP enabled then page-refcnt==1 here */
- err = xdp_do_redirect(rq->netdev, &xdp, prog);
+ err = xdp_do_redirect(rq->netdev, xdp, prog);
if (unlikely(err))
goto xdp_abort;
__set_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags);
__set_bit(MLX5E_RQ_FLAG_XDP_REDIRECT, rq->flags);
- if (!xsk)
+ if (xdp->rxq->mem.type != MEM_TYPE_XSK_BUFF_POOL)
mlx5e_page_dma_unmap(rq, di);
rq->stats->xdp_redirect++;
return true;
struct mlx5e_xsk_param;
int mlx5e_xdp_max_mtu(struct mlx5e_params *params, struct mlx5e_xsk_param *xsk);
bool mlx5e_xdp_handle(struct mlx5e_rq *rq, struct mlx5e_dma_info *di,
- void *va, u16 *rx_headroom, u32 *len, bool xsk);
+ u32 *len, struct xdp_buff *xdp);
void mlx5e_xdp_mpwqe_complete(struct mlx5e_xdpsq *sq);
bool mlx5e_poll_xdpsq_cq(struct mlx5e_cq *cq);
void mlx5e_free_xdpsq_descs(struct mlx5e_xdpsq *sq);
#include "rx.h"
#include "en/xdp.h"
-#include <net/xdp_sock.h>
+#include <net/xdp_sock_drv.h>
/* RX data path */
-bool mlx5e_xsk_pages_enough_umem(struct mlx5e_rq *rq, int count)
-{
- /* Check in advance that we have enough frames, instead of allocating
- * one-by-one, failing and moving frames to the Reuse Ring.
- */
- return xsk_umem_has_addrs_rq(rq->umem, count);
-}
-
-int mlx5e_xsk_page_alloc_umem(struct mlx5e_rq *rq,
- struct mlx5e_dma_info *dma_info)
-{
- struct xdp_umem *umem = rq->umem;
- u64 handle;
-
- if (!xsk_umem_peek_addr_rq(umem, &handle))
- return -ENOMEM;
-
- dma_info->xsk.handle = xsk_umem_adjust_offset(umem, handle,
- rq->buff.umem_headroom);
- dma_info->xsk.data = xdp_umem_get_data(umem, dma_info->xsk.handle);
-
- /* No need to add headroom to the DMA address. In striding RQ case, we
- * just provide pages for UMR, and headroom is counted at the setup
- * stage when creating a WQE. In non-striding RQ case, headroom is
- * accounted in mlx5e_alloc_rx_wqe.
- */
- dma_info->addr = xdp_umem_get_dma(umem, handle);
-
- xsk_umem_release_addr_rq(umem);
-
- dma_sync_single_for_device(rq->pdev, dma_info->addr, PAGE_SIZE,
- DMA_BIDIRECTIONAL);
-
- return 0;
-}
-
-static inline void mlx5e_xsk_recycle_frame(struct mlx5e_rq *rq, u64 handle)
-{
- xsk_umem_fq_reuse(rq->umem, handle & rq->umem->chunk_mask);
-}
-
-/* XSKRQ uses pages from UMEM, they must not be released. They are returned to
- * the userspace if possible, and if not, this function is called to reuse them
- * in the driver.
- */
-void mlx5e_xsk_page_release(struct mlx5e_rq *rq,
- struct mlx5e_dma_info *dma_info)
-{
- mlx5e_xsk_recycle_frame(rq, dma_info->xsk.handle);
-}
-
-/* Return a frame back to the hardware to fill in again. It is used by XDP when
- * the XDP program returns XDP_TX or XDP_REDIRECT not to an XSKMAP.
- */
-void mlx5e_xsk_zca_free(struct zero_copy_allocator *zca, unsigned long handle)
-{
- struct mlx5e_rq *rq = container_of(zca, struct mlx5e_rq, zca);
-
- mlx5e_xsk_recycle_frame(rq, handle);
-}
-
static struct sk_buff *mlx5e_xsk_construct_skb(struct mlx5e_rq *rq, void *data,
u32 cqe_bcnt)
{
u32 head_offset,
u32 page_idx)
{
- struct mlx5e_dma_info *di = &wi->umr.dma_info[page_idx];
- u16 rx_headroom = rq->buff.headroom - rq->buff.umem_headroom;
+ struct xdp_buff *xdp = wi->umr.dma_info[page_idx].xsk;
u32 cqe_bcnt32 = cqe_bcnt;
- void *va, *data;
- u32 frag_size;
bool consumed;
/* Check packet size. Note LRO doesn't use linear SKB */
return NULL;
}
- /* head_offset is not used in this function, because di->xsk.data and
- * di->addr point directly to the necessary place. Furthermore, in the
- * current implementation, UMR pages are mapped to XSK frames, so
+ /* head_offset is not used in this function, because xdp->data and the
+ * DMA address point directly to the necessary place. Furthermore, in
+ * the current implementation, UMR pages are mapped to XSK frames, so
* head_offset should always be 0.
*/
WARN_ON_ONCE(head_offset);
- va = di->xsk.data;
- data = va + rx_headroom;
- frag_size = rq->buff.headroom + cqe_bcnt32;
-
- dma_sync_single_for_cpu(rq->pdev, di->addr, frag_size, DMA_BIDIRECTIONAL);
- prefetch(data);
+ xdp->data_end = xdp->data + cqe_bcnt32;
+ xdp_set_data_meta_invalid(xdp);
+ xsk_buff_dma_sync_for_cpu(xdp);
+ prefetch(xdp->data);
rcu_read_lock();
- consumed = mlx5e_xdp_handle(rq, di, va, &rx_headroom, &cqe_bcnt32, true);
+ consumed = mlx5e_xdp_handle(rq, NULL, &cqe_bcnt32, xdp);
rcu_read_unlock();
/* Possible flows:
/* XDP_PASS: copy the data from the UMEM to a new SKB and reuse the
* frame. On SKB allocation failure, NULL is returned.
*/
- return mlx5e_xsk_construct_skb(rq, data, cqe_bcnt32);
+ return mlx5e_xsk_construct_skb(rq, xdp->data, cqe_bcnt32);
}
struct sk_buff *mlx5e_xsk_skb_from_cqe_linear(struct mlx5e_rq *rq,
struct mlx5e_wqe_frag_info *wi,
u32 cqe_bcnt)
{
- struct mlx5e_dma_info *di = wi->di;
- u16 rx_headroom = rq->buff.headroom - rq->buff.umem_headroom;
- void *va, *data;
+ struct xdp_buff *xdp = wi->di->xsk;
bool consumed;
- u32 frag_size;
- /* wi->offset is not used in this function, because di->xsk.data and
- * di->addr point directly to the necessary place. Furthermore, in the
- * current implementation, one page = one packet = one frame, so
+ /* wi->offset is not used in this function, because xdp->data and the
+ * DMA address point directly to the necessary place. Furthermore, the
+ * XSK allocator allocates frames per packet, instead of pages, so
* wi->offset should always be 0.
*/
WARN_ON_ONCE(wi->offset);
- va = di->xsk.data;
- data = va + rx_headroom;
- frag_size = rq->buff.headroom + cqe_bcnt;
-
- dma_sync_single_for_cpu(rq->pdev, di->addr, frag_size, DMA_BIDIRECTIONAL);
- prefetch(data);
+ xdp->data_end = xdp->data + cqe_bcnt;
+ xdp_set_data_meta_invalid(xdp);
+ xsk_buff_dma_sync_for_cpu(xdp);
+ prefetch(xdp->data);
if (unlikely(get_cqe_opcode(cqe) != MLX5_CQE_RESP_SEND)) {
rq->stats->wqe_err++;
}
rcu_read_lock();
- consumed = mlx5e_xdp_handle(rq, di, va, &rx_headroom, &cqe_bcnt, true);
+ consumed = mlx5e_xdp_handle(rq, NULL, &cqe_bcnt, xdp);
rcu_read_unlock();
if (likely(consumed))
* will be handled by mlx5e_put_rx_frag.
* On SKB allocation failure, NULL is returned.
*/
- return mlx5e_xsk_construct_skb(rq, data, cqe_bcnt);
+ return mlx5e_xsk_construct_skb(rq, xdp->data, cqe_bcnt);
}
#define __MLX5_EN_XSK_RX_H__
#include "en.h"
-#include <net/xdp_sock.h>
+#include <net/xdp_sock_drv.h>
/* RX data path */
-bool mlx5e_xsk_pages_enough_umem(struct mlx5e_rq *rq, int count);
-int mlx5e_xsk_page_alloc_umem(struct mlx5e_rq *rq,
- struct mlx5e_dma_info *dma_info);
-void mlx5e_xsk_page_release(struct mlx5e_rq *rq,
- struct mlx5e_dma_info *dma_info);
-void mlx5e_xsk_zca_free(struct zero_copy_allocator *zca, unsigned long handle);
struct sk_buff *mlx5e_xsk_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq,
struct mlx5e_mpw_info *wi,
u16 cqe_bcnt,
struct mlx5e_wqe_frag_info *wi,
u32 cqe_bcnt);
+static inline int mlx5e_xsk_page_alloc_umem(struct mlx5e_rq *rq,
+ struct mlx5e_dma_info *dma_info)
+{
+ dma_info->xsk = xsk_buff_alloc(rq->umem);
+ if (!dma_info->xsk)
+ return -ENOMEM;
+
+ /* Store the DMA address without headroom. In striding RQ case, we just
+ * provide pages for UMR, and headroom is counted at the setup stage
+ * when creating a WQE. In non-striding RQ case, headroom is accounted
+ * in mlx5e_alloc_rx_wqe.
+ */
+ dma_info->addr = xsk_buff_xdp_get_frame_dma(dma_info->xsk);
+
+ return 0;
+}
+
static inline bool mlx5e_xsk_update_rx_wakeup(struct mlx5e_rq *rq, bool alloc_err)
{
if (!xsk_umem_uses_need_wakeup(rq->umem))
#include "umem.h"
#include "en/xdp.h"
#include "en/params.h"
-#include <net/xdp_sock.h>
+#include <net/xdp_sock_drv.h>
int mlx5e_xsk_wakeup(struct net_device *dev, u32 qid, u32 flags)
{
break;
}
- xdptxd.dma_addr = xdp_umem_get_dma(umem, desc.addr);
- xdptxd.data = xdp_umem_get_data(umem, desc.addr);
+ xdptxd.dma_addr = xsk_buff_raw_get_dma(umem, desc.addr);
+ xdptxd.data = xsk_buff_raw_get_data(umem, desc.addr);
xdptxd.len = desc.len;
- dma_sync_single_for_device(sq->pdev, xdptxd.dma_addr,
- xdptxd.len, DMA_BIDIRECTIONAL);
+ xsk_buff_raw_dma_sync_for_device(umem, xdptxd.dma_addr, xdptxd.len);
if (unlikely(!sq->xmit_xdp_frame(sq, &xdptxd, &xdpi, check_result))) {
if (sq->mpwqe.wqe)
#define __MLX5_EN_XSK_TX_H__
#include "en.h"
-#include <net/xdp_sock.h>
+#include <net/xdp_sock_drv.h>
/* TX data path */
// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
/* Copyright (c) 2019 Mellanox Technologies. */
-#include <net/xdp_sock.h>
+#include <net/xdp_sock_drv.h>
#include "umem.h"
#include "setup.h"
#include "en/params.h"
struct xdp_umem *umem)
{
struct device *dev = priv->mdev->device;
- u32 i;
- for (i = 0; i < umem->npgs; i++) {
- dma_addr_t dma = dma_map_page(dev, umem->pgs[i], 0, PAGE_SIZE,
- DMA_BIDIRECTIONAL);
-
- if (unlikely(dma_mapping_error(dev, dma)))
- goto err_unmap;
- umem->pages[i].dma = dma;
- }
-
- return 0;
-
-err_unmap:
- while (i--) {
- dma_unmap_page(dev, umem->pages[i].dma, PAGE_SIZE,
- DMA_BIDIRECTIONAL);
- umem->pages[i].dma = 0;
- }
-
- return -ENOMEM;
+ return xsk_buff_dma_map(umem, dev, 0);
}
static void mlx5e_xsk_unmap_umem(struct mlx5e_priv *priv,
struct xdp_umem *umem)
{
- struct device *dev = priv->mdev->device;
- u32 i;
-
- for (i = 0; i < umem->npgs; i++) {
- dma_unmap_page(dev, umem->pages[i].dma, PAGE_SIZE,
- DMA_BIDIRECTIONAL);
- umem->pages[i].dma = 0;
- }
+ return xsk_buff_dma_unmap(umem, 0);
}
static int mlx5e_xsk_get_umems(struct mlx5e_xsk *xsk)
static bool mlx5e_xsk_is_umem_sane(struct xdp_umem *umem)
{
- return umem->headroom <= 0xffff && umem->chunk_size_nohr <= 0xffff;
+ return xsk_umem_get_headroom(umem) <= 0xffff &&
+ xsk_umem_get_chunk_size(umem) <= 0xffff;
}
void mlx5e_build_xsk_param(struct xdp_umem *umem, struct mlx5e_xsk_param *xsk)
{
- xsk->headroom = umem->headroom;
- xsk->chunk_size = umem->chunk_size_nohr + umem->headroom;
+ xsk->headroom = xsk_umem_get_headroom(umem);
+ xsk->chunk_size = xsk_umem_get_chunk_size(umem);
}
static int mlx5e_xsk_enable_locked(struct mlx5e_priv *priv,
mlx5e_xsk_disable_umem(priv, ix);
}
-int mlx5e_xsk_resize_reuseq(struct xdp_umem *umem, u32 nentries)
-{
- struct xdp_umem_fq_reuse *reuseq;
-
- reuseq = xsk_reuseq_prepare(nentries);
- if (unlikely(!reuseq))
- return -ENOMEM;
- xsk_reuseq_free(xsk_reuseq_swap(umem, reuseq));
-
- return 0;
-}
-
u16 mlx5e_xsk_first_unused_channel(struct mlx5e_params *params, struct mlx5e_xsk *xsk)
{
u16 res = xsk->refcnt ? params->num_channels : 0;
#include <linux/bpf.h>
#include <linux/if_bridge.h>
#include <net/page_pool.h>
-#include <net/xdp_sock.h>
+#include <net/xdp_sock_drv.h>
#include "eswitch.h"
#include "en.h"
#include "en/txrx.h"
struct mlx5_core_dev *mdev = c->mdev;
void *rqc = rqp->rqc;
void *rqc_wq = MLX5_ADDR_OF(rqc, rqc, wq);
- u32 num_xsk_frames = 0;
u32 rq_xdp_ix;
u32 pool_size;
int wq_sz;
rq->buff.map_dir = rq->xdp_prog ? DMA_BIDIRECTIONAL : DMA_FROM_DEVICE;
rq->buff.headroom = mlx5e_get_rq_headroom(mdev, params, xsk);
- rq->buff.umem_headroom = xsk ? xsk->headroom : 0;
pool_size = 1 << params->log_rq_mtu_frames;
switch (rq->wq_type) {
wq_sz = mlx5_wq_ll_get_size(&rq->mpwqe.wq);
- if (xsk)
- num_xsk_frames = wq_sz <<
- mlx5e_mpwqe_get_log_num_strides(mdev, params, xsk);
-
pool_size = MLX5_MPWRQ_PAGES_PER_WQE <<
mlx5e_mpwqe_get_log_rq_size(params, xsk);
wq_sz = mlx5_wq_cyc_get_size(&rq->wqe.wq);
- if (xsk)
- num_xsk_frames = wq_sz << rq->wqe.info.log_num_frags;
-
rq->wqe.info = rqp->frags_info;
rq->buff.frame0_sz = rq->wqe.info.arr[0].frag_stride;
}
if (xsk) {
- rq->buff.frame0_sz = xsk_umem_xdp_frame_sz(umem);
-
- err = mlx5e_xsk_resize_reuseq(umem, num_xsk_frames);
- if (unlikely(err)) {
- mlx5_core_err(mdev, "Unable to allocate the Reuse Ring for %u frames\n",
- num_xsk_frames);
- goto err_free;
- }
-
- rq->zca.free = mlx5e_xsk_zca_free;
err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq,
- MEM_TYPE_ZERO_COPY,
- &rq->zca);
+ MEM_TYPE_XSK_BUFF_POOL, NULL);
+ xsk_buff_set_rxq_info(rq->umem, &rq->xdp_rxq);
} else {
/* Create a page_pool and register it with rxq */
pp_params.order = 0;
return err;
}
-#ifdef CONFIG_MLX5_ESWITCH
-static int mlx5e_setup_tc_cls_flower(struct mlx5e_priv *priv,
- struct flow_cls_offload *cls_flower,
- unsigned long flags)
-{
- switch (cls_flower->command) {
- case FLOW_CLS_REPLACE:
- return mlx5e_configure_flower(priv->netdev, priv, cls_flower,
- flags);
- case FLOW_CLS_DESTROY:
- return mlx5e_delete_flower(priv->netdev, priv, cls_flower,
- flags);
- case FLOW_CLS_STATS:
- return mlx5e_stats_flower(priv->netdev, priv, cls_flower,
- flags);
- default:
- return -EOPNOTSUPP;
- }
-}
-
-static int mlx5e_setup_tc_block_cb(enum tc_setup_type type, void *type_data,
- void *cb_priv)
-{
- unsigned long flags = MLX5_TC_FLAG(INGRESS) | MLX5_TC_FLAG(NIC_OFFLOAD);
- struct mlx5e_priv *priv = cb_priv;
-
- switch (type) {
- case TC_SETUP_CLSFLOWER:
- return mlx5e_setup_tc_cls_flower(priv, type_data, flags);
- default:
- return -EOPNOTSUPP;
- }
-}
-#endif
-
static LIST_HEAD(mlx5e_block_cb_list);
static int mlx5e_setup_tc(struct net_device *dev, enum tc_setup_type type,
struct mlx5e_priv *priv = netdev_priv(dev);
switch (type) {
-#ifdef CONFIG_MLX5_ESWITCH
case TC_SETUP_BLOCK: {
struct flow_block_offload *f = type_data;
mlx5e_setup_tc_block_cb,
priv, priv, true);
}
-#endif
case TC_SETUP_QDISC_MQPRIO:
return mlx5e_setup_tc_mqprio(priv, type_data);
default:
return 0;
}
-#ifdef CONFIG_MLX5_ESWITCH
+#if IS_ENABLED(CONFIG_MLX5_CLS_ACT)
static int set_feature_tc_num_filters(struct net_device *netdev, bool enable)
{
struct mlx5e_priv *priv = netdev_priv(netdev);
err |= MLX5E_HANDLE_FEATURE(NETIF_F_LRO, set_feature_lro);
err |= MLX5E_HANDLE_FEATURE(NETIF_F_HW_VLAN_CTAG_FILTER,
set_feature_cvlan_filter);
-#ifdef CONFIG_MLX5_ESWITCH
+#if IS_ENABLED(CONFIG_MLX5_CLS_ACT)
err |= MLX5E_HANDLE_FEATURE(NETIF_F_HW_TC, set_feature_tc_num_filters);
#endif
err |= MLX5E_HANDLE_FEATURE(NETIF_F_RXALL, set_feature_rx_all);
#include <net/switchdev.h>
#include <net/pkt_cls.h>
#include <net/act_api.h>
-#include <net/netevent.h>
#include <net/arp.h>
#include <net/devlink.h>
#include <net/ipv6_stubs.h>
#include "en.h"
#include "en_rep.h"
#include "en_tc.h"
-#include "en/tc_tun.h"
+#include "en/rep/tc.h"
+#include "en/rep/neigh.h"
#include "fs_core.h"
-#include "lib/port_tun.h"
#include "lib/mlx5.h"
#define CREATE_TRACE_POINTS
#include "diag/en_rep_tracepoint.h"
static const char mlx5e_rep_driver_name[] = "mlx5e_rep";
-struct mlx5e_rep_indr_block_priv {
- struct net_device *netdev;
- struct mlx5e_rep_priv *rpriv;
-
- struct list_head list;
-};
-
-static void mlx5e_rep_indr_unregister_block(struct mlx5e_rep_priv *rpriv,
- struct net_device *netdev);
-
static void mlx5e_rep_get_drvinfo(struct net_device *dev,
struct ethtool_drvinfo *drvinfo)
{
mlx5e_sqs2vport_stop(esw, rep);
}
-static unsigned long mlx5e_rep_ipv6_interval(void)
-{
- if (IS_ENABLED(CONFIG_IPV6) && ipv6_stub->nd_tbl)
- return NEIGH_VAR(&ipv6_stub->nd_tbl->parms, DELAY_PROBE_TIME);
-
- return ~0UL;
-}
-
-static void mlx5e_rep_neigh_update_init_interval(struct mlx5e_rep_priv *rpriv)
-{
- unsigned long ipv4_interval = NEIGH_VAR(&arp_tbl.parms, DELAY_PROBE_TIME);
- unsigned long ipv6_interval = mlx5e_rep_ipv6_interval();
- struct net_device *netdev = rpriv->netdev;
- struct mlx5e_priv *priv = netdev_priv(netdev);
-
- rpriv->neigh_update.min_interval = min_t(unsigned long, ipv6_interval, ipv4_interval);
- mlx5_fc_update_sampling_interval(priv->mdev, rpriv->neigh_update.min_interval);
-}
-
-void mlx5e_rep_queue_neigh_stats_work(struct mlx5e_priv *priv)
-{
- struct mlx5e_rep_priv *rpriv = priv->ppriv;
- struct mlx5e_neigh_update_table *neigh_update = &rpriv->neigh_update;
-
- mlx5_fc_queue_stats_work(priv->mdev,
- &neigh_update->neigh_stats_work,
- neigh_update->min_interval);
-}
-
-static bool mlx5e_rep_neigh_entry_hold(struct mlx5e_neigh_hash_entry *nhe)
-{
- return refcount_inc_not_zero(&nhe->refcnt);
-}
-
-static void mlx5e_rep_neigh_entry_remove(struct mlx5e_neigh_hash_entry *nhe);
-
-static void mlx5e_rep_neigh_entry_release(struct mlx5e_neigh_hash_entry *nhe)
-{
- if (refcount_dec_and_test(&nhe->refcnt)) {
- mlx5e_rep_neigh_entry_remove(nhe);
- kfree_rcu(nhe, rcu);
- }
-}
-
-static struct mlx5e_neigh_hash_entry *
-mlx5e_get_next_nhe(struct mlx5e_rep_priv *rpriv,
- struct mlx5e_neigh_hash_entry *nhe)
-{
- struct mlx5e_neigh_hash_entry *next = NULL;
-
- rcu_read_lock();
-
- for (next = nhe ?
- list_next_or_null_rcu(&rpriv->neigh_update.neigh_list,
- &nhe->neigh_list,
- struct mlx5e_neigh_hash_entry,
- neigh_list) :
- list_first_or_null_rcu(&rpriv->neigh_update.neigh_list,
- struct mlx5e_neigh_hash_entry,
- neigh_list);
- next;
- next = list_next_or_null_rcu(&rpriv->neigh_update.neigh_list,
- &next->neigh_list,
- struct mlx5e_neigh_hash_entry,
- neigh_list))
- if (mlx5e_rep_neigh_entry_hold(next))
- break;
-
- rcu_read_unlock();
-
- if (nhe)
- mlx5e_rep_neigh_entry_release(nhe);
-
- return next;
-}
-
-static void mlx5e_rep_neigh_stats_work(struct work_struct *work)
-{
- struct mlx5e_rep_priv *rpriv = container_of(work, struct mlx5e_rep_priv,
- neigh_update.neigh_stats_work.work);
- struct net_device *netdev = rpriv->netdev;
- struct mlx5e_priv *priv = netdev_priv(netdev);
- struct mlx5e_neigh_hash_entry *nhe = NULL;
-
- rtnl_lock();
- if (!list_empty(&rpriv->neigh_update.neigh_list))
- mlx5e_rep_queue_neigh_stats_work(priv);
-
- while ((nhe = mlx5e_get_next_nhe(rpriv, nhe)) != NULL)
- mlx5e_tc_update_neigh_used_value(nhe);
-
- rtnl_unlock();
-}
-
-static void mlx5e_rep_update_flows(struct mlx5e_priv *priv,
- struct mlx5e_encap_entry *e,
- bool neigh_connected,
- unsigned char ha[ETH_ALEN])
-{
- struct ethhdr *eth = (struct ethhdr *)e->encap_header;
- struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
- bool encap_connected;
- LIST_HEAD(flow_list);
-
- ASSERT_RTNL();
-
- /* wait for encap to be fully initialized */
- wait_for_completion(&e->res_ready);
-
- mutex_lock(&esw->offloads.encap_tbl_lock);
- encap_connected = !!(e->flags & MLX5_ENCAP_ENTRY_VALID);
- if (e->compl_result < 0 || (encap_connected == neigh_connected &&
- ether_addr_equal(e->h_dest, ha)))
- goto unlock;
-
- mlx5e_take_all_encap_flows(e, &flow_list);
-
- if ((e->flags & MLX5_ENCAP_ENTRY_VALID) &&
- (!neigh_connected || !ether_addr_equal(e->h_dest, ha)))
- mlx5e_tc_encap_flows_del(priv, e, &flow_list);
-
- if (neigh_connected && !(e->flags & MLX5_ENCAP_ENTRY_VALID)) {
- ether_addr_copy(e->h_dest, ha);
- ether_addr_copy(eth->h_dest, ha);
- /* Update the encap source mac, in case that we delete
- * the flows when encap source mac changed.
- */
- ether_addr_copy(eth->h_source, e->route_dev->dev_addr);
-
- mlx5e_tc_encap_flows_add(priv, e, &flow_list);
- }
-unlock:
- mutex_unlock(&esw->offloads.encap_tbl_lock);
- mlx5e_put_encap_flow_list(priv, &flow_list);
-}
-
-static void mlx5e_rep_neigh_update(struct work_struct *work)
-{
- struct mlx5e_neigh_hash_entry *nhe =
- container_of(work, struct mlx5e_neigh_hash_entry, neigh_update_work);
- struct neighbour *n = nhe->n;
- struct mlx5e_encap_entry *e;
- unsigned char ha[ETH_ALEN];
- struct mlx5e_priv *priv;
- bool neigh_connected;
- u8 nud_state, dead;
-
- rtnl_lock();
-
- /* If these parameters are changed after we release the lock,
- * we'll receive another event letting us know about it.
- * We use this lock to avoid inconsistency between the neigh validity
- * and it's hw address.
- */
- read_lock_bh(&n->lock);
- memcpy(ha, n->ha, ETH_ALEN);
- nud_state = n->nud_state;
- dead = n->dead;
- read_unlock_bh(&n->lock);
-
- neigh_connected = (nud_state & NUD_VALID) && !dead;
-
- trace_mlx5e_rep_neigh_update(nhe, ha, neigh_connected);
-
- list_for_each_entry(e, &nhe->encap_list, encap_list) {
- if (!mlx5e_encap_take(e))
- continue;
-
- priv = netdev_priv(e->out_dev);
- mlx5e_rep_update_flows(priv, e, neigh_connected, ha);
- mlx5e_encap_put(priv, e);
- }
- mlx5e_rep_neigh_entry_release(nhe);
- rtnl_unlock();
- neigh_release(n);
-}
-
-static struct mlx5e_rep_indr_block_priv *
-mlx5e_rep_indr_block_priv_lookup(struct mlx5e_rep_priv *rpriv,
- struct net_device *netdev)
-{
- struct mlx5e_rep_indr_block_priv *cb_priv;
-
- /* All callback list access should be protected by RTNL. */
- ASSERT_RTNL();
-
- list_for_each_entry(cb_priv,
- &rpriv->uplink_priv.tc_indr_block_priv_list,
- list)
- if (cb_priv->netdev == netdev)
- return cb_priv;
-
- return NULL;
-}
-
-static void mlx5e_rep_indr_clean_block_privs(struct mlx5e_rep_priv *rpriv)
-{
- struct mlx5e_rep_indr_block_priv *cb_priv, *temp;
- struct list_head *head = &rpriv->uplink_priv.tc_indr_block_priv_list;
-
- list_for_each_entry_safe(cb_priv, temp, head, list) {
- mlx5e_rep_indr_unregister_block(rpriv, cb_priv->netdev);
- kfree(cb_priv);
- }
-}
-
-static int
-mlx5e_rep_indr_offload(struct net_device *netdev,
- struct flow_cls_offload *flower,
- struct mlx5e_rep_indr_block_priv *indr_priv,
- unsigned long flags)
-{
- struct mlx5e_priv *priv = netdev_priv(indr_priv->rpriv->netdev);
- int err = 0;
-
- switch (flower->command) {
- case FLOW_CLS_REPLACE:
- err = mlx5e_configure_flower(netdev, priv, flower, flags);
- break;
- case FLOW_CLS_DESTROY:
- err = mlx5e_delete_flower(netdev, priv, flower, flags);
- break;
- case FLOW_CLS_STATS:
- err = mlx5e_stats_flower(netdev, priv, flower, flags);
- break;
- default:
- err = -EOPNOTSUPP;
- }
-
- return err;
-}
-
-static int mlx5e_rep_indr_setup_tc_cb(enum tc_setup_type type,
- void *type_data, void *indr_priv)
-{
- unsigned long flags = MLX5_TC_FLAG(EGRESS) | MLX5_TC_FLAG(ESW_OFFLOAD);
- struct mlx5e_rep_indr_block_priv *priv = indr_priv;
-
- switch (type) {
- case TC_SETUP_CLSFLOWER:
- return mlx5e_rep_indr_offload(priv->netdev, type_data, priv,
- flags);
- default:
- return -EOPNOTSUPP;
- }
-}
-
-static int mlx5e_rep_indr_setup_ft_cb(enum tc_setup_type type,
- void *type_data, void *indr_priv)
-{
- struct mlx5e_rep_indr_block_priv *priv = indr_priv;
- struct flow_cls_offload *f = type_data;
- struct flow_cls_offload tmp;
- struct mlx5e_priv *mpriv;
- struct mlx5_eswitch *esw;
- unsigned long flags;
- int err;
-
- mpriv = netdev_priv(priv->rpriv->netdev);
- esw = mpriv->mdev->priv.eswitch;
-
- flags = MLX5_TC_FLAG(EGRESS) |
- MLX5_TC_FLAG(ESW_OFFLOAD) |
- MLX5_TC_FLAG(FT_OFFLOAD);
-
- switch (type) {
- case TC_SETUP_CLSFLOWER:
- memcpy(&tmp, f, sizeof(*f));
-
- /* Re-use tc offload path by moving the ft flow to the
- * reserved ft chain.
- *
- * FT offload can use prio range [0, INT_MAX], so we normalize
- * it to range [1, mlx5_esw_chains_get_prio_range(esw)]
- * as with tc, where prio 0 isn't supported.
- *
- * We only support chain 0 of FT offload.
- */
- if (!mlx5_esw_chains_prios_supported(esw) ||
- tmp.common.prio >= mlx5_esw_chains_get_prio_range(esw) ||
- tmp.common.chain_index)
- return -EOPNOTSUPP;
-
- tmp.common.chain_index = mlx5_esw_chains_get_ft_chain(esw);
- tmp.common.prio++;
- err = mlx5e_rep_indr_offload(priv->netdev, &tmp, priv, flags);
- memcpy(&f->stats, &tmp.stats, sizeof(f->stats));
- return err;
- default:
- return -EOPNOTSUPP;
- }
-}
-
-static void mlx5e_rep_indr_block_unbind(void *cb_priv)
-{
- struct mlx5e_rep_indr_block_priv *indr_priv = cb_priv;
-
- list_del(&indr_priv->list);
- kfree(indr_priv);
-}
-
-static LIST_HEAD(mlx5e_block_cb_list);
-
-static int
-mlx5e_rep_indr_setup_block(struct net_device *netdev,
- struct mlx5e_rep_priv *rpriv,
- struct flow_block_offload *f,
- flow_setup_cb_t *setup_cb)
-{
- struct mlx5e_rep_indr_block_priv *indr_priv;
- struct flow_block_cb *block_cb;
-
- if (f->binder_type != FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS)
- return -EOPNOTSUPP;
-
- f->unlocked_driver_cb = true;
- f->driver_block_list = &mlx5e_block_cb_list;
-
- switch (f->command) {
- case FLOW_BLOCK_BIND:
- indr_priv = mlx5e_rep_indr_block_priv_lookup(rpriv, netdev);
- if (indr_priv)
- return -EEXIST;
-
- indr_priv = kmalloc(sizeof(*indr_priv), GFP_KERNEL);
- if (!indr_priv)
- return -ENOMEM;
-
- indr_priv->netdev = netdev;
- indr_priv->rpriv = rpriv;
- list_add(&indr_priv->list,
- &rpriv->uplink_priv.tc_indr_block_priv_list);
-
- block_cb = flow_block_cb_alloc(setup_cb, indr_priv, indr_priv,
- mlx5e_rep_indr_block_unbind);
- if (IS_ERR(block_cb)) {
- list_del(&indr_priv->list);
- kfree(indr_priv);
- return PTR_ERR(block_cb);
- }
- flow_block_cb_add(block_cb, f);
- list_add_tail(&block_cb->driver_list, &mlx5e_block_cb_list);
-
- return 0;
- case FLOW_BLOCK_UNBIND:
- indr_priv = mlx5e_rep_indr_block_priv_lookup(rpriv, netdev);
- if (!indr_priv)
- return -ENOENT;
-
- block_cb = flow_block_cb_lookup(f->block, setup_cb, indr_priv);
- if (!block_cb)
- return -ENOENT;
-
- flow_block_cb_remove(block_cb, f);
- list_del(&block_cb->driver_list);
- return 0;
- default:
- return -EOPNOTSUPP;
- }
- return 0;
-}
-
-static
-int mlx5e_rep_indr_setup_cb(struct net_device *netdev, void *cb_priv,
- enum tc_setup_type type, void *type_data)
-{
- switch (type) {
- case TC_SETUP_BLOCK:
- return mlx5e_rep_indr_setup_block(netdev, cb_priv, type_data,
- mlx5e_rep_indr_setup_tc_cb);
- case TC_SETUP_FT:
- return mlx5e_rep_indr_setup_block(netdev, cb_priv, type_data,
- mlx5e_rep_indr_setup_ft_cb);
- default:
- return -EOPNOTSUPP;
- }
-}
-
-static int mlx5e_rep_indr_register_block(struct mlx5e_rep_priv *rpriv,
- struct net_device *netdev)
-{
- int err;
-
- err = __flow_indr_block_cb_register(netdev, rpriv,
- mlx5e_rep_indr_setup_cb,
- rpriv);
- if (err) {
- struct mlx5e_priv *priv = netdev_priv(rpriv->netdev);
-
- mlx5_core_err(priv->mdev, "Failed to register remote block notifier for %s err=%d\n",
- netdev_name(netdev), err);
- }
- return err;
-}
-
-static void mlx5e_rep_indr_unregister_block(struct mlx5e_rep_priv *rpriv,
- struct net_device *netdev)
-{
- __flow_indr_block_cb_unregister(netdev, mlx5e_rep_indr_setup_cb,
- rpriv);
-}
-
-static int mlx5e_nic_rep_netdevice_event(struct notifier_block *nb,
- unsigned long event, void *ptr)
-{
- struct mlx5e_rep_priv *rpriv = container_of(nb, struct mlx5e_rep_priv,
- uplink_priv.netdevice_nb);
- struct mlx5e_priv *priv = netdev_priv(rpriv->netdev);
- struct net_device *netdev = netdev_notifier_info_to_dev(ptr);
-
- if (!mlx5e_tc_tun_device_to_offload(priv, netdev) &&
- !(is_vlan_dev(netdev) && vlan_dev_real_dev(netdev) == rpriv->netdev))
- return NOTIFY_OK;
-
- switch (event) {
- case NETDEV_REGISTER:
- mlx5e_rep_indr_register_block(rpriv, netdev);
- break;
- case NETDEV_UNREGISTER:
- mlx5e_rep_indr_unregister_block(rpriv, netdev);
- break;
- }
- return NOTIFY_OK;
-}
-
-static void
-mlx5e_rep_queue_neigh_update_work(struct mlx5e_priv *priv,
- struct mlx5e_neigh_hash_entry *nhe,
- struct neighbour *n)
-{
- /* Take a reference to ensure the neighbour and mlx5 encap
- * entry won't be destructed until we drop the reference in
- * delayed work.
- */
- neigh_hold(n);
-
- /* This assignment is valid as long as the the neigh reference
- * is taken
- */
- nhe->n = n;
-
- if (!queue_work(priv->wq, &nhe->neigh_update_work)) {
- mlx5e_rep_neigh_entry_release(nhe);
- neigh_release(n);
- }
-}
-
-static struct mlx5e_neigh_hash_entry *
-mlx5e_rep_neigh_entry_lookup(struct mlx5e_priv *priv,
- struct mlx5e_neigh *m_neigh);
-
-static int mlx5e_rep_netevent_event(struct notifier_block *nb,
- unsigned long event, void *ptr)
-{
- struct mlx5e_rep_priv *rpriv = container_of(nb, struct mlx5e_rep_priv,
- neigh_update.netevent_nb);
- struct mlx5e_neigh_update_table *neigh_update = &rpriv->neigh_update;
- struct net_device *netdev = rpriv->netdev;
- struct mlx5e_priv *priv = netdev_priv(netdev);
- struct mlx5e_neigh_hash_entry *nhe = NULL;
- struct mlx5e_neigh m_neigh = {};
- struct neigh_parms *p;
- struct neighbour *n;
- bool found = false;
-
- switch (event) {
- case NETEVENT_NEIGH_UPDATE:
- n = ptr;
-#if IS_ENABLED(CONFIG_IPV6)
- if (n->tbl != ipv6_stub->nd_tbl && n->tbl != &arp_tbl)
-#else
- if (n->tbl != &arp_tbl)
-#endif
- return NOTIFY_DONE;
-
- m_neigh.dev = n->dev;
- m_neigh.family = n->ops->family;
- memcpy(&m_neigh.dst_ip, n->primary_key, n->tbl->key_len);
-
- rcu_read_lock();
- nhe = mlx5e_rep_neigh_entry_lookup(priv, &m_neigh);
- rcu_read_unlock();
- if (!nhe)
- return NOTIFY_DONE;
-
- mlx5e_rep_queue_neigh_update_work(priv, nhe, n);
- break;
-
- case NETEVENT_DELAY_PROBE_TIME_UPDATE:
- p = ptr;
-
- /* We check the device is present since we don't care about
- * changes in the default table, we only care about changes
- * done per device delay prob time parameter.
- */
-#if IS_ENABLED(CONFIG_IPV6)
- if (!p->dev || (p->tbl != ipv6_stub->nd_tbl && p->tbl != &arp_tbl))
-#else
- if (!p->dev || p->tbl != &arp_tbl)
-#endif
- return NOTIFY_DONE;
-
- rcu_read_lock();
- list_for_each_entry_rcu(nhe, &neigh_update->neigh_list,
- neigh_list) {
- if (p->dev == nhe->m_neigh.dev) {
- found = true;
- break;
- }
- }
- rcu_read_unlock();
- if (!found)
- return NOTIFY_DONE;
-
- neigh_update->min_interval = min_t(unsigned long,
- NEIGH_VAR(p, DELAY_PROBE_TIME),
- neigh_update->min_interval);
- mlx5_fc_update_sampling_interval(priv->mdev,
- neigh_update->min_interval);
- break;
- }
- return NOTIFY_DONE;
-}
-
-static const struct rhashtable_params mlx5e_neigh_ht_params = {
- .head_offset = offsetof(struct mlx5e_neigh_hash_entry, rhash_node),
- .key_offset = offsetof(struct mlx5e_neigh_hash_entry, m_neigh),
- .key_len = sizeof(struct mlx5e_neigh),
- .automatic_shrinking = true,
-};
-
-static int mlx5e_rep_neigh_init(struct mlx5e_rep_priv *rpriv)
-{
- struct mlx5e_neigh_update_table *neigh_update = &rpriv->neigh_update;
- int err;
-
- err = rhashtable_init(&neigh_update->neigh_ht, &mlx5e_neigh_ht_params);
- if (err)
- return err;
-
- INIT_LIST_HEAD(&neigh_update->neigh_list);
- mutex_init(&neigh_update->encap_lock);
- INIT_DELAYED_WORK(&neigh_update->neigh_stats_work,
- mlx5e_rep_neigh_stats_work);
- mlx5e_rep_neigh_update_init_interval(rpriv);
-
- rpriv->neigh_update.netevent_nb.notifier_call = mlx5e_rep_netevent_event;
- err = register_netevent_notifier(&rpriv->neigh_update.netevent_nb);
- if (err)
- goto out_err;
- return 0;
-
-out_err:
- rhashtable_destroy(&neigh_update->neigh_ht);
- return err;
-}
-
-static void mlx5e_rep_neigh_cleanup(struct mlx5e_rep_priv *rpriv)
-{
- struct mlx5e_neigh_update_table *neigh_update = &rpriv->neigh_update;
- struct mlx5e_priv *priv = netdev_priv(rpriv->netdev);
-
- unregister_netevent_notifier(&neigh_update->netevent_nb);
-
- flush_workqueue(priv->wq); /* flush neigh update works */
-
- cancel_delayed_work_sync(&rpriv->neigh_update.neigh_stats_work);
-
- mutex_destroy(&neigh_update->encap_lock);
- rhashtable_destroy(&neigh_update->neigh_ht);
-}
-
-static int mlx5e_rep_neigh_entry_insert(struct mlx5e_priv *priv,
- struct mlx5e_neigh_hash_entry *nhe)
-{
- struct mlx5e_rep_priv *rpriv = priv->ppriv;
- int err;
-
- err = rhashtable_insert_fast(&rpriv->neigh_update.neigh_ht,
- &nhe->rhash_node,
- mlx5e_neigh_ht_params);
- if (err)
- return err;
-
- list_add_rcu(&nhe->neigh_list, &rpriv->neigh_update.neigh_list);
-
- return err;
-}
-
-static void mlx5e_rep_neigh_entry_remove(struct mlx5e_neigh_hash_entry *nhe)
-{
- struct mlx5e_rep_priv *rpriv = nhe->priv->ppriv;
-
- mutex_lock(&rpriv->neigh_update.encap_lock);
-
- list_del_rcu(&nhe->neigh_list);
-
- rhashtable_remove_fast(&rpriv->neigh_update.neigh_ht,
- &nhe->rhash_node,
- mlx5e_neigh_ht_params);
- mutex_unlock(&rpriv->neigh_update.encap_lock);
-}
-
-/* This function must only be called under the representor's encap_lock or
- * inside rcu read lock section.
- */
-static struct mlx5e_neigh_hash_entry *
-mlx5e_rep_neigh_entry_lookup(struct mlx5e_priv *priv,
- struct mlx5e_neigh *m_neigh)
-{
- struct mlx5e_rep_priv *rpriv = priv->ppriv;
- struct mlx5e_neigh_update_table *neigh_update = &rpriv->neigh_update;
- struct mlx5e_neigh_hash_entry *nhe;
-
- nhe = rhashtable_lookup_fast(&neigh_update->neigh_ht, m_neigh,
- mlx5e_neigh_ht_params);
- return nhe && mlx5e_rep_neigh_entry_hold(nhe) ? nhe : NULL;
-}
-
-static int mlx5e_rep_neigh_entry_create(struct mlx5e_priv *priv,
- struct mlx5e_encap_entry *e,
- struct mlx5e_neigh_hash_entry **nhe)
-{
- int err;
-
- *nhe = kzalloc(sizeof(**nhe), GFP_KERNEL);
- if (!*nhe)
- return -ENOMEM;
-
- (*nhe)->priv = priv;
- memcpy(&(*nhe)->m_neigh, &e->m_neigh, sizeof(e->m_neigh));
- INIT_WORK(&(*nhe)->neigh_update_work, mlx5e_rep_neigh_update);
- spin_lock_init(&(*nhe)->encap_list_lock);
- INIT_LIST_HEAD(&(*nhe)->encap_list);
- refcount_set(&(*nhe)->refcnt, 1);
-
- err = mlx5e_rep_neigh_entry_insert(priv, *nhe);
- if (err)
- goto out_free;
- return 0;
-
-out_free:
- kfree(*nhe);
- return err;
-}
-
-int mlx5e_rep_encap_entry_attach(struct mlx5e_priv *priv,
- struct mlx5e_encap_entry *e)
-{
- struct mlx5e_rep_priv *rpriv = priv->ppriv;
- struct mlx5_rep_uplink_priv *uplink_priv = &rpriv->uplink_priv;
- struct mlx5_tun_entropy *tun_entropy = &uplink_priv->tun_entropy;
- struct mlx5e_neigh_hash_entry *nhe;
- int err;
-
- err = mlx5_tun_entropy_refcount_inc(tun_entropy, e->reformat_type);
- if (err)
- return err;
-
- mutex_lock(&rpriv->neigh_update.encap_lock);
- nhe = mlx5e_rep_neigh_entry_lookup(priv, &e->m_neigh);
- if (!nhe) {
- err = mlx5e_rep_neigh_entry_create(priv, e, &nhe);
- if (err) {
- mutex_unlock(&rpriv->neigh_update.encap_lock);
- mlx5_tun_entropy_refcount_dec(tun_entropy,
- e->reformat_type);
- return err;
- }
- }
-
- e->nhe = nhe;
- spin_lock(&nhe->encap_list_lock);
- list_add_rcu(&e->encap_list, &nhe->encap_list);
- spin_unlock(&nhe->encap_list_lock);
-
- mutex_unlock(&rpriv->neigh_update.encap_lock);
-
- return 0;
-}
-
-void mlx5e_rep_encap_entry_detach(struct mlx5e_priv *priv,
- struct mlx5e_encap_entry *e)
-{
- struct mlx5e_rep_priv *rpriv = priv->ppriv;
- struct mlx5_rep_uplink_priv *uplink_priv = &rpriv->uplink_priv;
- struct mlx5_tun_entropy *tun_entropy = &uplink_priv->tun_entropy;
-
- if (!e->nhe)
- return;
-
- spin_lock(&e->nhe->encap_list_lock);
- list_del_rcu(&e->encap_list);
- spin_unlock(&e->nhe->encap_list_lock);
-
- mlx5e_rep_neigh_entry_release(e->nhe);
- e->nhe = NULL;
- mlx5_tun_entropy_refcount_dec(tun_entropy, e->reformat_type);
-}
-
static int mlx5e_rep_open(struct net_device *dev)
{
struct mlx5e_priv *priv = netdev_priv(dev);
return ret;
}
-static int
-mlx5e_rep_setup_tc_cls_flower(struct mlx5e_priv *priv,
- struct flow_cls_offload *cls_flower, int flags)
-{
- switch (cls_flower->command) {
- case FLOW_CLS_REPLACE:
- return mlx5e_configure_flower(priv->netdev, priv, cls_flower,
- flags);
- case FLOW_CLS_DESTROY:
- return mlx5e_delete_flower(priv->netdev, priv, cls_flower,
- flags);
- case FLOW_CLS_STATS:
- return mlx5e_stats_flower(priv->netdev, priv, cls_flower,
- flags);
- default:
- return -EOPNOTSUPP;
- }
-}
-
-static
-int mlx5e_rep_setup_tc_cls_matchall(struct mlx5e_priv *priv,
- struct tc_cls_matchall_offload *ma)
-{
- switch (ma->command) {
- case TC_CLSMATCHALL_REPLACE:
- return mlx5e_tc_configure_matchall(priv, ma);
- case TC_CLSMATCHALL_DESTROY:
- return mlx5e_tc_delete_matchall(priv, ma);
- case TC_CLSMATCHALL_STATS:
- mlx5e_tc_stats_matchall(priv, ma);
- return 0;
- default:
- return -EOPNOTSUPP;
- }
-}
-
-static int mlx5e_rep_setup_tc_cb(enum tc_setup_type type, void *type_data,
- void *cb_priv)
-{
- unsigned long flags = MLX5_TC_FLAG(INGRESS) | MLX5_TC_FLAG(ESW_OFFLOAD);
- struct mlx5e_priv *priv = cb_priv;
-
- switch (type) {
- case TC_SETUP_CLSFLOWER:
- return mlx5e_rep_setup_tc_cls_flower(priv, type_data, flags);
- case TC_SETUP_CLSMATCHALL:
- return mlx5e_rep_setup_tc_cls_matchall(priv, type_data);
- default:
- return -EOPNOTSUPP;
- }
-}
-
-static int mlx5e_rep_setup_ft_cb(enum tc_setup_type type, void *type_data,
- void *cb_priv)
-{
- struct flow_cls_offload tmp, *f = type_data;
- struct mlx5e_priv *priv = cb_priv;
- struct mlx5_eswitch *esw;
- unsigned long flags;
- int err;
-
- flags = MLX5_TC_FLAG(INGRESS) |
- MLX5_TC_FLAG(ESW_OFFLOAD) |
- MLX5_TC_FLAG(FT_OFFLOAD);
- esw = priv->mdev->priv.eswitch;
-
- switch (type) {
- case TC_SETUP_CLSFLOWER:
- memcpy(&tmp, f, sizeof(*f));
-
- if (!mlx5_esw_chains_prios_supported(esw))
- return -EOPNOTSUPP;
-
- /* Re-use tc offload path by moving the ft flow to the
- * reserved ft chain.
- *
- * FT offload can use prio range [0, INT_MAX], so we normalize
- * it to range [1, mlx5_esw_chains_get_prio_range(esw)]
- * as with tc, where prio 0 isn't supported.
- *
- * We only support chain 0 of FT offload.
- */
- if (tmp.common.prio >= mlx5_esw_chains_get_prio_range(esw))
- return -EOPNOTSUPP;
- if (tmp.common.chain_index != 0)
- return -EOPNOTSUPP;
-
- tmp.common.chain_index = mlx5_esw_chains_get_ft_chain(esw);
- tmp.common.prio++;
- err = mlx5e_rep_setup_tc_cls_flower(priv, &tmp, flags);
- memcpy(&f->stats, &tmp.stats, sizeof(f->stats));
- return err;
- default:
- return -EOPNOTSUPP;
- }
-}
-
-static LIST_HEAD(mlx5e_rep_block_tc_cb_list);
-static LIST_HEAD(mlx5e_rep_block_ft_cb_list);
-static int mlx5e_rep_setup_tc(struct net_device *dev, enum tc_setup_type type,
- void *type_data)
-{
- struct mlx5e_priv *priv = netdev_priv(dev);
- struct flow_block_offload *f = type_data;
-
- f->unlocked_driver_cb = true;
-
- switch (type) {
- case TC_SETUP_BLOCK:
- return flow_block_cb_setup_simple(type_data,
- &mlx5e_rep_block_tc_cb_list,
- mlx5e_rep_setup_tc_cb,
- priv, priv, true);
- case TC_SETUP_FT:
- return flow_block_cb_setup_simple(type_data,
- &mlx5e_rep_block_ft_cb_list,
- mlx5e_rep_setup_ft_cb,
- priv, priv, true);
- default:
- return -EOPNOTSUPP;
- }
-}
-
bool mlx5e_is_uplink_rep(struct mlx5e_priv *priv)
{
struct mlx5e_rep_priv *rpriv = priv->ppriv;
priv = netdev_priv(netdev);
uplink_priv = &rpriv->uplink_priv;
- mutex_init(&uplink_priv->unready_flows_lock);
- INIT_LIST_HEAD(&uplink_priv->unready_flows);
-
- /* init shared tc flow table */
- err = mlx5e_tc_esw_init(&uplink_priv->tc_ht);
+ err = mlx5e_rep_tc_init(rpriv);
if (err)
return err;
mlx5_init_port_tun_entropy(&uplink_priv->tun_entropy, priv->mdev);
- /* init indirect block notifications */
- INIT_LIST_HEAD(&uplink_priv->tc_indr_block_priv_list);
- uplink_priv->netdevice_nb.notifier_call = mlx5e_nic_rep_netdevice_event;
- err = register_netdevice_notifier_dev_net(rpriv->netdev,
- &uplink_priv->netdevice_nb,
- &uplink_priv->netdevice_nn);
+ err = mlx5e_rep_tc_netdevice_event_register(rpriv);
if (err) {
- mlx5_core_err(priv->mdev, "Failed to register netdev notifier\n");
- goto tc_esw_cleanup;
+ mlx5_core_err(priv->mdev, "Failed to register netdev notifier, err: %d\n",
+ err);
+ goto tc_rep_cleanup;
}
return 0;
-tc_esw_cleanup:
- mlx5e_tc_esw_cleanup(&uplink_priv->tc_ht);
+tc_rep_cleanup:
+ mlx5e_rep_tc_cleanup(rpriv);
return err;
}
static void mlx5e_cleanup_uplink_rep_tx(struct mlx5e_rep_priv *rpriv)
{
- struct mlx5_rep_uplink_priv *uplink_priv = &rpriv->uplink_priv;
-
- /* clean indirect TC block notifications */
- unregister_netdevice_notifier_dev_net(rpriv->netdev,
- &uplink_priv->netdevice_nb,
- &uplink_priv->netdevice_nn);
+ mlx5e_rep_tc_netdevice_event_unregister(rpriv);
mlx5e_rep_indr_clean_block_privs(rpriv);
- /* delete shared tc flow table */
- mlx5e_tc_esw_cleanup(&rpriv->uplink_priv.tc_ht);
- mutex_destroy(&rpriv->uplink_priv.unready_flows_lock);
+ mlx5e_rep_tc_cleanup(rpriv);
}
static void mlx5e_cleanup_rep_tx(struct mlx5e_priv *priv)
return NOTIFY_OK;
}
- if (event == MLX5_DEV_EVENT_PORT_AFFINITY) {
- struct mlx5e_rep_priv *rpriv = priv->ppriv;
-
- queue_work(priv->wq, &rpriv->uplink_priv.reoffload_flows_work);
-
- return NOTIFY_OK;
- }
+ if (event == MLX5_DEV_EVENT_PORT_AFFINITY)
+ return mlx5e_rep_tc_event_port_affinity(priv);
return NOTIFY_DONE;
}
{
struct net_device *netdev = priv->netdev;
struct mlx5_core_dev *mdev = priv->mdev;
- struct mlx5e_rep_priv *rpriv = priv->ppriv;
u16 max_mtu;
netdev->min_mtu = ETH_MIN_MTU;
netdev->max_mtu = MLX5E_HW2SW_MTU(&priv->channels.params, max_mtu);
mlx5e_set_dev_port_mtu(priv);
- INIT_WORK(&rpriv->uplink_priv.reoffload_flows_work,
- mlx5e_tc_reoffload_flows_work);
+ mlx5e_rep_tc_enable(priv);
mlx5_lag_add(mdev, netdev);
priv->events_nb.notifier_call = uplink_rep_async_event;
static void mlx5e_uplink_rep_disable(struct mlx5e_priv *priv)
{
struct mlx5_core_dev *mdev = priv->mdev;
- struct mlx5e_rep_priv *rpriv = priv->ppriv;
mlx5e_dcbnl_delete_app(priv);
mlx5_notifier_unregister(mdev, &priv->events_nb);
- cancel_work_sync(&rpriv->uplink_priv.reoffload_flows_work);
+ mlx5e_rep_tc_disable(priv);
mlx5_lag_remove(mdev);
}
enum {
/* set when the encap entry is successfully offloaded into HW */
MLX5_ENCAP_ENTRY_VALID = BIT(0),
+ MLX5_REFORMAT_DECAP = BIT(1),
+};
+
+struct mlx5e_decap_key {
+ struct ethhdr key;
+};
+
+struct mlx5e_decap_entry {
+ struct mlx5e_decap_key key;
+ struct list_head flows;
+ struct hlist_node hlist;
+ refcount_t refcnt;
+ struct completion res_ready;
+ int compl_result;
+ struct mlx5_pkt_reformat *pkt_reformat;
+ struct rcu_head rcu;
};
struct mlx5e_encap_entry {
void mlx5e_handle_rx_cqe_mpwrq_rep(struct mlx5e_rq *rq,
struct mlx5_cqe64 *cqe);
-int mlx5e_rep_encap_entry_attach(struct mlx5e_priv *priv,
- struct mlx5e_encap_entry *e);
-void mlx5e_rep_encap_entry_detach(struct mlx5e_priv *priv,
- struct mlx5e_encap_entry *e);
-
void mlx5e_rep_queue_neigh_stats_work(struct mlx5e_priv *priv);
bool mlx5e_eswitch_rep(struct net_device *netdev);
#include "en_tc.h"
#include "eswitch.h"
#include "en_rep.h"
+#include "en/rep/tc.h"
#include "ipoib/ipoib.h"
#include "en_accel/ipsec_rxtx.h"
#include "en_accel/tls_rxtx.h"
* put into the Reuse Ring, because there is no way to return
* the page to the userspace when the interface goes down.
*/
- mlx5e_xsk_page_release(rq, dma_info);
+ xsk_buff_free(dma_info->xsk);
else
mlx5e_page_release_dynamic(rq, dma_info, recycle);
}
if (rq->umem) {
int pages_desired = wqe_bulk << rq->wqe.info.log_num_frags;
- if (unlikely(!mlx5e_xsk_pages_enough_umem(rq, pages_desired)))
+ /* Check in advance that we have enough frames, instead of
+ * allocating one-by-one, failing and moving frames to the
+ * Reuse Ring.
+ */
+ if (unlikely(!xsk_buff_can_alloc(rq->umem, pages_desired)))
return -ENOMEM;
}
int err;
int i;
+ /* Check in advance that we have enough frames, instead of allocating
+ * one-by-one, failing and moving frames to the Reuse Ring.
+ */
if (rq->umem &&
- unlikely(!mlx5e_xsk_pages_enough_umem(rq, MLX5_MPWRQ_PAGES_PER_WQE))) {
+ unlikely(!xsk_buff_can_alloc(rq->umem, MLX5_MPWRQ_PAGES_PER_WQE))) {
err = -ENOMEM;
goto err;
}
return skb;
}
+static void mlx5e_fill_xdp_buff(struct mlx5e_rq *rq, void *va, u16 headroom,
+ u32 len, struct xdp_buff *xdp)
+{
+ xdp->data_hard_start = va;
+ xdp_set_data_meta_invalid(xdp);
+ xdp->data = va + headroom;
+ xdp->data_end = xdp->data + len;
+ xdp->rxq = &rq->xdp_rxq;
+ xdp->frame_sz = rq->buff.frame0_sz;
+}
+
struct sk_buff *
mlx5e_skb_from_cqe_linear(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe,
struct mlx5e_wqe_frag_info *wi, u32 cqe_bcnt)
{
struct mlx5e_dma_info *di = wi->di;
u16 rx_headroom = rq->buff.headroom;
+ struct xdp_buff xdp;
struct sk_buff *skb;
void *va, *data;
bool consumed;
prefetch(data);
rcu_read_lock();
- consumed = mlx5e_xdp_handle(rq, di, va, &rx_headroom, &cqe_bcnt, false);
+ mlx5e_fill_xdp_buff(rq, va, rx_headroom, cqe_bcnt, &xdp);
+ consumed = mlx5e_xdp_handle(rq, di, &cqe_bcnt, &xdp);
rcu_read_unlock();
if (consumed)
return NULL; /* page/packet was consumed by XDP */
+ rx_headroom = xdp.data - xdp.data_hard_start;
frag_size = MLX5_SKB_FRAG_SZ(rx_headroom + cqe_bcnt);
skb = mlx5e_build_linear_skb(rq, va, frag_size, rx_headroom, cqe_bcnt);
if (unlikely(!skb))
if (rep->vlan && skb_vlan_tag_present(skb))
skb_vlan_pop(skb);
- if (!mlx5e_tc_rep_update_skb(cqe, skb, &tc_priv))
+ if (!mlx5e_rep_tc_update_skb(cqe, skb, &tc_priv))
goto free_wqe;
napi_gro_receive(rq->cq.napi, skb);
- mlx5_tc_rep_post_napi_receive(&tc_priv);
+ mlx5_rep_tc_post_napi_receive(&tc_priv);
free_wqe:
mlx5e_free_rx_wqe(rq, wi, true);
mlx5e_complete_rx_cqe(rq, cqe, cqe_bcnt, skb);
- if (!mlx5e_tc_rep_update_skb(cqe, skb, &tc_priv))
+ if (!mlx5e_rep_tc_update_skb(cqe, skb, &tc_priv))
goto mpwrq_cqe_out;
napi_gro_receive(rq->cq.napi, skb);
- mlx5_tc_rep_post_napi_receive(&tc_priv);
+ mlx5_rep_tc_post_napi_receive(&tc_priv);
mpwrq_cqe_out:
if (likely(wi->consumed_strides < rq->mpwqe.num_strides))
struct mlx5e_dma_info *di = &wi->umr.dma_info[page_idx];
u16 rx_headroom = rq->buff.headroom;
u32 cqe_bcnt32 = cqe_bcnt;
+ struct xdp_buff xdp;
struct sk_buff *skb;
void *va, *data;
u32 frag_size;
prefetch(data);
rcu_read_lock();
- consumed = mlx5e_xdp_handle(rq, di, va, &rx_headroom, &cqe_bcnt32, false);
+ mlx5e_fill_xdp_buff(rq, va, rx_headroom, cqe_bcnt32, &xdp);
+ consumed = mlx5e_xdp_handle(rq, di, &cqe_bcnt32, &xdp);
rcu_read_unlock();
if (consumed) {
if (__test_and_clear_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags))
return NULL; /* page/packet was consumed by XDP */
}
+ rx_headroom = xdp.data - xdp.data_hard_start;
frag_size = MLX5_SKB_FRAG_SZ(rx_headroom + cqe_bcnt32);
skb = mlx5e_build_linear_skb(rq, va, frag_size, rx_headroom, cqe_bcnt32);
if (unlikely(!skb))
*/
#include <net/flow_dissector.h>
+#include <net/flow_offload.h>
#include <net/sch_generic.h>
#include <net/pkt_cls.h>
#include <net/tc_act/tc_gact.h>
#include <net/tc_act/tc_tunnel_key.h>
#include <net/tc_act/tc_pedit.h>
#include <net/tc_act/tc_csum.h>
+#include <net/tc_act/tc_mpls.h>
#include <net/arp.h>
#include <net/ipv6_stubs.h>
+#include <net/bareudp.h>
#include "en.h"
#include "en_rep.h"
+#include "en/rep/tc.h"
+#include "en/rep/neigh.h"
#include "en_tc.h"
#include "eswitch.h"
#include "esw/chains.h"
MLX5E_TC_FLOW_FLAG_NOT_READY = MLX5E_TC_FLOW_BASE + 5,
MLX5E_TC_FLOW_FLAG_DELETED = MLX5E_TC_FLOW_BASE + 6,
MLX5E_TC_FLOW_FLAG_CT = MLX5E_TC_FLOW_BASE + 7,
+ MLX5E_TC_FLOW_FLAG_L3_TO_L2_DECAP = MLX5E_TC_FLOW_BASE + 8,
};
#define MLX5E_TC_MAX_SPLITS 1
u64 cookie;
unsigned long flags;
struct mlx5_flow_handle *rule[MLX5E_TC_MAX_SPLITS + 1];
+
+ /* flows sharing the same reformat object - currently mpls decap */
+ struct list_head l3_to_l2_reformat;
+ struct mlx5e_decap_entry *decap_reformat;
+
/* Flow can be associated with multiple encap IDs.
* The number of encaps is bounded by the number of supported
* destinations.
struct mlx5_flow_spec spec;
struct mlx5e_tc_mod_hdr_acts mod_hdr_acts;
int mirred_ifindex[MLX5_MAX_FLOW_FWD_VPORTS];
+ struct ethhdr eth;
};
#define MLX5E_TC_TABLE_NUM_GROUPS 4
#define MLX5E_TC_TABLE_MAX_GROUP_SIZE BIT(16)
-struct tunnel_match_key {
- struct flow_dissector_key_control enc_control;
- struct flow_dissector_key_keyid enc_key_id;
- struct flow_dissector_key_ports enc_tp;
- struct flow_dissector_key_ip enc_ip;
- union {
- struct flow_dissector_key_ipv4_addrs enc_ipv4;
- struct flow_dissector_key_ipv6_addrs enc_ipv6;
- };
-
- int filter_ifindex;
-};
-
-struct tunnel_match_enc_opts {
- struct flow_dissector_key_enc_opts key;
- struct flow_dissector_key_enc_opts mask;
-};
-
-/* Tunnel_id mapping is TUNNEL_INFO_BITS + ENC_OPTS_BITS.
- * Upper TUNNEL_INFO_BITS for general tunnel info.
- * Lower ENC_OPTS_BITS bits for enc_opts.
- */
-#define TUNNEL_INFO_BITS 6
-#define TUNNEL_INFO_BITS_MASK GENMASK(TUNNEL_INFO_BITS - 1, 0)
-#define ENC_OPTS_BITS 2
-#define ENC_OPTS_BITS_MASK GENMASK(ENC_OPTS_BITS - 1, 0)
-#define TUNNEL_ID_BITS (TUNNEL_INFO_BITS + ENC_OPTS_BITS)
-#define TUNNEL_ID_MASK GENMASK(TUNNEL_ID_BITS - 1, 0)
-
struct mlx5e_tc_attr_to_reg_mapping mlx5e_tc_attr_to_reg_mappings[] = {
[CHAIN_TO_REG] = {
.mfield = MLX5_ACTION_IN_FIELD_METADATA_REG_C_0,
struct netlink_ext_ack *extack,
struct net_device **encap_dev,
bool *encap_valid);
+static int mlx5e_attach_decap(struct mlx5e_priv *priv,
+ struct mlx5e_tc_flow *flow,
+ struct netlink_ext_ack *extack);
+static void mlx5e_detach_decap(struct mlx5e_priv *priv,
+ struct mlx5e_tc_flow *flow);
static struct mlx5_flow_handle *
mlx5e_tc_offload_fdb_rules(struct mlx5_eswitch *esw,
return -EOPNOTSUPP;
}
+ if (flow_flag_test(flow, L3_TO_L2_DECAP)) {
+ err = mlx5e_attach_decap(priv, flow, extack);
+ if (err)
+ return err;
+ }
+
for (out_index = 0; out_index < MLX5_MAX_FLOW_FWD_VPORTS; out_index++) {
int mirred_ifindex;
if (attr->action & MLX5_FLOW_CONTEXT_ACTION_COUNT)
mlx5_fc_destroy(attr->counter_dev, attr->counter);
+
+ if (flow_flag_test(flow, L3_TO_L2_DECAP))
+ mlx5e_detach_decap(priv, flow);
}
void mlx5e_tc_encap_flows_add(struct mlx5e_priv *priv,
kfree_rcu(e, rcu);
}
+static void mlx5e_decap_dealloc(struct mlx5e_priv *priv,
+ struct mlx5e_decap_entry *d)
+{
+ WARN_ON(!list_empty(&d->flows));
+
+ if (!d->compl_result)
+ mlx5_packet_reformat_dealloc(priv->mdev, d->pkt_reformat);
+
+ kfree_rcu(d, rcu);
+}
+
void mlx5e_encap_put(struct mlx5e_priv *priv, struct mlx5e_encap_entry *e)
{
struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
mlx5e_encap_dealloc(priv, e);
}
+static void mlx5e_decap_put(struct mlx5e_priv *priv, struct mlx5e_decap_entry *d)
+{
+ struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
+
+ if (!refcount_dec_and_mutex_lock(&d->refcnt, &esw->offloads.decap_tbl_lock))
+ return;
+ hash_del_rcu(&d->hlist);
+ mutex_unlock(&esw->offloads.decap_tbl_lock);
+
+ mlx5e_decap_dealloc(priv, d);
+}
+
static void mlx5e_detach_encap(struct mlx5e_priv *priv,
struct mlx5e_tc_flow *flow, int out_index)
{
mlx5e_encap_dealloc(priv, e);
}
+static void mlx5e_detach_decap(struct mlx5e_priv *priv,
+ struct mlx5e_tc_flow *flow)
+{
+ struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
+ struct mlx5e_decap_entry *d = flow->decap_reformat;
+
+ if (!d)
+ return;
+
+ mutex_lock(&esw->offloads.decap_tbl_lock);
+ list_del(&flow->l3_to_l2_reformat);
+ flow->decap_reformat = NULL;
+
+ if (!refcount_dec_and_test(&d->refcnt)) {
+ mutex_unlock(&esw->offloads.decap_tbl_lock);
+ return;
+ }
+ hash_del_rcu(&d->hlist);
+ mutex_unlock(&esw->offloads.decap_tbl_lock);
+
+ mlx5e_decap_dealloc(priv, d);
+}
+
static void __mlx5e_tc_del_fdb_peer_flow(struct mlx5e_tc_flow *flow)
{
struct mlx5_eswitch *esw = flow->priv->mdev->priv.eswitch;
return err;
}
- flow->esw_attr->action |= MLX5_FLOW_CONTEXT_ACTION_DECAP;
+ /* With mpls over udp we decapsulate using packet reformat
+ * object
+ */
+ if (!netif_is_bareudp(filter_dev))
+ flow->esw_attr->action |= MLX5_FLOW_CONTEXT_ACTION_DECAP;
}
if (!needs_mapping && !sets_mapping)
return 0;
}
+static bool skip_key_basic(struct net_device *filter_dev,
+ struct flow_cls_offload *f)
+{
+ /* When doing mpls over udp decap, the user needs to provide
+ * MPLS_UC as the protocol in order to be able to match on mpls
+ * label fields. However, the actual ethertype is IP so we want to
+ * avoid matching on this, otherwise we'll fail the match.
+ */
+ if (netif_is_bareudp(filter_dev) && f->common.chain_index == 0)
+ return true;
+
+ return false;
+}
+
static int __parse_cls_flower(struct mlx5e_priv *priv,
struct mlx5e_tc_flow *flow,
struct mlx5_flow_spec *spec,
BIT(FLOW_DISSECTOR_KEY_IP) |
BIT(FLOW_DISSECTOR_KEY_CT) |
BIT(FLOW_DISSECTOR_KEY_ENC_IP) |
- BIT(FLOW_DISSECTOR_KEY_ENC_OPTS))) {
+ BIT(FLOW_DISSECTOR_KEY_ENC_OPTS) |
+ BIT(FLOW_DISSECTOR_KEY_MPLS))) {
NL_SET_ERR_MSG_MOD(extack, "Unsupported key");
netdev_warn(priv->netdev, "Unsupported key used: 0x%x\n",
dissector->used_keys);
if (err)
return err;
- if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_BASIC)) {
+ if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_BASIC) &&
+ !skip_key_basic(filter_dev, f)) {
struct flow_match_basic match;
flow_rule_match_basic(rule, &match);
static const struct pedit_headers zero_masks = {};
-static int parse_tc_pedit_action(struct mlx5e_priv *priv,
- const struct flow_action_entry *act, int namespace,
- struct pedit_headers_action *hdrs,
- struct netlink_ext_ack *extack)
+static int
+parse_pedit_to_modify_hdr(struct mlx5e_priv *priv,
+ const struct flow_action_entry *act, int namespace,
+ struct mlx5e_tc_flow_parse_attr *parse_attr,
+ struct pedit_headers_action *hdrs,
+ struct netlink_ext_ack *extack)
{
u8 cmd = (act->id == FLOW_ACTION_MANGLE) ? 0 : 1;
int err = -EOPNOTSUPP;
return err;
}
+static int
+parse_pedit_to_reformat(struct mlx5e_priv *priv,
+ const struct flow_action_entry *act,
+ struct mlx5e_tc_flow_parse_attr *parse_attr,
+ struct netlink_ext_ack *extack)
+{
+ u32 mask, val, offset;
+ u32 *p;
+
+ if (act->id != FLOW_ACTION_MANGLE)
+ return -EOPNOTSUPP;
+
+ if (act->mangle.htype != FLOW_ACT_MANGLE_HDR_TYPE_ETH) {
+ NL_SET_ERR_MSG_MOD(extack, "Only Ethernet modification is supported");
+ return -EOPNOTSUPP;
+ }
+
+ mask = ~act->mangle.mask;
+ val = act->mangle.val;
+ offset = act->mangle.offset;
+ p = (u32 *)&parse_attr->eth;
+ *(p + (offset >> 2)) |= (val & mask);
+
+ return 0;
+}
+
+static int parse_tc_pedit_action(struct mlx5e_priv *priv,
+ const struct flow_action_entry *act, int namespace,
+ struct mlx5e_tc_flow_parse_attr *parse_attr,
+ struct pedit_headers_action *hdrs,
+ struct mlx5e_tc_flow *flow,
+ struct netlink_ext_ack *extack)
+{
+ if (flow && flow_flag_test(flow, L3_TO_L2_DECAP))
+ return parse_pedit_to_reformat(priv, act, parse_attr, extack);
+
+ return parse_pedit_to_modify_hdr(priv, act, namespace,
+ parse_attr, hdrs, extack);
+}
+
static int alloc_tc_pedit_action(struct mlx5e_priv *priv, int namespace,
struct mlx5e_tc_flow_parse_attr *parse_attr,
struct pedit_headers_action *hdrs,
return -EOPNOTSUPP;
}
- err = parse_tc_pedit_action(priv, &pedit_act, namespace, hdrs, NULL);
+ err = parse_tc_pedit_action(priv, &pedit_act, namespace, parse_attr, hdrs, NULL, extack);
*action |= MLX5_FLOW_CONTEXT_ACTION_MOD_HDR;
return err;
case FLOW_ACTION_MANGLE:
case FLOW_ACTION_ADD:
err = parse_tc_pedit_action(priv, act, MLX5_FLOW_NAMESPACE_KERNEL,
- hdrs, extack);
+ parse_attr, hdrs, NULL, extack);
if (err)
return err;
a->tc_tunnel->tunnel_type != b->tc_tunnel->tunnel_type;
}
+static inline int cmp_decap_info(struct mlx5e_decap_key *a,
+ struct mlx5e_decap_key *b)
+{
+ return memcmp(&a->key, &b->key, sizeof(b->key));
+}
+
static inline int hash_encap_info(struct encap_key *key)
{
return jhash(key->ip_tun_key, sizeof(*key->ip_tun_key),
key->tc_tunnel->tunnel_type);
}
+static inline int hash_decap_info(struct mlx5e_decap_key *key)
+{
+ return jhash(&key->key, sizeof(key->key), 0);
+}
static bool is_merged_eswitch_dev(struct mlx5e_priv *priv,
struct net_device *peer_netdev)
same_hw_devs(priv, peer_priv));
}
-
-
bool mlx5e_encap_take(struct mlx5e_encap_entry *e)
{
return refcount_inc_not_zero(&e->refcnt);
}
+static bool mlx5e_decap_take(struct mlx5e_decap_entry *e)
+{
+ return refcount_inc_not_zero(&e->refcnt);
+}
+
static struct mlx5e_encap_entry *
mlx5e_encap_get(struct mlx5e_priv *priv, struct encap_key *key,
uintptr_t hash_key)
return NULL;
}
+static struct mlx5e_decap_entry *
+mlx5e_decap_get(struct mlx5e_priv *priv, struct mlx5e_decap_key *key,
+ uintptr_t hash_key)
+{
+ struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
+ struct mlx5e_decap_key r_key;
+ struct mlx5e_decap_entry *e;
+
+ hash_for_each_possible_rcu(esw->offloads.decap_tbl, e,
+ hlist, hash_key) {
+ r_key = e->key;
+ if (!cmp_decap_info(&r_key, key) &&
+ mlx5e_decap_take(e))
+ return e;
+ }
+ return NULL;
+}
+
static struct ip_tunnel_info *dup_tun_info(const struct ip_tunnel_info *tun_info)
{
size_t tun_size = sizeof(*tun_info) + tun_info->options_len;
return err;
}
+static int mlx5e_attach_decap(struct mlx5e_priv *priv,
+ struct mlx5e_tc_flow *flow,
+ struct netlink_ext_ack *extack)
+{
+ struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
+ struct mlx5_esw_flow_attr *attr = flow->esw_attr;
+ struct mlx5e_tc_flow_parse_attr *parse_attr;
+ struct mlx5e_decap_entry *d;
+ struct mlx5e_decap_key key;
+ uintptr_t hash_key;
+ int err;
+
+ parse_attr = attr->parse_attr;
+ if (sizeof(parse_attr->eth) > MLX5_CAP_ESW(priv->mdev, max_encap_header_size)) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "encap header larger than max supported");
+ return -EOPNOTSUPP;
+ }
+
+ key.key = parse_attr->eth;
+ hash_key = hash_decap_info(&key);
+ mutex_lock(&esw->offloads.decap_tbl_lock);
+ d = mlx5e_decap_get(priv, &key, hash_key);
+ if (d) {
+ mutex_unlock(&esw->offloads.decap_tbl_lock);
+ wait_for_completion(&d->res_ready);
+ mutex_lock(&esw->offloads.decap_tbl_lock);
+ if (d->compl_result) {
+ err = -EREMOTEIO;
+ goto out_free;
+ }
+ goto found;
+ }
+
+ d = kzalloc(sizeof(*d), GFP_KERNEL);
+ if (!d) {
+ err = -ENOMEM;
+ goto out_err;
+ }
+
+ d->key = key;
+ refcount_set(&d->refcnt, 1);
+ init_completion(&d->res_ready);
+ INIT_LIST_HEAD(&d->flows);
+ hash_add_rcu(esw->offloads.decap_tbl, &d->hlist, hash_key);
+ mutex_unlock(&esw->offloads.decap_tbl_lock);
+
+ d->pkt_reformat = mlx5_packet_reformat_alloc(priv->mdev,
+ MLX5_REFORMAT_TYPE_L3_TUNNEL_TO_L2,
+ sizeof(parse_attr->eth),
+ &parse_attr->eth,
+ MLX5_FLOW_NAMESPACE_FDB);
+ if (IS_ERR(d->pkt_reformat)) {
+ err = PTR_ERR(d->pkt_reformat);
+ d->compl_result = err;
+ }
+ mutex_lock(&esw->offloads.decap_tbl_lock);
+ complete_all(&d->res_ready);
+ if (err)
+ goto out_free;
+
+found:
+ flow->decap_reformat = d;
+ attr->decap_pkt_reformat = d->pkt_reformat;
+ list_add(&flow->l3_to_l2_reformat, &d->flows);
+ mutex_unlock(&esw->offloads.decap_tbl_lock);
+ return 0;
+
+out_free:
+ mutex_unlock(&esw->offloads.decap_tbl_lock);
+ mlx5e_decap_put(priv, d);
+ return err;
+
+out_err:
+ mutex_unlock(&esw->offloads.decap_tbl_lock);
+ return err;
+}
+
static int parse_tc_vlan_action(struct mlx5e_priv *priv,
const struct flow_action_entry *act,
struct mlx5_esw_flow_attr *attr,
static int parse_tc_fdb_actions(struct mlx5e_priv *priv,
struct flow_action *flow_action,
struct mlx5e_tc_flow *flow,
- struct netlink_ext_ack *extack)
+ struct netlink_ext_ack *extack,
+ struct net_device *filter_dev)
{
struct pedit_headers_action hdrs[2] = {};
struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
bool encap = false, decap = false;
u32 action = attr->action;
int err, i, if_count = 0;
+ bool mpls_push = false;
if (!flow_action_has_entries(flow_action))
return -EINVAL;
action |= MLX5_FLOW_CONTEXT_ACTION_DROP |
MLX5_FLOW_CONTEXT_ACTION_COUNT;
break;
+ case FLOW_ACTION_MPLS_PUSH:
+ if (!MLX5_CAP_ESW_FLOWTABLE_FDB(priv->mdev,
+ reformat_l2_to_l3_tunnel) ||
+ act->mpls_push.proto != htons(ETH_P_MPLS_UC)) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "mpls push is supported only for mpls_uc protocol");
+ return -EOPNOTSUPP;
+ }
+ mpls_push = true;
+ break;
+ case FLOW_ACTION_MPLS_POP:
+ /* we only support mpls pop if it is the first action
+ * and the filter net device is bareudp. Subsequent
+ * actions can be pedit and the last can be mirred
+ * egress redirect.
+ */
+ if (i) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "mpls pop supported only as first action");
+ return -EOPNOTSUPP;
+ }
+ if (!netif_is_bareudp(filter_dev)) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "mpls pop supported only on bareudp devices");
+ return -EOPNOTSUPP;
+ }
+
+ parse_attr->eth.h_proto = act->mpls_pop.proto;
+ action |= MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT;
+ flow_flag_set(flow, L3_TO_L2_DECAP);
+ break;
case FLOW_ACTION_MANGLE:
case FLOW_ACTION_ADD:
err = parse_tc_pedit_action(priv, act, MLX5_FLOW_NAMESPACE_FDB,
- hdrs, extack);
+ parse_attr, hdrs, flow, extack);
if (err)
return err;
- action |= MLX5_FLOW_CONTEXT_ACTION_MOD_HDR;
- attr->split_count = attr->out_count;
+ if (!flow_flag_test(flow, L3_TO_L2_DECAP)) {
+ action |= MLX5_FLOW_CONTEXT_ACTION_MOD_HDR;
+ attr->split_count = attr->out_count;
+ }
break;
case FLOW_ACTION_CSUM:
if (csum_offload_supported(priv, action,
return -EINVAL;
}
+ if (mpls_push && !netif_is_bareudp(out_dev)) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "mpls is supported only through a bareudp device");
+ return -EOPNOTSUPP;
+ }
+
if (ft_flow && out_dev == priv->netdev) {
/* Ignore forward to self rules generated
* by adding both mlx5 devs to the flow table
INIT_LIST_HEAD(&flow->encaps[out_index].list);
INIT_LIST_HEAD(&flow->mod_hdr);
INIT_LIST_HEAD(&flow->hairpin);
+ INIT_LIST_HEAD(&flow->l3_to_l2_reformat);
refcount_set(&flow->refcnt, 1);
init_completion(&flow->init_done);
if (err)
goto err_free;
- err = parse_tc_fdb_actions(priv, &rule->action, flow, extack);
+ err = parse_tc_fdb_actions(priv, &rule->action, flow, extack, filter_dev);
if (err)
goto err_free;
mutex_unlock(&rpriv->unready_flows_lock);
}
-#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
-static bool mlx5e_restore_tunnel(struct mlx5e_priv *priv, struct sk_buff *skb,
- struct mlx5e_tc_update_priv *tc_priv,
- u32 tunnel_id)
-{
- struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
- struct tunnel_match_enc_opts enc_opts = {};
- struct mlx5_rep_uplink_priv *uplink_priv;
- struct mlx5e_rep_priv *uplink_rpriv;
- struct metadata_dst *tun_dst;
- struct tunnel_match_key key;
- u32 tun_id, enc_opts_id;
- struct net_device *dev;
- int err;
-
- enc_opts_id = tunnel_id & ENC_OPTS_BITS_MASK;
- tun_id = tunnel_id >> ENC_OPTS_BITS;
-
- if (!tun_id)
- return true;
-
- uplink_rpriv = mlx5_eswitch_get_uplink_priv(esw, REP_ETH);
- uplink_priv = &uplink_rpriv->uplink_priv;
-
- err = mapping_find(uplink_priv->tunnel_mapping, tun_id, &key);
- if (err) {
- WARN_ON_ONCE(true);
- netdev_dbg(priv->netdev,
- "Couldn't find tunnel for tun_id: %d, err: %d\n",
- tun_id, err);
- return false;
- }
-
- if (enc_opts_id) {
- err = mapping_find(uplink_priv->tunnel_enc_opts_mapping,
- enc_opts_id, &enc_opts);
- if (err) {
- netdev_dbg(priv->netdev,
- "Couldn't find tunnel (opts) for tun_id: %d, err: %d\n",
- enc_opts_id, err);
- return false;
- }
- }
-
- tun_dst = tun_rx_dst(enc_opts.key.len);
- if (!tun_dst) {
- WARN_ON_ONCE(true);
- return false;
- }
-
- ip_tunnel_key_init(&tun_dst->u.tun_info.key,
- key.enc_ipv4.src, key.enc_ipv4.dst,
- key.enc_ip.tos, key.enc_ip.ttl,
- 0, /* label */
- key.enc_tp.src, key.enc_tp.dst,
- key32_to_tunnel_id(key.enc_key_id.keyid),
- TUNNEL_KEY);
-
- if (enc_opts.key.len)
- ip_tunnel_info_opts_set(&tun_dst->u.tun_info,
- enc_opts.key.data,
- enc_opts.key.len,
- enc_opts.key.dst_opt_type);
-
- skb_dst_set(skb, (struct dst_entry *)tun_dst);
- dev = dev_get_by_index(&init_net, key.filter_ifindex);
- if (!dev) {
- netdev_dbg(priv->netdev,
- "Couldn't find tunnel device with ifindex: %d\n",
- key.filter_ifindex);
- return false;
+static int mlx5e_setup_tc_cls_flower(struct mlx5e_priv *priv,
+ struct flow_cls_offload *cls_flower,
+ unsigned long flags)
+{
+ switch (cls_flower->command) {
+ case FLOW_CLS_REPLACE:
+ return mlx5e_configure_flower(priv->netdev, priv, cls_flower,
+ flags);
+ case FLOW_CLS_DESTROY:
+ return mlx5e_delete_flower(priv->netdev, priv, cls_flower,
+ flags);
+ case FLOW_CLS_STATS:
+ return mlx5e_stats_flower(priv->netdev, priv, cls_flower,
+ flags);
+ default:
+ return -EOPNOTSUPP;
}
-
- /* Set tun_dev so we do dev_put() after datapath */
- tc_priv->tun_dev = dev;
-
- skb->dev = dev;
-
- return true;
}
-#endif /* CONFIG_NET_TC_SKB_EXT */
-bool mlx5e_tc_rep_update_skb(struct mlx5_cqe64 *cqe,
- struct sk_buff *skb,
- struct mlx5e_tc_update_priv *tc_priv)
+int mlx5e_setup_tc_block_cb(enum tc_setup_type type, void *type_data,
+ void *cb_priv)
{
-#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
- u32 chain = 0, reg_c0, reg_c1, tunnel_id, tuple_id;
- struct mlx5_rep_uplink_priv *uplink_priv;
- struct mlx5e_rep_priv *uplink_rpriv;
- struct tc_skb_ext *tc_skb_ext;
- struct mlx5_eswitch *esw;
- struct mlx5e_priv *priv;
- int tunnel_moffset;
- int err;
-
- reg_c0 = (be32_to_cpu(cqe->sop_drop_qpn) & MLX5E_TC_FLOW_ID_MASK);
- if (reg_c0 == MLX5_FS_DEFAULT_FLOW_TAG)
- reg_c0 = 0;
- reg_c1 = be32_to_cpu(cqe->ft_metadata);
-
- if (!reg_c0)
- return true;
-
- priv = netdev_priv(skb->dev);
- esw = priv->mdev->priv.eswitch;
-
- err = mlx5_eswitch_get_chain_for_tag(esw, reg_c0, &chain);
- if (err) {
- netdev_dbg(priv->netdev,
- "Couldn't find chain for chain tag: %d, err: %d\n",
- reg_c0, err);
- return false;
- }
-
- if (chain) {
- tc_skb_ext = skb_ext_add(skb, TC_SKB_EXT);
- if (!tc_skb_ext) {
- WARN_ON(1);
- return false;
- }
-
- tc_skb_ext->chain = chain;
+ unsigned long flags = MLX5_TC_FLAG(INGRESS) | MLX5_TC_FLAG(NIC_OFFLOAD);
+ struct mlx5e_priv *priv = cb_priv;
- tuple_id = reg_c1 & TUPLE_ID_MAX;
-
- uplink_rpriv = mlx5_eswitch_get_uplink_priv(esw, REP_ETH);
- uplink_priv = &uplink_rpriv->uplink_priv;
- if (!mlx5e_tc_ct_restore_flow(uplink_priv, skb, tuple_id))
- return false;
+ switch (type) {
+ case TC_SETUP_CLSFLOWER:
+ return mlx5e_setup_tc_cls_flower(priv, type_data, flags);
+ default:
+ return -EOPNOTSUPP;
}
-
- tunnel_moffset = mlx5e_tc_attr_to_reg_mappings[TUNNEL_TO_REG].moffset;
- tunnel_id = reg_c1 >> (8 * tunnel_moffset);
- return mlx5e_restore_tunnel(priv, skb, tc_priv, tunnel_id);
-#endif /* CONFIG_NET_TC_SKB_EXT */
-
- return true;
-}
-
-void mlx5_tc_rep_post_napi_receive(struct mlx5e_tc_update_priv *tc_priv)
-{
- if (tc_priv->tun_dev)
- dev_put(tc_priv->tun_dev);
}
#define __MLX5_EN_TC_H__
#include <net/pkt_cls.h>
+#include "en.h"
#define MLX5E_TC_FLOW_ID_MASK 0x0000ffff
#ifdef CONFIG_MLX5_ESWITCH
+struct tunnel_match_key {
+ struct flow_dissector_key_control enc_control;
+ struct flow_dissector_key_keyid enc_key_id;
+ struct flow_dissector_key_ports enc_tp;
+ struct flow_dissector_key_ip enc_ip;
+ union {
+ struct flow_dissector_key_ipv4_addrs enc_ipv4;
+ struct flow_dissector_key_ipv6_addrs enc_ipv6;
+ };
+
+ int filter_ifindex;
+};
+
+struct tunnel_match_enc_opts {
+ struct flow_dissector_key_enc_opts key;
+ struct flow_dissector_key_enc_opts mask;
+};
+
+/* Tunnel_id mapping is TUNNEL_INFO_BITS + ENC_OPTS_BITS.
+ * Upper TUNNEL_INFO_BITS for general tunnel info.
+ * Lower ENC_OPTS_BITS bits for enc_opts.
+ */
+#define TUNNEL_INFO_BITS 6
+#define TUNNEL_INFO_BITS_MASK GENMASK(TUNNEL_INFO_BITS - 1, 0)
+#define ENC_OPTS_BITS 2
+#define ENC_OPTS_BITS_MASK GENMASK(ENC_OPTS_BITS - 1, 0)
+#define TUNNEL_ID_BITS (TUNNEL_INFO_BITS + ENC_OPTS_BITS)
+#define TUNNEL_ID_MASK GENMASK(TUNNEL_ID_BITS - 1, 0)
+
enum {
MLX5E_TC_FLAG_INGRESS_BIT,
MLX5E_TC_FLAG_EGRESS_BIT,
#define MLX5_TC_FLAG(flag) BIT(MLX5E_TC_FLAG_##flag##_BIT)
-int mlx5e_tc_nic_init(struct mlx5e_priv *priv);
-void mlx5e_tc_nic_cleanup(struct mlx5e_priv *priv);
-
int mlx5e_tc_esw_init(struct rhashtable *tc_ht);
void mlx5e_tc_esw_cleanup(struct rhashtable *tc_ht);
struct net_device *tun_dev;
};
-bool mlx5e_tc_rep_update_skb(struct mlx5_cqe64 *cqe, struct sk_buff *skb,
- struct mlx5e_tc_update_priv *tc_priv);
-
-void mlx5_tc_rep_post_napi_receive(struct mlx5e_tc_update_priv *tc_priv);
-
struct mlx5e_tc_mod_hdr_acts {
int num_actions;
int max_actions;
struct mlx5e_tc_flow;
u32 mlx5e_tc_get_flow_tun_id(struct mlx5e_tc_flow *flow);
+#if IS_ENABLED(CONFIG_MLX5_CLS_ACT)
+
+int mlx5e_tc_nic_init(struct mlx5e_priv *priv);
+void mlx5e_tc_nic_cleanup(struct mlx5e_priv *priv);
+
+int mlx5e_setup_tc_block_cb(enum tc_setup_type type, void *type_data,
+ void *cb_priv);
+
+#else /* CONFIG_MLX5_CLS_ACT */
+static inline int mlx5e_tc_nic_init(struct mlx5e_priv *priv) { return 0; }
+static inline void mlx5e_tc_nic_cleanup(struct mlx5e_priv *priv) {}
+static inline int
+mlx5e_setup_tc_block_cb(enum tc_setup_type type, void *type_data, void *cb_priv)
+{ return -EOPNOTSUPP; }
+#endif /* CONFIG_MLX5_CLS_ACT */
+
#else /* CONFIG_MLX5_ESWITCH */
static inline int mlx5e_tc_nic_init(struct mlx5e_priv *priv) { return 0; }
static inline void mlx5e_tc_nic_cleanup(struct mlx5e_priv *priv) {}
{
return 0;
}
+
+static inline int
+mlx5e_setup_tc_block_cb(enum tc_setup_type type, void *type_data, void *cb_priv)
+{ return -EOPNOTSUPP; }
#endif
#endif /* __MLX5_EN_TC_H__ */
#include "eswitch.h"
+#if IS_ENABLED(CONFIG_MLX5_CLS_ACT)
+
bool
mlx5_esw_chains_prios_supported(struct mlx5_eswitch *esw);
bool
int
mlx5_eswitch_get_chain_for_tag(struct mlx5_eswitch *esw, u32 tag, u32 *chain);
+#else /* CONFIG_MLX5_CLS_ACT */
+
+static inline struct mlx5_flow_table *
+mlx5_esw_chains_get_table(struct mlx5_eswitch *esw, u32 chain, u32 prio,
+ u32 level) { return ERR_PTR(-EOPNOTSUPP); }
+static inline void
+mlx5_esw_chains_put_table(struct mlx5_eswitch *esw, u32 chain, u32 prio,
+ u32 level) {}
+
+static inline struct mlx5_flow_table *
+mlx5_esw_chains_get_tc_end_ft(struct mlx5_eswitch *esw) { return ERR_PTR(-EOPNOTSUPP); }
+
+static inline int mlx5_esw_chains_create(struct mlx5_eswitch *esw) { return 0; }
+static inline void mlx5_esw_chains_destroy(struct mlx5_eswitch *esw) {}
+
+#endif /* CONFIG_MLX5_CLS_ACT */
+
#endif /* __ML5_ESW_CHAINS_H__ */
hash_init(esw->offloads.encap_tbl);
mutex_init(&esw->offloads.mod_hdr.lock);
hash_init(esw->offloads.mod_hdr.hlist);
+ mutex_init(&esw->offloads.decap_tbl_lock);
+ hash_init(esw->offloads.decap_tbl);
atomic64_set(&esw->offloads.num_flows, 0);
mutex_init(&esw->state_lock);
mutex_init(&esw->mode_lock);
mutex_destroy(&esw->state_lock);
mutex_destroy(&esw->offloads.mod_hdr.lock);
mutex_destroy(&esw->offloads.encap_tbl_lock);
+ mutex_destroy(&esw->offloads.decap_tbl_lock);
kfree(esw->vports);
kfree(esw);
}
struct mutex peer_mutex;
struct mutex encap_tbl_lock; /* protects encap_tbl */
DECLARE_HASHTABLE(encap_tbl, 8);
+ struct mutex decap_tbl_lock; /* protects decap_tbl */
+ DECLARE_HASHTABLE(decap_tbl, 8);
struct mod_hdr_tbl mod_hdr;
DECLARE_HASHTABLE(termtbl_tbl, 8);
struct mutex termtbl_mutex; /* protects termtbl hash */
struct mlx5_flow_table *fdb;
struct mlx5_flow_table *dest_ft;
struct mlx5_ct_attr ct_attr;
+ struct mlx5_pkt_reformat *decap_pkt_reformat;
struct mlx5e_tc_flow_parse_attr *parse_attr;
};
}
}
}
+
+ if (attr->decap_pkt_reformat)
+ flow_act.pkt_reformat = attr->decap_pkt_reformat;
+
if (flow_act.action & MLX5_FLOW_CONTEXT_ACTION_COUNT) {
dest[i].type = MLX5_FLOW_DESTINATION_TYPE_COUNTER;
dest[i].counter_id = mlx5_fc_id(attr->counter);
static void mlx5_esw_offloads_unpair(struct mlx5_eswitch *esw)
{
+#if IS_ENABLED(CONFIG_MLX5_CLS_ACT)
mlx5e_tc_clean_fdb_peer_flows(esw);
+#endif
esw_del_fdb_peer_miss_rules(esw);
}
int mlx5_tun_entropy_refcount_inc(struct mlx5_tun_entropy *tun_entropy,
int reformat_type)
{
- /* the default is error for unknown (non VXLAN/GRE tunnel types) */
int err = -EOPNOTSUPP;
mutex_lock(&tun_entropy->lock);
- if (reformat_type == MLX5_REFORMAT_TYPE_L2_TO_VXLAN &&
+ if ((reformat_type == MLX5_REFORMAT_TYPE_L2_TO_VXLAN ||
+ reformat_type == MLX5_REFORMAT_TYPE_L2_TO_L3_TUNNEL) &&
tun_entropy->enabled) {
/* in case entropy calculation is enabled for all tunneling
* types, it is ok for VXLAN, so approve.
return -EOPNOTSUPP;
act = flow_action_first_entry_get(flow_action);
- if (act->hw_stats == FLOW_ACTION_HW_STATS_ANY ||
- act->hw_stats == FLOW_ACTION_HW_STATS_IMMEDIATE) {
+ if (act->hw_stats & FLOW_ACTION_HW_STATS_DISABLED) {
+ /* Nothing to do */
+ } else if (act->hw_stats & FLOW_ACTION_HW_STATS_IMMEDIATE) {
/* Count action is inserted first */
err = mlxsw_sp_acl_rulei_act_count(mlxsw_sp, rulei, extack);
if (err)
return err;
- } else if (act->hw_stats != FLOW_ACTION_HW_STATS_DISABLED &&
- act->hw_stats != FLOW_ACTION_HW_STATS_DONT_CARE) {
+ } else {
NL_SET_ERR_MSG_MOD(extack, "Unsupported action HW stats type");
return -EOPNOTSUPP;
}
xdp->data_end = xdp->data + len;
xdp->rxq = &nvchan->xdp_rxq;
xdp->frame_sz = PAGE_SIZE;
- xdp->handle = 0;
memcpy(xdp->data, data, len);
#define ATH8031_PHY_ID 0x004dd074
#define ATH8032_PHY_ID 0x004dd023
#define ATH8035_PHY_ID 0x004dd072
-#define AT803X_PHY_ID_MASK 0xffffffef
+#define AT8030_PHY_ID_MASK 0xffffffef
MODULE_DESCRIPTION("Qualcomm Atheros AR803x PHY driver");
MODULE_AUTHOR("Matus Ujhelyi");
static struct phy_driver at803x_driver[] = {
{
/* Qualcomm Atheros AR8035 */
- .phy_id = ATH8035_PHY_ID,
+ PHY_ID_MATCH_EXACT(ATH8035_PHY_ID),
.name = "Qualcomm Atheros AR8035",
- .phy_id_mask = AT803X_PHY_ID_MASK,
.flags = PHY_POLL_CABLE_TEST,
.probe = at803x_probe,
.remove = at803x_remove,
/* Qualcomm Atheros AR8030 */
.phy_id = ATH8030_PHY_ID,
.name = "Qualcomm Atheros AR8030",
- .phy_id_mask = AT803X_PHY_ID_MASK,
+ .phy_id_mask = AT8030_PHY_ID_MASK,
.probe = at803x_probe,
.remove = at803x_remove,
.config_init = at803x_config_init,
.config_intr = at803x_config_intr,
}, {
/* Qualcomm Atheros AR8031/AR8033 */
- .phy_id = ATH8031_PHY_ID,
+ PHY_ID_MATCH_EXACT(ATH8031_PHY_ID),
.name = "Qualcomm Atheros AR8031/AR8033",
- .phy_id_mask = AT803X_PHY_ID_MASK,
.flags = PHY_POLL_CABLE_TEST,
.probe = at803x_probe,
.remove = at803x_remove,
module_phy_driver(at803x_driver);
static struct mdio_device_id __maybe_unused atheros_tbl[] = {
- { ATH8030_PHY_ID, AT803X_PHY_ID_MASK },
- { ATH8031_PHY_ID, AT803X_PHY_ID_MASK },
+ { ATH8030_PHY_ID, AT8030_PHY_ID_MASK },
+ { PHY_ID_MATCH_EXACT(ATH8031_PHY_ID) },
{ PHY_ID_MATCH_EXACT(ATH8032_PHY_ID) },
- { ATH8035_PHY_ID, AT803X_PHY_ID_MASK },
+ { PHY_ID_MATCH_EXACT(ATH8035_PHY_ID) },
{ PHY_ID_MATCH_EXACT(ATH9331_PHY_ID) },
{ }
};
#define DP83869_RGMII_RX_CLK_DELAY_EN BIT(0)
/* STRAP_STS1 bits */
+#define DP83869_STRAP_OP_MODE_MASK GENMASK(2, 0)
#define DP83869_STRAP_STS1_RESERVED BIT(11)
+#define DP83869_STRAP_MIRROR_ENABLED BIT(12)
/* PHYCTRL bits */
#define DP83869_RX_FIFO_SHIFT 12
DP83869_CFG3_PORT_MIRROR_EN);
}
+static int dp83869_set_strapped_mode(struct phy_device *phydev)
+{
+ struct dp83869_private *dp83869 = phydev->priv;
+ int val;
+
+ val = phy_read_mmd(phydev, DP83869_DEVADDR, DP83869_STRAP_STS1);
+ if (val < 0)
+ return val;
+
+ dp83869->mode = val & DP83869_STRAP_OP_MODE_MASK;
+
+ return 0;
+}
+
#ifdef CONFIG_OF_MDIO
static int dp83869_of_init(struct phy_device *phydev)
{
if (dp83869->mode < DP83869_RGMII_COPPER_ETHERNET ||
dp83869->mode > DP83869_SGMII_COPPER_ETHERNET)
return -EINVAL;
+ } else {
+ ret = dp83869_set_strapped_mode(phydev);
+ if (ret)
+ return ret;
}
if (of_property_read_bool(of_node, "ti,max-output-impedance"))
else if (of_property_read_bool(of_node, "ti,min-output-impedance"))
dp83869->io_impedance = DP83869_IO_MUX_CFG_IO_IMPEDANCE_MIN;
- if (of_property_read_bool(of_node, "enet-phy-lane-swap"))
+ if (of_property_read_bool(of_node, "enet-phy-lane-swap")) {
dp83869->port_mirroring = DP83869_PORT_MIRRORING_EN;
- else
- dp83869->port_mirroring = DP83869_PORT_MIRRORING_DIS;
+ } else {
+ /* If the lane swap is not in the DT then check the straps */
+ ret = phy_read_mmd(phydev, DP83869_DEVADDR, DP83869_STRAP_STS1);
+ if (ret < 0)
+ return ret;
+ if (ret & DP83869_STRAP_MIRROR_ENABLED)
+ dp83869->port_mirroring = DP83869_PORT_MIRRORING_EN;
+ else
+ dp83869->port_mirroring = DP83869_PORT_MIRRORING_DIS;
+ }
if (of_property_read_u32(of_node, "rx-fifo-depth",
&dp83869->rx_fifo_depth))
#else
static int dp83869_of_init(struct phy_device *phydev)
{
- return 0;
+ return dp83869_set_strapped_mode(phydev);
}
#endif /* CONFIG_OF_MDIO */
#include <net/netns/generic.h>
#include <net/tun_proto.h>
#include <net/vxlan.h>
+#include <net/nexthop.h>
#if IS_ENABLED(CONFIG_IPV6)
#include <net/ip6_tunnel.h>
u16 state; /* see ndm_state */
__be32 vni;
u16 flags; /* see ndm_flags and below */
+ struct list_head nh_list;
+ struct nexthop __rcu *nh;
+ struct vxlan_dev *vdev;
};
#define NTF_VXLAN_ADDED_BY_USER 0x100
*/
static inline struct vxlan_rdst *first_remote_rcu(struct vxlan_fdb *fdb)
{
+ if (rcu_access_pointer(fdb->nh))
+ return NULL;
return list_entry_rcu(fdb->remotes.next, struct vxlan_rdst, list);
}
static inline struct vxlan_rdst *first_remote_rtnl(struct vxlan_fdb *fdb)
{
+ if (rcu_access_pointer(fdb->nh))
+ return NULL;
return list_first_entry(&fdb->remotes, struct vxlan_rdst, list);
}
{
unsigned long now = jiffies;
struct nda_cacheinfo ci;
+ bool send_ip, send_eth;
struct nlmsghdr *nlh;
+ struct nexthop *nh;
struct ndmsg *ndm;
- bool send_ip, send_eth;
nlh = nlmsg_put(skb, portid, seq, type, sizeof(*ndm), flags);
if (nlh == NULL)
send_eth = send_ip = true;
+ nh = rcu_dereference_rtnl(fdb->nh);
if (type == RTM_GETNEIGH) {
- send_ip = !vxlan_addr_any(&rdst->remote_ip);
+ if (rdst) {
+ send_ip = !vxlan_addr_any(&rdst->remote_ip);
+ ndm->ndm_family = send_ip ? rdst->remote_ip.sa.sa_family : AF_INET;
+ } else if (nh) {
+ ndm->ndm_family = nexthop_get_family(nh);
+ }
send_eth = !is_zero_ether_addr(fdb->eth_addr);
- ndm->ndm_family = send_ip ? rdst->remote_ip.sa.sa_family : AF_INET;
} else
ndm->ndm_family = AF_BRIDGE;
ndm->ndm_state = fdb->state;
ndm->ndm_ifindex = vxlan->dev->ifindex;
ndm->ndm_flags = fdb->flags;
- if (rdst->offloaded)
+ if (rdst && rdst->offloaded)
ndm->ndm_flags |= NTF_OFFLOADED;
ndm->ndm_type = RTN_UNICAST;
if (send_eth && nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->eth_addr))
goto nla_put_failure;
+ if (nh) {
+ if (nla_put_u32(skb, NDA_NH_ID, nh->id))
+ goto nla_put_failure;
+ } else if (rdst) {
+ if (send_ip && vxlan_nla_put_addr(skb, NDA_DST,
+ &rdst->remote_ip))
+ goto nla_put_failure;
+
+ if (rdst->remote_port &&
+ rdst->remote_port != vxlan->cfg.dst_port &&
+ nla_put_be16(skb, NDA_PORT, rdst->remote_port))
+ goto nla_put_failure;
+ if (rdst->remote_vni != vxlan->default_dst.remote_vni &&
+ nla_put_u32(skb, NDA_VNI, be32_to_cpu(rdst->remote_vni)))
+ goto nla_put_failure;
+ if (rdst->remote_ifindex &&
+ nla_put_u32(skb, NDA_IFINDEX, rdst->remote_ifindex))
+ goto nla_put_failure;
+ }
- if (send_ip && vxlan_nla_put_addr(skb, NDA_DST, &rdst->remote_ip))
- goto nla_put_failure;
-
- if (rdst->remote_port && rdst->remote_port != vxlan->cfg.dst_port &&
- nla_put_be16(skb, NDA_PORT, rdst->remote_port))
- goto nla_put_failure;
- if (rdst->remote_vni != vxlan->default_dst.remote_vni &&
- nla_put_u32(skb, NDA_VNI, be32_to_cpu(rdst->remote_vni)))
- goto nla_put_failure;
if ((vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA) && fdb->vni &&
nla_put_u32(skb, NDA_SRC_VNI,
be32_to_cpu(fdb->vni)))
goto nla_put_failure;
- if (rdst->remote_ifindex &&
- nla_put_u32(skb, NDA_IFINDEX, rdst->remote_ifindex))
- goto nla_put_failure;
ci.ndm_used = jiffies_to_clock_t(now - fdb->used);
ci.ndm_confirmed = 0;
{
int err;
- if (swdev_notify) {
+ if (swdev_notify && rd) {
switch (type) {
case RTM_NEWNEIGH:
err = vxlan_fdb_switchdev_call_notifiers(vxlan, fdb, rd,
return eth_gro_complete(skb, nhoff + sizeof(struct vxlanhdr));
}
-static struct vxlan_fdb *vxlan_fdb_alloc(const u8 *mac, __u16 state,
- __be32 src_vni, __u16 ndm_flags)
+static struct vxlan_fdb *vxlan_fdb_alloc(struct vxlan_dev *vxlan, const u8 *mac,
+ __u16 state, __be32 src_vni,
+ __u16 ndm_flags)
{
struct vxlan_fdb *f;
f->flags = ndm_flags;
f->updated = f->used = jiffies;
f->vni = src_vni;
+ f->nh = NULL;
+ f->vdev = vxlan;
+ INIT_LIST_HEAD(&f->nh_list);
INIT_LIST_HEAD(&f->remotes);
memcpy(f->eth_addr, mac, ETH_ALEN);
vxlan_fdb_head(vxlan, mac, src_vni));
}
+static int vxlan_fdb_nh_update(struct vxlan_dev *vxlan, struct vxlan_fdb *fdb,
+ u32 nhid, struct netlink_ext_ack *extack)
+{
+ struct nexthop *old_nh = rtnl_dereference(fdb->nh);
+ struct nh_group *nhg;
+ struct nexthop *nh;
+ int err = -EINVAL;
+
+ if (old_nh && old_nh->id == nhid)
+ return 0;
+
+ nh = nexthop_find_by_id(vxlan->net, nhid);
+ if (!nh) {
+ NL_SET_ERR_MSG(extack, "Nexthop id does not exist");
+ goto err_inval;
+ }
+
+ if (nh) {
+ if (!nexthop_get(nh)) {
+ NL_SET_ERR_MSG(extack, "Nexthop has been deleted");
+ nh = NULL;
+ goto err_inval;
+ }
+ if (!nh->is_fdb_nh) {
+ NL_SET_ERR_MSG(extack, "Nexthop is not a fdb nexthop");
+ goto err_inval;
+ }
+
+ if (!nh->is_group || !nh->nh_grp->mpath) {
+ NL_SET_ERR_MSG(extack, "Nexthop is not a multipath group");
+ goto err_inval;
+ }
+
+ /* check nexthop group family */
+ nhg = rtnl_dereference(nh->nh_grp);
+ switch (vxlan->default_dst.remote_ip.sa.sa_family) {
+ case AF_INET:
+ if (!nhg->has_v4) {
+ err = -EAFNOSUPPORT;
+ NL_SET_ERR_MSG(extack, "Nexthop group family not supported");
+ goto err_inval;
+ }
+ break;
+ case AF_INET6:
+ if (nhg->has_v4) {
+ err = -EAFNOSUPPORT;
+ NL_SET_ERR_MSG(extack, "Nexthop group family not supported");
+ goto err_inval;
+ }
+ }
+ }
+
+ if (old_nh) {
+ list_del_rcu(&fdb->nh_list);
+ nexthop_put(old_nh);
+ }
+ rcu_assign_pointer(fdb->nh, nh);
+ list_add_tail_rcu(&fdb->nh_list, &nh->fdb_list);
+ return 1;
+
+err_inval:
+ if (nh)
+ nexthop_put(nh);
+ return err;
+}
+
static int vxlan_fdb_create(struct vxlan_dev *vxlan,
const u8 *mac, union vxlan_addr *ip,
__u16 state, __be16 port, __be32 src_vni,
__be32 vni, __u32 ifindex, __u16 ndm_flags,
- struct vxlan_fdb **fdb)
+ u32 nhid, struct vxlan_fdb **fdb,
+ struct netlink_ext_ack *extack)
{
struct vxlan_rdst *rd = NULL;
struct vxlan_fdb *f;
return -ENOSPC;
netdev_dbg(vxlan->dev, "add %pM -> %pIS\n", mac, ip);
- f = vxlan_fdb_alloc(mac, state, src_vni, ndm_flags);
+ f = vxlan_fdb_alloc(vxlan, mac, state, src_vni, ndm_flags);
if (!f)
return -ENOMEM;
- rc = vxlan_fdb_append(f, ip, port, vni, ifindex, &rd);
- if (rc < 0) {
- kfree(f);
- return rc;
- }
+ if (nhid)
+ rc = vxlan_fdb_nh_update(vxlan, f, nhid, extack);
+ else
+ rc = vxlan_fdb_append(f, ip, port, vni, ifindex, &rd);
+ if (rc < 0)
+ goto errout;
*fdb = f;
return 0;
+
+errout:
+ kfree(f);
+ return rc;
}
static void __vxlan_fdb_free(struct vxlan_fdb *f)
{
struct vxlan_rdst *rd, *nd;
+ struct nexthop *nh;
+
+ nh = rcu_dereference_raw(f->nh);
+ if (nh) {
+ rcu_assign_pointer(f->nh, NULL);
+ list_del_rcu(&f->nh_list);
+ nexthop_put(nh);
+ }
list_for_each_entry_safe(rd, nd, &f->remotes, list) {
dst_cache_destroy(&rd->dst_cache);
netdev_dbg(vxlan->dev, "delete %pM\n", f->eth_addr);
--vxlan->addrcnt;
- if (do_notify)
- list_for_each_entry(rd, &f->remotes, list)
- vxlan_fdb_notify(vxlan, f, rd, RTM_DELNEIGH,
+ if (do_notify) {
+ if (rcu_access_pointer(f->nh))
+ vxlan_fdb_notify(vxlan, f, NULL, RTM_DELNEIGH,
swdev_notify, NULL);
+ else
+ list_for_each_entry(rd, &f->remotes, list)
+ vxlan_fdb_notify(vxlan, f, rd, RTM_DELNEIGH,
+ swdev_notify, NULL);
+ }
hlist_del_rcu(&f->hlist);
+ f->vdev = NULL;
call_rcu(&f->rcu, vxlan_fdb_free);
}
__u16 state, __u16 flags,
__be16 port, __be32 vni,
__u32 ifindex, __u16 ndm_flags,
- struct vxlan_fdb *f,
+ struct vxlan_fdb *f, u32 nhid,
bool swdev_notify,
struct netlink_ext_ack *extack)
{
int rc = 0;
int err;
+ if (nhid && !rcu_access_pointer(f->nh)) {
+ NL_SET_ERR_MSG(extack,
+ "Cannot replace an existing non nexthop fdb with a nexthop");
+ return -EOPNOTSUPP;
+ }
+
+ if (nhid && (flags & NLM_F_APPEND)) {
+ NL_SET_ERR_MSG(extack,
+ "Cannot append to a nexthop fdb");
+ return -EOPNOTSUPP;
+ }
+
/* Do not allow an externally learned entry to take over an entry added
* by the user.
*/
/* Only change unicasts */
if (!(is_multicast_ether_addr(f->eth_addr) ||
is_zero_ether_addr(f->eth_addr))) {
- rc = vxlan_fdb_replace(f, ip, port, vni,
- ifindex, &oldrd);
+ if (nhid) {
+ rc = vxlan_fdb_nh_update(vxlan, f, nhid, extack);
+ if (rc < 0)
+ return rc;
+ } else {
+ rc = vxlan_fdb_replace(f, ip, port, vni,
+ ifindex, &oldrd);
+ }
notify |= rc;
} else {
+ NL_SET_ERR_MSG(extack, "Cannot replace non-unicast fdb entries");
return -EOPNOTSUPP;
}
}
return 0;
err_notify:
+ if (nhid)
+ return err;
if ((flags & NLM_F_REPLACE) && rc)
*rd = oldrd;
else if ((flags & NLM_F_APPEND) && rc) {
const u8 *mac, union vxlan_addr *ip,
__u16 state, __u16 flags,
__be16 port, __be32 src_vni, __be32 vni,
- __u32 ifindex, __u16 ndm_flags,
+ __u32 ifindex, __u16 ndm_flags, u32 nhid,
bool swdev_notify,
struct netlink_ext_ack *extack)
{
netdev_dbg(vxlan->dev, "add %pM -> %pIS\n", mac, ip);
rc = vxlan_fdb_create(vxlan, mac, ip, state, port, src_vni,
- vni, ifindex, fdb_flags, &f);
+ vni, ifindex, fdb_flags, nhid, &f, extack);
if (rc < 0)
return rc;
const u8 *mac, union vxlan_addr *ip,
__u16 state, __u16 flags,
__be16 port, __be32 src_vni, __be32 vni,
- __u32 ifindex, __u16 ndm_flags,
+ __u32 ifindex, __u16 ndm_flags, u32 nhid,
bool swdev_notify,
struct netlink_ext_ack *extack)
{
return vxlan_fdb_update_existing(vxlan, ip, state, flags, port,
vni, ifindex, ndm_flags, f,
- swdev_notify, extack);
+ nhid, swdev_notify, extack);
} else {
if (!(flags & NLM_F_CREATE))
return -ENOENT;
return vxlan_fdb_update_create(vxlan, mac, ip, state, flags,
port, src_vni, vni, ifindex,
- ndm_flags, swdev_notify, extack);
+ ndm_flags, nhid, swdev_notify,
+ extack);
}
}
static int vxlan_fdb_parse(struct nlattr *tb[], struct vxlan_dev *vxlan,
union vxlan_addr *ip, __be16 *port, __be32 *src_vni,
- __be32 *vni, u32 *ifindex)
+ __be32 *vni, u32 *ifindex, u32 *nhid)
{
struct net *net = dev_net(vxlan->dev);
int err;
*ifindex = 0;
}
+ if (tb[NDA_NH_ID])
+ *nhid = nla_get_u32(tb[NDA_NH_ID]);
+ else
+ *nhid = 0;
+
return 0;
}
union vxlan_addr ip;
__be16 port;
__be32 src_vni, vni;
- u32 ifindex;
+ u32 ifindex, nhid;
u32 hash_index;
int err;
return -EINVAL;
}
- if (tb[NDA_DST] == NULL)
+ if (!tb || (!tb[NDA_DST] && !tb[NDA_NH_ID]))
return -EINVAL;
- err = vxlan_fdb_parse(tb, vxlan, &ip, &port, &src_vni, &vni, &ifindex);
+ err = vxlan_fdb_parse(tb, vxlan, &ip, &port, &src_vni, &vni, &ifindex,
+ &nhid);
if (err)
return err;
err = vxlan_fdb_update(vxlan, addr, &ip, ndm->ndm_state, flags,
port, src_vni, vni, ifindex,
ndm->ndm_flags | NTF_VXLAN_ADDED_BY_USER,
- true, extack);
+ nhid, true, extack);
spin_unlock_bh(&vxlan->hash_lock[hash_index]);
return err;
__be16 port, __be32 src_vni, __be32 vni,
u32 ifindex, bool swdev_notify)
{
- struct vxlan_fdb *f;
struct vxlan_rdst *rd = NULL;
+ struct vxlan_fdb *f;
int err = -ENOENT;
f = vxlan_find_mac(vxlan, addr, src_vni);
struct vxlan_dev *vxlan = netdev_priv(dev);
union vxlan_addr ip;
__be32 src_vni, vni;
- __be16 port;
- u32 ifindex;
+ u32 ifindex, nhid;
u32 hash_index;
+ __be16 port;
int err;
- err = vxlan_fdb_parse(tb, vxlan, &ip, &port, &src_vni, &vni, &ifindex);
+ err = vxlan_fdb_parse(tb, vxlan, &ip, &port, &src_vni, &vni, &ifindex,
+ &nhid);
if (err)
return err;
hlist_for_each_entry_rcu(f, &vxlan->fdb_head[h], hlist) {
struct vxlan_rdst *rd;
+ if (rcu_access_pointer(f->nh)) {
+ err = vxlan_fdb_info(skb, vxlan, f,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ RTM_NEWNEIGH,
+ NLM_F_MULTI, NULL);
+ if (err < 0)
+ goto out;
+ continue;
+ }
+
list_for_each_entry_rcu(rd, &f->remotes, list) {
if (*idx < cb->args[2])
goto skip;
if (f->state & (NUD_PERMANENT | NUD_NOARP))
return true;
+ /* Don't override an fdb with nexthop with a learnt entry */
+ if (rcu_access_pointer(f->nh))
+ return true;
+
if (net_ratelimit())
netdev_info(dev,
"%pM migrated from %pIS to %pIS\n",
vxlan->cfg.dst_port,
vni,
vxlan->default_dst.remote_vni,
- ifindex, NTF_SELF, true, NULL);
+ ifindex, NTF_SELF, 0, true, NULL);
spin_unlock(&vxlan->hash_lock[hash_index]);
}
kfree_skb(skb);
}
+static void vxlan_xmit_nh(struct sk_buff *skb, struct net_device *dev,
+ struct vxlan_fdb *f, __be32 vni, bool did_rsc)
+{
+ struct vxlan_rdst nh_rdst;
+ struct nexthop *nh;
+ bool do_xmit;
+ u32 hash;
+
+ memset(&nh_rdst, 0, sizeof(struct vxlan_rdst));
+ hash = skb_get_hash(skb);
+
+ rcu_read_lock();
+ nh = rcu_dereference(f->nh);
+ if (!nh) {
+ rcu_read_unlock();
+ goto drop;
+ }
+ do_xmit = vxlan_fdb_nh_path_select(nh, hash, &nh_rdst);
+ rcu_read_unlock();
+
+ if (likely(do_xmit))
+ vxlan_xmit_one(skb, dev, vni, &nh_rdst, did_rsc);
+ else
+ goto drop;
+
+ return;
+
+drop:
+ dev->stats.tx_dropped++;
+ dev_kfree_skb(skb);
+}
+
/* Transmit local packets over Vxlan
*
* Outer IP header inherits ECN and DF from inner header.
}
}
- list_for_each_entry_rcu(rdst, &f->remotes, list) {
- struct sk_buff *skb1;
+ if (rcu_access_pointer(f->nh)) {
+ vxlan_xmit_nh(skb, dev, f,
+ (vni ? : vxlan->default_dst.remote_vni), did_rsc);
+ } else {
+ list_for_each_entry_rcu(rdst, &f->remotes, list) {
+ struct sk_buff *skb1;
- if (!fdst) {
- fdst = rdst;
- continue;
+ if (!fdst) {
+ fdst = rdst;
+ continue;
+ }
+ skb1 = skb_clone(skb, GFP_ATOMIC);
+ if (skb1)
+ vxlan_xmit_one(skb1, dev, vni, rdst, did_rsc);
}
- skb1 = skb_clone(skb, GFP_ATOMIC);
- if (skb1)
- vxlan_xmit_one(skb1, dev, vni, rdst, did_rsc);
+ if (fdst)
+ vxlan_xmit_one(skb, dev, vni, fdst, did_rsc);
+ else
+ kfree_skb(skb);
}
- if (fdst)
- vxlan_xmit_one(skb, dev, vni, fdst, did_rsc);
- else
- kfree_skb(skb);
return NETDEV_TX_OK;
}
dst->remote_vni,
dst->remote_vni,
dst->remote_ifindex,
- NTF_SELF, &f);
+ NTF_SELF, 0, &f, extack);
if (err)
return err;
}
vxlan->cfg.dst_port,
conf.vni, conf.vni,
conf.remote_ifindex,
- NTF_SELF, true, extack);
+ NTF_SELF, 0, true, extack);
if (err) {
spin_unlock_bh(&vxlan->hash_lock[hash_index]);
netdev_adjacent_change_abort(dst->remote_dev,
fdb_info->remote_vni,
fdb_info->remote_ifindex,
NTF_USE | NTF_SELF | NTF_EXT_LEARNED,
- false, extack);
+ 0, false, extack);
spin_unlock_bh(&vxlan->hash_lock[hash_index]);
return err;
.notifier_call = vxlan_switchdev_event,
};
+static int vxlan_nexthop_event(struct notifier_block *nb,
+ unsigned long event, void *ptr)
+{
+ struct nexthop *nh = ptr;
+ struct vxlan_fdb *fdb, *tmp;
+
+ if (!nh || event != NEXTHOP_EVENT_DEL)
+ return NOTIFY_DONE;
+
+ list_for_each_entry_safe(fdb, tmp, &nh->fdb_list, nh_list)
+ vxlan_fdb_destroy(fdb->vdev, fdb, false, false);
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block vxlan_nexthop_notifier_block __read_mostly = {
+ .notifier_call = vxlan_nexthop_event,
+};
+
static __net_init int vxlan_init_net(struct net *net)
{
struct vxlan_net *vn = net_generic(net, vxlan_net_id);
for (h = 0; h < PORT_HASH_SIZE; ++h)
INIT_HLIST_HEAD(&vn->sock_list[h]);
- return 0;
+ return register_nexthop_notifier(net, &vxlan_nexthop_notifier_block);
}
static void vxlan_destroy_tunnels(struct net *net, struct list_head *head)
LIST_HEAD(list);
rtnl_lock();
+ list_for_each_entry(net, net_list, exit_list)
+ unregister_nexthop_notifier(net, &vxlan_nexthop_notifier_block);
list_for_each_entry(net, net_list, exit_list)
vxlan_destroy_tunnels(net, &list);
}
#define cgroup_bpf_enabled (0)
+#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, type, t_ctx) ({ 0; })
#define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (0)
#define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; })
#define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; })
#include <linux/types.h>
#include <linux/skbuff.h>
+#include <net/rtnetlink.h>
struct bareudp_conf {
__be16 ethertype;
u8 name_assign_type,
struct bareudp_conf *info);
+static inline bool netif_is_bareudp(const struct net_device *dev)
+{
+ return dev->rtnl_link_ops &&
+ !strcmp(dev->rtnl_link_ops->kind, "bareudp");
+}
+
#endif
FLOW_ACTION_HW_STATS_IMMEDIATE_BIT,
FLOW_ACTION_HW_STATS_DELAYED_BIT,
FLOW_ACTION_HW_STATS_DISABLED_BIT,
+
+ FLOW_ACTION_HW_STATS_NUM_BITS
};
enum flow_action_hw_stats {
- FLOW_ACTION_HW_STATS_DONT_CARE = 0,
FLOW_ACTION_HW_STATS_IMMEDIATE =
BIT(FLOW_ACTION_HW_STATS_IMMEDIATE_BIT),
FLOW_ACTION_HW_STATS_DELAYED = BIT(FLOW_ACTION_HW_STATS_DELAYED_BIT),
FLOW_ACTION_HW_STATS_DELAYED,
FLOW_ACTION_HW_STATS_DISABLED =
BIT(FLOW_ACTION_HW_STATS_DISABLED_BIT),
+ FLOW_ACTION_HW_STATS_DONT_CARE = BIT(FLOW_ACTION_HW_STATS_NUM_BITS) - 1,
};
typedef void (*action_destr)(void *priv);
return false;
action_entry = flow_action_first_entry_get(action);
- if (action_entry->hw_stats == FLOW_ACTION_HW_STATS_DONT_CARE)
- return true;
+
+ /* Zero is not a legal value for hw_stats, catch anyone passing it */
+ WARN_ON_ONCE(!action_entry->hw_stats);
if (!check_allow_bit &&
- action_entry->hw_stats != FLOW_ACTION_HW_STATS_ANY) {
+ ~action_entry->hw_stats & FLOW_ACTION_HW_STATS_ANY) {
NL_SET_ERR_MSG_MOD(extack, "Driver supports only default HW stats type \"any\"");
return false;
} else if (check_allow_bit &&
struct nl_info fc_nlinfo;
struct nlattr *fc_encap;
u16 fc_encap_type;
+ bool fc_is_fdb;
};
struct fib6_node {
unsigned int seq; /* protected by rtnl_mutex */
u32 last_id_allocated;
+ struct atomic_notifier_head notifier_chain;
};
#endif
#define __LINUX_NEXTHOP_H
#include <linux/netdevice.h>
+#include <linux/notifier.h>
#include <linux/route.h>
#include <linux/types.h>
#include <net/ip_fib.h>
u8 nh_family;
u8 nh_protocol;
u8 nh_blackhole;
+ u8 nh_fdb;
u32 nh_flags;
int nh_ifindex;
u8 family;
bool reject_nh;
+ bool fdb_nh;
union {
struct fib_nh_common fib_nhc;
struct rb_node rb_node; /* entry on netns rbtree */
struct list_head fi_list; /* v4 entries using nh */
struct list_head f6i_list; /* v6 entries using nh */
+ struct list_head fdb_list; /* fdb entries using this nh */
struct list_head grp_list; /* nh group entries using this nh */
struct net *net;
u8 protocol; /* app managing this nh */
u8 nh_flags;
bool is_group;
+ bool is_fdb_nh;
refcount_t refcnt;
struct rcu_head rcu;
};
};
+enum nexthop_event_type {
+ NEXTHOP_EVENT_ADD,
+ NEXTHOP_EVENT_DEL
+};
+
+int call_nexthop_notifier(struct notifier_block *nb, struct net *net,
+ enum nexthop_event_type event_type,
+ struct nexthop *nh);
+int register_nexthop_notifier(struct net *net, struct notifier_block *nb);
+int unregister_nexthop_notifier(struct net *net, struct notifier_block *nb);
+
/* caller is holding rcu or rtnl; no reference taken to nexthop */
struct nexthop *nexthop_find_by_id(struct net *net, u32 id);
void nexthop_free_rcu(struct rcu_head *head);
int nexthop_for_each_fib6_nh(struct nexthop *nh,
int (*cb)(struct fib6_nh *nh, void *arg),
void *arg);
+
+static inline int nexthop_get_family(struct nexthop *nh)
+{
+ struct nh_info *nhi = rcu_dereference_rtnl(nh->nh_info);
+
+ return nhi->family;
+}
+
+static inline
+struct fib_nh_common *nexthop_fdb_nhc(struct nexthop *nh)
+{
+ struct nh_info *nhi = rcu_dereference_rtnl(nh->nh_info);
+
+ return &nhi->fib_nhc;
+}
+
+static inline struct fib_nh_common *nexthop_path_fdb_result(struct nexthop *nh,
+ int hash)
+{
+ struct nh_info *nhi;
+ struct nexthop *nhp;
+
+ nhp = nexthop_select_path(nh, hash);
+ if (unlikely(!nhp))
+ return NULL;
+ nhi = rcu_dereference(nhp->nh_info);
+ return &nhi->fib_nhc;
+}
#endif
#if IS_ENABLED(CONFIG_BRIDGE_MRP)
u8 mrp_port_state; /* MRP_PORT_STATE */
u8 mrp_port_role; /* MRP_PORT_ROLE */
- u8 mrp_ring_state; /* MRP_RING_STATE */
#endif
} u;
};
#include <net/dst_metadata.h>
#include <net/rtnetlink.h>
#include <net/switchdev.h>
+#include <net/nexthop.h>
#define IANA_VXLAN_UDP_PORT 4789
#undef VXLAN_FLAG
}
+static inline bool vxlan_fdb_nh_path_select(struct nexthop *nh,
+ int hash,
+ struct vxlan_rdst *rdst)
+{
+ struct fib_nh_common *nhc;
+
+ nhc = nexthop_path_fdb_result(nh, hash);
+ if (unlikely(!nhc))
+ return false;
+
+ switch (nhc->nhc_gw_family) {
+ case AF_INET:
+ rdst->remote_ip.sin.sin_addr.s_addr = nhc->nhc_gw.ipv4;
+ rdst->remote_ip.sa.sa_family = AF_INET;
+ break;
+ case AF_INET6:
+ rdst->remote_ip.sin6.sin6_addr = nhc->nhc_gw.ipv6;
+ rdst->remote_ip.sa.sa_family = AF_INET6;
+ break;
+ }
+
+ return true;
+}
+
#endif
MEM_TYPE_PAGE_SHARED = 0, /* Split-page refcnt based model */
MEM_TYPE_PAGE_ORDER0, /* Orig XDP full page model */
MEM_TYPE_PAGE_POOL,
- MEM_TYPE_ZERO_COPY,
+ MEM_TYPE_XSK_BUFF_POOL,
MEM_TYPE_MAX,
};
struct page_pool;
-struct zero_copy_allocator {
- void (*free)(struct zero_copy_allocator *zca, unsigned long handle);
-};
-
struct xdp_rxq_info {
struct net_device *dev;
u32 queue_index;
void *data_end;
void *data_meta;
void *data_hard_start;
- unsigned long handle;
struct xdp_rxq_info *rxq;
u32 frame_sz; /* frame size to deduce data_hard_end/reserved tailroom*/
};
int metasize;
int headroom;
- if (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY)
+ if (xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL)
return xdp_convert_zc_to_xdp_frame(xdp);
/* Assure headroom is available for storing info */
struct net_device;
struct xsk_queue;
-
-/* Masks for xdp_umem_page flags.
- * The low 12-bits of the addr will be 0 since this is the page address, so we
- * can use them for flags.
- */
-#define XSK_NEXT_PG_CONTIG_SHIFT 0
-#define XSK_NEXT_PG_CONTIG_MASK (1ULL << XSK_NEXT_PG_CONTIG_SHIFT)
-
-struct xdp_umem_page {
- void *addr;
- dma_addr_t dma;
-};
-
-struct xdp_umem_fq_reuse {
- u32 nentries;
- u32 length;
- u64 handles[];
-};
-
-/* Flags for the umem flags field.
- *
- * The NEED_WAKEUP flag is 1 due to the reuse of the flags field for public
- * flags. See inlude/uapi/include/linux/if_xdp.h.
- */
-#define XDP_UMEM_USES_NEED_WAKEUP (1 << 1)
+struct xdp_buff;
struct xdp_umem {
struct xsk_queue *fq;
struct xsk_queue *cq;
- struct xdp_umem_page *pages;
- u64 chunk_mask;
+ struct xsk_buff_pool *pool;
u64 size;
u32 headroom;
- u32 chunk_size_nohr;
+ u32 chunk_size;
struct user_struct *user;
refcount_t users;
struct work_struct work;
u8 flags;
int id;
struct net_device *dev;
- struct xdp_umem_fq_reuse *fq_reuse;
bool zc;
spinlock_t xsk_tx_list_lock;
struct list_head xsk_tx_list;
};
-/* Nodes are linked in the struct xdp_sock map_list field, and used to
- * track which maps a certain socket reside in.
- */
-
struct xsk_map {
struct bpf_map map;
spinlock_t lock; /* Synchronize map updates */
struct xdp_sock *xsk_map[];
};
-struct xsk_map_node {
- struct list_head node;
- struct xsk_map *map;
- struct xdp_sock **map_entry;
-};
-
struct xdp_sock {
/* struct sock must be the first member of struct xdp_sock */
struct sock sk;
spinlock_t map_list_lock;
};
-struct xdp_buff;
#ifdef CONFIG_XDP_SOCKETS
-int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp);
-bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs);
-/* Used from netdev driver */
-bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt);
-bool xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr);
-void xsk_umem_release_addr(struct xdp_umem *umem);
-void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries);
-bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc);
-void xsk_umem_consume_tx_done(struct xdp_umem *umem);
-struct xdp_umem_fq_reuse *xsk_reuseq_prepare(u32 nentries);
-struct xdp_umem_fq_reuse *xsk_reuseq_swap(struct xdp_umem *umem,
- struct xdp_umem_fq_reuse *newq);
-void xsk_reuseq_free(struct xdp_umem_fq_reuse *rq);
-struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev, u16 queue_id);
-void xsk_set_rx_need_wakeup(struct xdp_umem *umem);
-void xsk_set_tx_need_wakeup(struct xdp_umem *umem);
-void xsk_clear_rx_need_wakeup(struct xdp_umem *umem);
-void xsk_clear_tx_need_wakeup(struct xdp_umem *umem);
-bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem);
-void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs,
- struct xdp_sock **map_entry);
-int xsk_map_inc(struct xsk_map *map);
-void xsk_map_put(struct xsk_map *map);
+int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp);
int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp);
void __xsk_map_flush(void);
return xs;
}
-static inline u64 xsk_umem_extract_addr(u64 addr)
-{
- return addr & XSK_UNALIGNED_BUF_ADDR_MASK;
-}
-
-static inline u64 xsk_umem_extract_offset(u64 addr)
-{
- return addr >> XSK_UNALIGNED_BUF_OFFSET_SHIFT;
-}
-
-static inline u64 xsk_umem_add_offset_to_addr(u64 addr)
-{
- return xsk_umem_extract_addr(addr) + xsk_umem_extract_offset(addr);
-}
-
-static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr)
-{
- unsigned long page_addr;
-
- addr = xsk_umem_add_offset_to_addr(addr);
- page_addr = (unsigned long)umem->pages[addr >> PAGE_SHIFT].addr;
-
- return (char *)(page_addr & PAGE_MASK) + (addr & ~PAGE_MASK);
-}
-
-static inline dma_addr_t xdp_umem_get_dma(struct xdp_umem *umem, u64 addr)
-{
- addr = xsk_umem_add_offset_to_addr(addr);
-
- return umem->pages[addr >> PAGE_SHIFT].dma + (addr & ~PAGE_MASK);
-}
-
-/* Reuse-queue aware version of FILL queue helpers */
-static inline bool xsk_umem_has_addrs_rq(struct xdp_umem *umem, u32 cnt)
-{
- struct xdp_umem_fq_reuse *rq = umem->fq_reuse;
-
- if (rq->length >= cnt)
- return true;
-
- return xsk_umem_has_addrs(umem, cnt - rq->length);
-}
-
-static inline bool xsk_umem_peek_addr_rq(struct xdp_umem *umem, u64 *addr)
-{
- struct xdp_umem_fq_reuse *rq = umem->fq_reuse;
-
- if (!rq->length)
- return xsk_umem_peek_addr(umem, addr);
-
- *addr = rq->handles[rq->length - 1];
- return addr;
-}
-
-static inline void xsk_umem_release_addr_rq(struct xdp_umem *umem)
-{
- struct xdp_umem_fq_reuse *rq = umem->fq_reuse;
-
- if (!rq->length)
- xsk_umem_release_addr(umem);
- else
- rq->length--;
-}
-
-static inline void xsk_umem_fq_reuse(struct xdp_umem *umem, u64 addr)
-{
- struct xdp_umem_fq_reuse *rq = umem->fq_reuse;
-
- rq->handles[rq->length++] = addr;
-}
-
-/* Handle the offset appropriately depending on aligned or unaligned mode.
- * For unaligned mode, we store the offset in the upper 16-bits of the address.
- * For aligned mode, we simply add the offset to the address.
- */
-static inline u64 xsk_umem_adjust_offset(struct xdp_umem *umem, u64 address,
- u64 offset)
-{
- if (umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG)
- return address + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT);
- else
- return address + offset;
-}
-
-static inline u32 xsk_umem_xdp_frame_sz(struct xdp_umem *umem)
-{
- return umem->chunk_size_nohr + umem->headroom;
-}
-
#else
+
static inline int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
{
return -ENOTSUPP;
}
-static inline bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs)
-{
- return false;
-}
-
-static inline bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt)
-{
- return false;
-}
-
-static inline u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr)
-{
- return NULL;
-}
-
-static inline void xsk_umem_release_addr(struct xdp_umem *umem)
-{
-}
-
-static inline void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries)
-{
-}
-
-static inline bool xsk_umem_consume_tx(struct xdp_umem *umem,
- struct xdp_desc *desc)
-{
- return false;
-}
-
-static inline void xsk_umem_consume_tx_done(struct xdp_umem *umem)
-{
-}
-
-static inline struct xdp_umem_fq_reuse *xsk_reuseq_prepare(u32 nentries)
-{
- return NULL;
-}
-
-static inline struct xdp_umem_fq_reuse *xsk_reuseq_swap(
- struct xdp_umem *umem,
- struct xdp_umem_fq_reuse *newq)
-{
- return NULL;
-}
-static inline void xsk_reuseq_free(struct xdp_umem_fq_reuse *rq)
-{
-}
-
-static inline struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev,
- u16 queue_id)
-{
- return NULL;
-}
-
-static inline u64 xsk_umem_extract_addr(u64 addr)
-{
- return 0;
-}
-
-static inline u64 xsk_umem_extract_offset(u64 addr)
-{
- return 0;
-}
-
-static inline u64 xsk_umem_add_offset_to_addr(u64 addr)
-{
- return 0;
-}
-
-static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr)
-{
- return NULL;
-}
-
-static inline dma_addr_t xdp_umem_get_dma(struct xdp_umem *umem, u64 addr)
-{
- return 0;
-}
-
-static inline bool xsk_umem_has_addrs_rq(struct xdp_umem *umem, u32 cnt)
-{
- return false;
-}
-
-static inline u64 *xsk_umem_peek_addr_rq(struct xdp_umem *umem, u64 *addr)
-{
- return NULL;
-}
-
-static inline void xsk_umem_release_addr_rq(struct xdp_umem *umem)
-{
-}
-
-static inline void xsk_umem_fq_reuse(struct xdp_umem *umem, u64 addr)
-{
-}
-
-static inline void xsk_set_rx_need_wakeup(struct xdp_umem *umem)
-{
-}
-
-static inline void xsk_set_tx_need_wakeup(struct xdp_umem *umem)
-{
-}
-
-static inline void xsk_clear_rx_need_wakeup(struct xdp_umem *umem)
-{
-}
-
-static inline void xsk_clear_tx_need_wakeup(struct xdp_umem *umem)
-{
-}
-
-static inline bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem)
-{
- return false;
-}
-
-static inline u64 xsk_umem_adjust_offset(struct xdp_umem *umem, u64 handle,
- u64 offset)
-{
- return 0;
-}
-
-static inline u32 xsk_umem_xdp_frame_sz(struct xdp_umem *umem)
-{
- return 0;
-}
-
static inline int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp)
{
return -EOPNOTSUPP;
{
return NULL;
}
+
#endif /* CONFIG_XDP_SOCKETS */
#endif /* _LINUX_XDP_SOCK_H */
--- /dev/null
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Interface for implementing AF_XDP zero-copy support in drivers.
+ * Copyright(c) 2020 Intel Corporation.
+ */
+
+#ifndef _LINUX_XDP_SOCK_DRV_H
+#define _LINUX_XDP_SOCK_DRV_H
+
+#include <net/xdp_sock.h>
+#include <net/xsk_buff_pool.h>
+
+#ifdef CONFIG_XDP_SOCKETS
+
+void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries);
+bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc);
+void xsk_umem_consume_tx_done(struct xdp_umem *umem);
+struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev, u16 queue_id);
+void xsk_set_rx_need_wakeup(struct xdp_umem *umem);
+void xsk_set_tx_need_wakeup(struct xdp_umem *umem);
+void xsk_clear_rx_need_wakeup(struct xdp_umem *umem);
+void xsk_clear_tx_need_wakeup(struct xdp_umem *umem);
+bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem);
+
+static inline u32 xsk_umem_get_headroom(struct xdp_umem *umem)
+{
+ return XDP_PACKET_HEADROOM + umem->headroom;
+}
+
+static inline u32 xsk_umem_get_chunk_size(struct xdp_umem *umem)
+{
+ return umem->chunk_size;
+}
+
+static inline u32 xsk_umem_get_rx_frame_size(struct xdp_umem *umem)
+{
+ return xsk_umem_get_chunk_size(umem) - xsk_umem_get_headroom(umem);
+}
+
+static inline void xsk_buff_set_rxq_info(struct xdp_umem *umem,
+ struct xdp_rxq_info *rxq)
+{
+ xp_set_rxq_info(umem->pool, rxq);
+}
+
+static inline void xsk_buff_dma_unmap(struct xdp_umem *umem,
+ unsigned long attrs)
+{
+ xp_dma_unmap(umem->pool, attrs);
+}
+
+static inline int xsk_buff_dma_map(struct xdp_umem *umem, struct device *dev,
+ unsigned long attrs)
+{
+ return xp_dma_map(umem->pool, dev, attrs, umem->pgs, umem->npgs);
+}
+
+static inline dma_addr_t xsk_buff_xdp_get_dma(struct xdp_buff *xdp)
+{
+ struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp);
+
+ return xp_get_dma(xskb);
+}
+
+static inline dma_addr_t xsk_buff_xdp_get_frame_dma(struct xdp_buff *xdp)
+{
+ struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp);
+
+ return xp_get_frame_dma(xskb);
+}
+
+static inline struct xdp_buff *xsk_buff_alloc(struct xdp_umem *umem)
+{
+ return xp_alloc(umem->pool);
+}
+
+static inline bool xsk_buff_can_alloc(struct xdp_umem *umem, u32 count)
+{
+ return xp_can_alloc(umem->pool, count);
+}
+
+static inline void xsk_buff_free(struct xdp_buff *xdp)
+{
+ struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp);
+
+ xp_free(xskb);
+}
+
+static inline dma_addr_t xsk_buff_raw_get_dma(struct xdp_umem *umem, u64 addr)
+{
+ return xp_raw_get_dma(umem->pool, addr);
+}
+
+static inline void *xsk_buff_raw_get_data(struct xdp_umem *umem, u64 addr)
+{
+ return xp_raw_get_data(umem->pool, addr);
+}
+
+static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp)
+{
+ struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp);
+
+ xp_dma_sync_for_cpu(xskb);
+}
+
+static inline void xsk_buff_raw_dma_sync_for_device(struct xdp_umem *umem,
+ dma_addr_t dma,
+ size_t size)
+{
+ xp_dma_sync_for_device(umem->pool, dma, size);
+}
+
+#else
+
+static inline void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries)
+{
+}
+
+static inline bool xsk_umem_consume_tx(struct xdp_umem *umem,
+ struct xdp_desc *desc)
+{
+ return false;
+}
+
+static inline void xsk_umem_consume_tx_done(struct xdp_umem *umem)
+{
+}
+
+static inline struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev,
+ u16 queue_id)
+{
+ return NULL;
+}
+
+static inline void xsk_set_rx_need_wakeup(struct xdp_umem *umem)
+{
+}
+
+static inline void xsk_set_tx_need_wakeup(struct xdp_umem *umem)
+{
+}
+
+static inline void xsk_clear_rx_need_wakeup(struct xdp_umem *umem)
+{
+}
+
+static inline void xsk_clear_tx_need_wakeup(struct xdp_umem *umem)
+{
+}
+
+static inline bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem)
+{
+ return false;
+}
+
+static inline u32 xsk_umem_get_headroom(struct xdp_umem *umem)
+{
+ return 0;
+}
+
+static inline u32 xsk_umem_get_chunk_size(struct xdp_umem *umem)
+{
+ return 0;
+}
+
+static inline u32 xsk_umem_get_rx_frame_size(struct xdp_umem *umem)
+{
+ return 0;
+}
+
+static inline void xsk_buff_set_rxq_info(struct xdp_umem *umem,
+ struct xdp_rxq_info *rxq)
+{
+}
+
+static inline void xsk_buff_dma_unmap(struct xdp_umem *umem,
+ unsigned long attrs)
+{
+}
+
+static inline int xsk_buff_dma_map(struct xdp_umem *umem, struct device *dev,
+ unsigned long attrs)
+{
+ return 0;
+}
+
+static inline dma_addr_t xsk_buff_xdp_get_dma(struct xdp_buff *xdp)
+{
+ return 0;
+}
+
+static inline dma_addr_t xsk_buff_xdp_get_frame_dma(struct xdp_buff *xdp)
+{
+ return 0;
+}
+
+static inline struct xdp_buff *xsk_buff_alloc(struct xdp_umem *umem)
+{
+ return NULL;
+}
+
+static inline bool xsk_buff_can_alloc(struct xdp_umem *umem, u32 count)
+{
+ return false;
+}
+
+static inline void xsk_buff_free(struct xdp_buff *xdp)
+{
+}
+
+static inline dma_addr_t xsk_buff_raw_get_dma(struct xdp_umem *umem, u64 addr)
+{
+ return 0;
+}
+
+static inline void *xsk_buff_raw_get_data(struct xdp_umem *umem, u64 addr)
+{
+ return NULL;
+}
+
+static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp)
+{
+}
+
+static inline void xsk_buff_raw_dma_sync_for_device(struct xdp_umem *umem,
+ dma_addr_t dma,
+ size_t size)
+{
+}
+
+#endif /* CONFIG_XDP_SOCKETS */
+
+#endif /* _LINUX_XDP_SOCK_DRV_H */
--- /dev/null
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright(c) 2020 Intel Corporation. */
+
+#ifndef XSK_BUFF_POOL_H_
+#define XSK_BUFF_POOL_H_
+
+#include <linux/if_xdp.h>
+#include <linux/types.h>
+#include <linux/dma-mapping.h>
+#include <net/xdp.h>
+
+struct xsk_buff_pool;
+struct xdp_rxq_info;
+struct xsk_queue;
+struct xdp_desc;
+struct device;
+struct page;
+
+struct xdp_buff_xsk {
+ struct xdp_buff xdp;
+ dma_addr_t dma;
+ dma_addr_t frame_dma;
+ struct xsk_buff_pool *pool;
+ bool unaligned;
+ u64 orig_addr;
+ struct list_head free_list_node;
+};
+
+struct xsk_buff_pool {
+ struct xsk_queue *fq;
+ struct list_head free_list;
+ dma_addr_t *dma_pages;
+ struct xdp_buff_xsk *heads;
+ u64 chunk_mask;
+ u64 addrs_cnt;
+ u32 free_list_cnt;
+ u32 dma_pages_cnt;
+ u32 heads_cnt;
+ u32 free_heads_cnt;
+ u32 headroom;
+ u32 chunk_size;
+ u32 frame_len;
+ bool cheap_dma;
+ bool unaligned;
+ void *addrs;
+ struct device *dev;
+ struct xdp_buff_xsk *free_heads[];
+};
+
+/* AF_XDP core. */
+struct xsk_buff_pool *xp_create(struct page **pages, u32 nr_pages, u32 chunks,
+ u32 chunk_size, u32 headroom, u64 size,
+ bool unaligned);
+void xp_set_fq(struct xsk_buff_pool *pool, struct xsk_queue *fq);
+void xp_destroy(struct xsk_buff_pool *pool);
+void xp_release(struct xdp_buff_xsk *xskb);
+
+/* AF_XDP, and XDP core. */
+void xp_free(struct xdp_buff_xsk *xskb);
+
+/* AF_XDP ZC drivers, via xdp_sock_buff.h */
+void xp_set_rxq_info(struct xsk_buff_pool *pool, struct xdp_rxq_info *rxq);
+int xp_dma_map(struct xsk_buff_pool *pool, struct device *dev,
+ unsigned long attrs, struct page **pages, u32 nr_pages);
+void xp_dma_unmap(struct xsk_buff_pool *pool, unsigned long attrs);
+struct xdp_buff *xp_alloc(struct xsk_buff_pool *pool);
+bool xp_can_alloc(struct xsk_buff_pool *pool, u32 count);
+void *xp_raw_get_data(struct xsk_buff_pool *pool, u64 addr);
+dma_addr_t xp_raw_get_dma(struct xsk_buff_pool *pool, u64 addr);
+static inline dma_addr_t xp_get_dma(struct xdp_buff_xsk *xskb)
+{
+ return xskb->dma;
+}
+
+static inline dma_addr_t xp_get_frame_dma(struct xdp_buff_xsk *xskb)
+{
+ return xskb->frame_dma;
+}
+
+void xp_dma_sync_for_cpu_slow(struct xdp_buff_xsk *xskb);
+static inline void xp_dma_sync_for_cpu(struct xdp_buff_xsk *xskb)
+{
+ if (xskb->pool->cheap_dma)
+ return;
+
+ xp_dma_sync_for_cpu_slow(xskb);
+}
+
+void xp_dma_sync_for_device_slow(struct xsk_buff_pool *pool, dma_addr_t dma,
+ size_t size);
+static inline void xp_dma_sync_for_device(struct xsk_buff_pool *pool,
+ dma_addr_t dma, size_t size)
+{
+ if (pool->cheap_dma)
+ return;
+
+ xp_dma_sync_for_device_slow(pool, dma, size);
+}
+
+/* Masks for xdp_umem_page flags.
+ * The low 12-bits of the addr will be 0 since this is the page address, so we
+ * can use them for flags.
+ */
+#define XSK_NEXT_PG_CONTIG_SHIFT 0
+#define XSK_NEXT_PG_CONTIG_MASK BIT_ULL(XSK_NEXT_PG_CONTIG_SHIFT)
+
+static inline bool xp_desc_crosses_non_contig_pg(struct xsk_buff_pool *pool,
+ u64 addr, u32 len)
+{
+ bool cross_pg = (addr & (PAGE_SIZE - 1)) + len > PAGE_SIZE;
+
+ if (pool->dma_pages_cnt && cross_pg) {
+ return !(pool->dma_pages[addr >> PAGE_SHIFT] &
+ XSK_NEXT_PG_CONTIG_MASK);
+ }
+ return false;
+}
+
+static inline u64 xp_aligned_extract_addr(struct xsk_buff_pool *pool, u64 addr)
+{
+ return addr & pool->chunk_mask;
+}
+
+static inline u64 xp_unaligned_extract_addr(u64 addr)
+{
+ return addr & XSK_UNALIGNED_BUF_ADDR_MASK;
+}
+
+static inline u64 xp_unaligned_extract_offset(u64 addr)
+{
+ return addr >> XSK_UNALIGNED_BUF_OFFSET_SHIFT;
+}
+
+static inline u64 xp_unaligned_add_offset_to_addr(u64 addr)
+{
+ return xp_unaligned_extract_addr(addr) +
+ xp_unaligned_extract_offset(addr);
+}
+
+#endif /* XSK_BUFF_POOL_H_ */
FN(PAGE_SHARED) \
FN(PAGE_ORDER0) \
FN(PAGE_POOL) \
- FN(ZERO_COPY)
+ FN(XSK_BUFF_POOL)
#define __MEM_TYPE_TP_FN(x) \
TRACE_DEFINE_ENUM(MEM_TYPE_##x);
BPF_MODIFY_RETURN,
BPF_LSM_MAC,
BPF_TRACE_ITER,
+ BPF_CGROUP_INET4_GETPEERNAME,
+ BPF_CGROUP_INET6_GETPEERNAME,
+ BPF_CGROUP_INET4_GETSOCKNAME,
+ BPF_CGROUP_INET6_GETSOCKNAME,
__MAX_BPF_ATTACH_TYPE
};
NDA_LINK_NETNSID,
NDA_SRC_VNI,
NDA_PROTOCOL, /* Originator of entry */
+ NDA_NH_ID,
__NDA_MAX
};
NHA_GROUPS, /* flag; only return nexthop groups in dump */
NHA_MASTER, /* u32; only return nexthops with given master dev */
+ NHA_FDB, /* flag; nexthop belongs to a bridge fdb */
+ /* if NHA_FDB is added, OIF, BLACKHOLE, ENCAP cannot be set */
+
__NHA_MAX,
};
ifeq ($(CONFIG_NET),y)
obj-$(CONFIG_BPF_SYSCALL) += devmap.o
obj-$(CONFIG_BPF_SYSCALL) += cpumap.o
-ifeq ($(CONFIG_XDP_SOCKETS),y)
-obj-$(CONFIG_BPF_SYSCALL) += xskmap.o
-endif
obj-$(CONFIG_BPF_SYSCALL) += offload.o
endif
ifeq ($(CONFIG_PERF_EVENTS),y)
case BPF_CGROUP_INET6_BIND:
case BPF_CGROUP_INET4_CONNECT:
case BPF_CGROUP_INET6_CONNECT:
+ case BPF_CGROUP_INET4_GETPEERNAME:
+ case BPF_CGROUP_INET6_GETPEERNAME:
+ case BPF_CGROUP_INET4_GETSOCKNAME:
+ case BPF_CGROUP_INET6_GETSOCKNAME:
case BPF_CGROUP_UDP4_SENDMSG:
case BPF_CGROUP_UDP6_SENDMSG:
case BPF_CGROUP_UDP4_RECVMSG:
case BPF_CGROUP_INET6_BIND:
case BPF_CGROUP_INET4_CONNECT:
case BPF_CGROUP_INET6_CONNECT:
+ case BPF_CGROUP_INET4_GETPEERNAME:
+ case BPF_CGROUP_INET6_GETPEERNAME:
+ case BPF_CGROUP_INET4_GETSOCKNAME:
+ case BPF_CGROUP_INET6_GETSOCKNAME:
case BPF_CGROUP_UDP4_SENDMSG:
case BPF_CGROUP_UDP6_SENDMSG:
case BPF_CGROUP_UDP4_RECVMSG:
case BPF_CGROUP_INET6_POST_BIND:
case BPF_CGROUP_INET4_CONNECT:
case BPF_CGROUP_INET6_CONNECT:
+ case BPF_CGROUP_INET4_GETPEERNAME:
+ case BPF_CGROUP_INET6_GETPEERNAME:
+ case BPF_CGROUP_INET4_GETSOCKNAME:
+ case BPF_CGROUP_INET6_GETSOCKNAME:
case BPF_CGROUP_UDP4_SENDMSG:
case BPF_CGROUP_UDP6_SENDMSG:
case BPF_CGROUP_UDP4_RECVMSG:
type == PTR_TO_XDP_SOCK;
}
+static bool reg_type_not_null(enum bpf_reg_type type)
+{
+ return type == PTR_TO_SOCKET ||
+ type == PTR_TO_TCP_SOCK ||
+ type == PTR_TO_MAP_VALUE ||
+ type == PTR_TO_SOCK_COMMON ||
+ type == PTR_TO_BTF_ID;
+}
+
static bool reg_type_may_be_null(enum bpf_reg_type type)
{
return type == PTR_TO_MAP_VALUE_OR_NULL ||
static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode,
bool is_jmp32)
{
- if (__is_pointer_value(false, reg))
- return -1;
+ if (__is_pointer_value(false, reg)) {
+ if (!reg_type_not_null(reg->type))
+ return -1;
+
+ /* If pointer is valid tests against zero will fail so we can
+ * use this to direct branch taken.
+ */
+ if (val != 0)
+ return -1;
+
+ switch (opcode) {
+ case BPF_JEQ:
+ return 0;
+ case BPF_JNE:
+ return 1;
+ default:
+ return -1;
+ }
+ }
if (is_jmp32)
return is_branch32_taken(reg, val, opcode);
}
if (pred >= 0) {
- err = mark_chain_precision(env, insn->dst_reg);
+ /* If we get here with a dst_reg pointer type it is because
+ * above is_branch_taken() special cased the 0 comparison.
+ */
+ if (!__is_pointer_value(false, dst_reg))
+ err = mark_chain_precision(env, insn->dst_reg);
if (BPF_SRC(insn->code) == BPF_X && !err)
err = mark_chain_precision(env, insn->src_reg);
if (err)
switch (env->prog->type) {
case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
if (env->prog->expected_attach_type == BPF_CGROUP_UDP4_RECVMSG ||
- env->prog->expected_attach_type == BPF_CGROUP_UDP6_RECVMSG)
+ env->prog->expected_attach_type == BPF_CGROUP_UDP6_RECVMSG ||
+ env->prog->expected_attach_type == BPF_CGROUP_INET4_GETPEERNAME ||
+ env->prog->expected_attach_type == BPF_CGROUP_INET6_GETPEERNAME ||
+ env->prog->expected_attach_type == BPF_CGROUP_INET4_GETSOCKNAME ||
+ env->prog->expected_attach_type == BPF_CGROUP_INET6_GETSOCKNAME)
range = tnum_range(1, 1);
break;
case BPF_PROG_TYPE_CGROUP_SKB:
case BPF_TRACE_FEXIT:
range = tnum_const(0);
break;
- case BPF_TRACE_ITER:
case BPF_TRACE_RAW_TP:
case BPF_MODIFY_RETURN:
return 0;
+ case BPF_TRACE_ITER:
+ break;
default:
return -ENOTSUPP;
}
+++ /dev/null
-// SPDX-License-Identifier: GPL-2.0
-/* XSKMAP used for AF_XDP sockets
- * Copyright(c) 2018 Intel Corporation.
- */
-
-#include <linux/bpf.h>
-#include <linux/capability.h>
-#include <net/xdp_sock.h>
-#include <linux/slab.h>
-#include <linux/sched.h>
-
-int xsk_map_inc(struct xsk_map *map)
-{
- bpf_map_inc(&map->map);
- return 0;
-}
-
-void xsk_map_put(struct xsk_map *map)
-{
- bpf_map_put(&map->map);
-}
-
-static struct xsk_map_node *xsk_map_node_alloc(struct xsk_map *map,
- struct xdp_sock **map_entry)
-{
- struct xsk_map_node *node;
- int err;
-
- node = kzalloc(sizeof(*node), GFP_ATOMIC | __GFP_NOWARN);
- if (!node)
- return ERR_PTR(-ENOMEM);
-
- err = xsk_map_inc(map);
- if (err) {
- kfree(node);
- return ERR_PTR(err);
- }
-
- node->map = map;
- node->map_entry = map_entry;
- return node;
-}
-
-static void xsk_map_node_free(struct xsk_map_node *node)
-{
- xsk_map_put(node->map);
- kfree(node);
-}
-
-static void xsk_map_sock_add(struct xdp_sock *xs, struct xsk_map_node *node)
-{
- spin_lock_bh(&xs->map_list_lock);
- list_add_tail(&node->node, &xs->map_list);
- spin_unlock_bh(&xs->map_list_lock);
-}
-
-static void xsk_map_sock_delete(struct xdp_sock *xs,
- struct xdp_sock **map_entry)
-{
- struct xsk_map_node *n, *tmp;
-
- spin_lock_bh(&xs->map_list_lock);
- list_for_each_entry_safe(n, tmp, &xs->map_list, node) {
- if (map_entry == n->map_entry) {
- list_del(&n->node);
- xsk_map_node_free(n);
- }
- }
- spin_unlock_bh(&xs->map_list_lock);
-}
-
-static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
-{
- struct bpf_map_memory mem;
- int err, numa_node;
- struct xsk_map *m;
- u64 size;
-
- if (!capable(CAP_NET_ADMIN))
- return ERR_PTR(-EPERM);
-
- if (attr->max_entries == 0 || attr->key_size != 4 ||
- attr->value_size != 4 ||
- attr->map_flags & ~(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY))
- return ERR_PTR(-EINVAL);
-
- numa_node = bpf_map_attr_numa_node(attr);
- size = struct_size(m, xsk_map, attr->max_entries);
-
- err = bpf_map_charge_init(&mem, size);
- if (err < 0)
- return ERR_PTR(err);
-
- m = bpf_map_area_alloc(size, numa_node);
- if (!m) {
- bpf_map_charge_finish(&mem);
- return ERR_PTR(-ENOMEM);
- }
-
- bpf_map_init_from_attr(&m->map, attr);
- bpf_map_charge_move(&m->map.memory, &mem);
- spin_lock_init(&m->lock);
-
- return &m->map;
-}
-
-static void xsk_map_free(struct bpf_map *map)
-{
- struct xsk_map *m = container_of(map, struct xsk_map, map);
-
- bpf_clear_redirect_map(map);
- synchronize_net();
- bpf_map_area_free(m);
-}
-
-static int xsk_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
-{
- struct xsk_map *m = container_of(map, struct xsk_map, map);
- u32 index = key ? *(u32 *)key : U32_MAX;
- u32 *next = next_key;
-
- if (index >= m->map.max_entries) {
- *next = 0;
- return 0;
- }
-
- if (index == m->map.max_entries - 1)
- return -ENOENT;
- *next = index + 1;
- return 0;
-}
-
-static u32 xsk_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
-{
- const int ret = BPF_REG_0, mp = BPF_REG_1, index = BPF_REG_2;
- struct bpf_insn *insn = insn_buf;
-
- *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0);
- *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5);
- *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(sizeof(struct xsk_sock *)));
- *insn++ = BPF_ALU64_IMM(BPF_ADD, mp, offsetof(struct xsk_map, xsk_map));
- *insn++ = BPF_ALU64_REG(BPF_ADD, ret, mp);
- *insn++ = BPF_LDX_MEM(BPF_SIZEOF(struct xsk_sock *), ret, ret, 0);
- *insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
- *insn++ = BPF_MOV64_IMM(ret, 0);
- return insn - insn_buf;
-}
-
-static void *xsk_map_lookup_elem(struct bpf_map *map, void *key)
-{
- WARN_ON_ONCE(!rcu_read_lock_held());
- return __xsk_map_lookup_elem(map, *(u32 *)key);
-}
-
-static void *xsk_map_lookup_elem_sys_only(struct bpf_map *map, void *key)
-{
- return ERR_PTR(-EOPNOTSUPP);
-}
-
-static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value,
- u64 map_flags)
-{
- struct xsk_map *m = container_of(map, struct xsk_map, map);
- struct xdp_sock *xs, *old_xs, **map_entry;
- u32 i = *(u32 *)key, fd = *(u32 *)value;
- struct xsk_map_node *node;
- struct socket *sock;
- int err;
-
- if (unlikely(map_flags > BPF_EXIST))
- return -EINVAL;
- if (unlikely(i >= m->map.max_entries))
- return -E2BIG;
-
- sock = sockfd_lookup(fd, &err);
- if (!sock)
- return err;
-
- if (sock->sk->sk_family != PF_XDP) {
- sockfd_put(sock);
- return -EOPNOTSUPP;
- }
-
- xs = (struct xdp_sock *)sock->sk;
-
- if (!xsk_is_setup_for_bpf_map(xs)) {
- sockfd_put(sock);
- return -EOPNOTSUPP;
- }
-
- map_entry = &m->xsk_map[i];
- node = xsk_map_node_alloc(m, map_entry);
- if (IS_ERR(node)) {
- sockfd_put(sock);
- return PTR_ERR(node);
- }
-
- spin_lock_bh(&m->lock);
- old_xs = READ_ONCE(*map_entry);
- if (old_xs == xs) {
- err = 0;
- goto out;
- } else if (old_xs && map_flags == BPF_NOEXIST) {
- err = -EEXIST;
- goto out;
- } else if (!old_xs && map_flags == BPF_EXIST) {
- err = -ENOENT;
- goto out;
- }
- xsk_map_sock_add(xs, node);
- WRITE_ONCE(*map_entry, xs);
- if (old_xs)
- xsk_map_sock_delete(old_xs, map_entry);
- spin_unlock_bh(&m->lock);
- sockfd_put(sock);
- return 0;
-
-out:
- spin_unlock_bh(&m->lock);
- sockfd_put(sock);
- xsk_map_node_free(node);
- return err;
-}
-
-static int xsk_map_delete_elem(struct bpf_map *map, void *key)
-{
- struct xsk_map *m = container_of(map, struct xsk_map, map);
- struct xdp_sock *old_xs, **map_entry;
- int k = *(u32 *)key;
-
- if (k >= map->max_entries)
- return -EINVAL;
-
- spin_lock_bh(&m->lock);
- map_entry = &m->xsk_map[k];
- old_xs = xchg(map_entry, NULL);
- if (old_xs)
- xsk_map_sock_delete(old_xs, map_entry);
- spin_unlock_bh(&m->lock);
-
- return 0;
-}
-
-void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs,
- struct xdp_sock **map_entry)
-{
- spin_lock_bh(&map->lock);
- if (READ_ONCE(*map_entry) == xs) {
- WRITE_ONCE(*map_entry, NULL);
- xsk_map_sock_delete(xs, map_entry);
- }
- spin_unlock_bh(&map->lock);
-}
-
-const struct bpf_map_ops xsk_map_ops = {
- .map_alloc = xsk_map_alloc,
- .map_free = xsk_map_free,
- .map_get_next_key = xsk_map_get_next_key,
- .map_lookup_elem = xsk_map_lookup_elem,
- .map_gen_lookup = xsk_map_gen_lookup,
- .map_lookup_elem_sys_only = xsk_map_lookup_elem_sys_only,
- .map_update_elem = xsk_map_update_elem,
- .map_delete_elem = xsk_map_delete_elem,
- .map_check_btf = map_check_no_btf,
-};
u32 headroom, u32 tailroom)
{
void __user *data_in = u64_to_user_ptr(kattr->test.data_in);
+ u32 user_size = kattr->test.data_size_in;
void *data;
if (size < ETH_HLEN || size > PAGE_SIZE - headroom - tailroom)
return ERR_PTR(-EINVAL);
+ if (user_size > size)
+ return ERR_PTR(-EMSGSIZE);
+
data = kzalloc(size + headroom + tailroom, GFP_USER);
if (!data)
return ERR_PTR(-ENOMEM);
- if (copy_from_user(data + headroom, data_in, size)) {
+ if (copy_from_user(data + headroom, data_in, user_size)) {
kfree(data);
return ERR_PTR(-EFAULT);
}
/* XDP have extra tailroom as (most) drivers use full page */
max_data_sz = 4096 - headroom - tailroom;
- if (size > max_data_sz)
- return -EINVAL;
data = bpf_test_init(kattr, max_data_sz, headroom, tailroom);
if (IS_ERR(data))
return res;
}
+static bool br_mrp_unique_ifindex(struct net_bridge *br, u32 ifindex)
+{
+ struct br_mrp *mrp;
+
+ list_for_each_entry_rcu(mrp, &br->mrp_list, list,
+ lockdep_rtnl_is_held()) {
+ struct net_bridge_port *p;
+
+ p = rtnl_dereference(mrp->p_port);
+ if (p && p->dev->ifindex == ifindex)
+ return false;
+
+ p = rtnl_dereference(mrp->s_port);
+ if (p && p->dev->ifindex == ifindex)
+ return false;
+ }
+
+ return true;
+}
+
static struct br_mrp *br_mrp_find_port(struct net_bridge *br,
struct net_bridge_port *p)
{
static void br_mrp_del_impl(struct net_bridge *br, struct br_mrp *mrp)
{
struct net_bridge_port *p;
+ u8 state;
/* Stop sending MRP_Test frames */
cancel_delayed_work_sync(&mrp->test_work);
p = rtnl_dereference(mrp->p_port);
if (p) {
spin_lock_bh(&br->lock);
- p->state = BR_STATE_FORWARDING;
+ state = netif_running(br->dev) ?
+ BR_STATE_FORWARDING : BR_STATE_DISABLED;
+ p->state = state;
p->flags &= ~BR_MRP_AWARE;
spin_unlock_bh(&br->lock);
- br_mrp_port_switchdev_set_state(p, BR_STATE_FORWARDING);
+ br_mrp_port_switchdev_set_state(p, state);
rcu_assign_pointer(mrp->p_port, NULL);
}
p = rtnl_dereference(mrp->s_port);
if (p) {
spin_lock_bh(&br->lock);
- p->state = BR_STATE_FORWARDING;
+ state = netif_running(br->dev) ?
+ BR_STATE_FORWARDING : BR_STATE_DISABLED;
+ p->state = state;
p->flags &= ~BR_MRP_AWARE;
spin_unlock_bh(&br->lock);
- br_mrp_port_switchdev_set_state(p, BR_STATE_FORWARDING);
+ br_mrp_port_switchdev_set_state(p, state);
rcu_assign_pointer(mrp->s_port, NULL);
}
!br_mrp_get_port(br, instance->s_ifindex))
return -EINVAL;
+ /* It is not possible to have the same port part of multiple rings */
+ if (!br_mrp_unique_ifindex(br, instance->p_ifindex) ||
+ !br_mrp_unique_ifindex(br, instance->s_ifindex))
+ return -EINVAL;
+
mrp = kzalloc(sizeof(*mrp), GFP_KERNEL);
if (!mrp)
return -ENOMEM;
switch (prog->expected_attach_type) {
case BPF_CGROUP_INET4_BIND:
case BPF_CGROUP_INET4_CONNECT:
+ case BPF_CGROUP_INET4_GETPEERNAME:
+ case BPF_CGROUP_INET4_GETSOCKNAME:
case BPF_CGROUP_UDP4_SENDMSG:
case BPF_CGROUP_UDP4_RECVMSG:
break;
switch (prog->expected_attach_type) {
case BPF_CGROUP_INET6_BIND:
case BPF_CGROUP_INET6_CONNECT:
+ case BPF_CGROUP_INET6_GETPEERNAME:
+ case BPF_CGROUP_INET6_GETSOCKNAME:
case BPF_CGROUP_UDP6_SENDMSG:
case BPF_CGROUP_UDP6_RECVMSG:
break;
struct flow_rule *flow_rule_alloc(unsigned int num_actions)
{
struct flow_rule *rule;
+ int i;
rule = kzalloc(struct_size(rule, action.entries, num_actions),
GFP_KERNEL);
return NULL;
rule->action.num_entries = num_actions;
+ /* Pre-fill each action hw_stats with DONT_CARE.
+ * Caller can override this if it wants stats for a given action.
+ */
+ for (i = 0; i < num_actions; i++)
+ rule->action.entries[i].hw_stats = FLOW_ACTION_HW_STATS_DONT_CARE;
return rule;
}
}
const struct nla_policy nda_policy[NDA_MAX+1] = {
+ [NDA_UNSPEC] = { .strict_start_type = NDA_NH_ID },
[NDA_DST] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN },
[NDA_LLADDR] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN },
[NDA_CACHEINFO] = { .len = sizeof(struct nda_cacheinfo) },
[NDA_IFINDEX] = { .type = NLA_U32 },
[NDA_MASTER] = { .type = NLA_U32 },
[NDA_PROTOCOL] = { .type = NLA_U8 },
+ [NDA_NH_ID] = { .type = NLA_U32 },
};
static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh,
#include <net/xdp.h>
#include <net/xdp_priv.h> /* struct xdp_mem_allocator */
#include <trace/events/xdp.h>
+#include <net/xdp_sock_drv.h>
#define REG_STATE_NEW 0x0
#define REG_STATE_REGISTERED 0x1
mutex_unlock(&mem_id_lock);
}
-static void mem_id_disconnect(int id)
-{
- struct xdp_mem_allocator *xa;
-
- mutex_lock(&mem_id_lock);
-
- xa = rhashtable_lookup_fast(mem_id_ht, &id, mem_id_rht_params);
- if (!xa) {
- mutex_unlock(&mem_id_lock);
- WARN(1, "Request remove non-existing id(%d), driver bug?", id);
- return;
- }
-
- trace_mem_disconnect(xa);
-
- if (!rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params))
- call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free);
-
- mutex_unlock(&mem_id_lock);
-}
-
void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq)
{
struct xdp_mem_allocator *xa;
if (id == 0)
return;
- if (xdp_rxq->mem.type == MEM_TYPE_ZERO_COPY)
- return mem_id_disconnect(id);
-
if (xdp_rxq->mem.type == MEM_TYPE_PAGE_POOL) {
rcu_read_lock();
xa = rhashtable_lookup(mem_id_ht, &id, mem_id_rht_params);
xdp_rxq->mem.type = type;
if (!allocator) {
- if (type == MEM_TYPE_PAGE_POOL || type == MEM_TYPE_ZERO_COPY)
+ if (type == MEM_TYPE_PAGE_POOL)
return -EINVAL; /* Setup time check page_pool req */
return 0;
}
* scenarios (e.g. queue full), it is possible to return the xdp_frame
* while still leveraging this protection. The @napi_direct boolean
* is used for those calls sites. Thus, allowing for faster recycling
- * of xdp_frames/pages in those cases.
+ * of xdp_frames/pages in those cases. This path is never used by the
+ * MEM_TYPE_XSK_BUFF_POOL memory type, so it's explicitly not part of
+ * the switch-statement.
*/
-static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct,
- unsigned long handle)
+static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct)
{
struct xdp_mem_allocator *xa;
struct page *page;
page = virt_to_page(data); /* Assumes order0 page*/
put_page(page);
break;
- case MEM_TYPE_ZERO_COPY:
- /* NB! Only valid from an xdp_buff! */
- rcu_read_lock();
- /* mem->id is valid, checked in xdp_rxq_info_reg_mem_model() */
- xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
- xa->zc_alloc->free(xa->zc_alloc, handle);
- rcu_read_unlock();
default:
/* Not possible, checked in xdp_rxq_info_reg_mem_model() */
+ WARN(1, "Incorrect XDP memory type (%d) usage", mem->type);
break;
}
}
void xdp_return_frame(struct xdp_frame *xdpf)
{
- __xdp_return(xdpf->data, &xdpf->mem, false, 0);
+ __xdp_return(xdpf->data, &xdpf->mem, false);
}
EXPORT_SYMBOL_GPL(xdp_return_frame);
void xdp_return_frame_rx_napi(struct xdp_frame *xdpf)
{
- __xdp_return(xdpf->data, &xdpf->mem, true, 0);
+ __xdp_return(xdpf->data, &xdpf->mem, true);
}
EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi);
void xdp_return_buff(struct xdp_buff *xdp)
{
- __xdp_return(xdp->data, &xdp->rxq->mem, true, xdp->handle);
+ __xdp_return(xdp->data, &xdp->rxq->mem, true);
}
-EXPORT_SYMBOL_GPL(xdp_return_buff);
/* Only called for MEM_TYPE_PAGE_POOL see xdp.h */
void __xdp_release_frame(void *data, struct xdp_mem_info *mem)
xdpf->metasize = metasize;
xdpf->mem.type = MEM_TYPE_PAGE_ORDER0;
- xdp_return_buff(xdp);
+ xsk_buff_free(xdp);
return xdpf;
}
EXPORT_SYMBOL_GPL(xdp_convert_zc_to_xdp_frame);
// SPDX-License-Identifier: GPL-2.0-only
-#include <net/xdp_sock.h>
+#include <net/xdp_sock_drv.h>
#include "netlink.h"
#include "common.h"
#include <linux/sched/signal.h>
#include <linux/net.h>
#include <net/devlink.h>
-#include <net/xdp_sock.h>
+#include <net/xdp_sock_drv.h>
#include <net/flow_offload.h>
#include <linux/ethtool_netlink.h>
#include <generated/utsrelease.h>
}
EXPORT_SYMBOL(inet_accept);
-
/*
* This does both peername and sockname.
*/
int inet_getname(struct socket *sock, struct sockaddr *uaddr,
- int peer)
+ int peer)
{
struct sock *sk = sock->sk;
struct inet_sock *inet = inet_sk(sk);
sin->sin_port = inet->inet_sport;
sin->sin_addr.s_addr = addr;
}
+ if (cgroup_bpf_enabled)
+ BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin,
+ peer ? BPF_CGROUP_INET4_GETPEERNAME :
+ BPF_CGROUP_INET4_GETSOCKNAME,
+ NULL);
memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
return sizeof(*sin);
}
[NHA_ENCAP] = { .type = NLA_NESTED },
[NHA_GROUPS] = { .type = NLA_FLAG },
[NHA_MASTER] = { .type = NLA_U32 },
+ [NHA_FDB] = { .type = NLA_FLAG },
};
+static int call_nexthop_notifiers(struct net *net,
+ enum fib_event_type event_type,
+ struct nexthop *nh)
+{
+ int err;
+
+ err = atomic_notifier_call_chain(&net->nexthop.notifier_chain,
+ event_type, nh);
+ return notifier_to_errno(err);
+}
+
static unsigned int nh_dev_hashfn(unsigned int val)
{
unsigned int mask = NH_DEV_HASHSIZE - 1;
INIT_LIST_HEAD(&nh->fi_list);
INIT_LIST_HEAD(&nh->f6i_list);
INIT_LIST_HEAD(&nh->grp_list);
+ INIT_LIST_HEAD(&nh->fdb_list);
}
return nh;
}
if (nla_put_u32(skb, NHA_ID, nh->id))
goto nla_put_failure;
+ if (nh->is_fdb_nh && nla_put_flag(skb, NHA_FDB))
+ goto nla_put_failure;
+
if (nh->is_group) {
struct nh_group *nhg = rtnl_dereference(nh->nh_grp);
if (nla_put_flag(skb, NHA_BLACKHOLE))
goto nla_put_failure;
goto out;
- } else {
+ } else if (!nh->is_fdb_nh) {
const struct net_device *dev;
dev = nhi->fib_nhc.nhc_dev;
return true;
}
+static int nh_check_attr_fdb_group(struct nexthop *nh, u8 *nh_family,
+ struct netlink_ext_ack *extack)
+{
+ struct nh_info *nhi;
+
+ if (!nh->is_fdb_nh) {
+ NL_SET_ERR_MSG(extack, "FDB nexthop group can only have fdb nexthops");
+ return -EINVAL;
+ }
+
+ nhi = rtnl_dereference(nh->nh_info);
+ if (*nh_family == AF_UNSPEC) {
+ *nh_family = nhi->family;
+ } else if (*nh_family != nhi->family) {
+ NL_SET_ERR_MSG(extack, "FDB nexthop group cannot have mixed family nexthops");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
static int nh_check_attr_group(struct net *net, struct nlattr *tb[],
struct netlink_ext_ack *extack)
{
unsigned int len = nla_len(tb[NHA_GROUP]);
+ u8 nh_family = AF_UNSPEC;
struct nexthop_grp *nhg;
unsigned int i, j;
+ u8 nhg_fdb = 0;
if (len & (sizeof(struct nexthop_grp) - 1)) {
NL_SET_ERR_MSG(extack,
}
}
+ if (tb[NHA_FDB])
+ nhg_fdb = 1;
nhg = nla_data(tb[NHA_GROUP]);
for (i = 0; i < len; ++i) {
struct nexthop *nh;
}
if (!valid_group_nh(nh, len, extack))
return -EINVAL;
+
+ if (nhg_fdb && nh_check_attr_fdb_group(nh, &nh_family, extack))
+ return -EINVAL;
+
+ if (!nhg_fdb && nh->is_fdb_nh) {
+ NL_SET_ERR_MSG(extack, "Non FDB nexthop group cannot have fdb nexthops");
+ return -EINVAL;
+ }
}
for (i = NHA_GROUP + 1; i < __NHA_MAX; ++i) {
if (!tb[i])
continue;
-
+ if (tb[NHA_FDB])
+ continue;
NL_SET_ERR_MSG(extack,
"No other attributes can be set in nexthop groups");
return -EINVAL;
if (hash > atomic_read(&nhge->upper_bound))
continue;
+ if (nhge->nh->is_fdb_nh)
+ return nhge->nh;
+
/* nexthops always check if it is good and does
* not rely on a sysctl for this behavior
*/
{
struct nh_info *nhi;
+ if (nh->is_fdb_nh) {
+ NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop");
+ return -EINVAL;
+ }
+
/* fib6_src is unique to a fib6_info and limits the ability to cache
* routes in fib6_nh within a nexthop that is potentially shared
* across multiple fib entries. If the config wants to use source
{
int err = 0;
+ if (nh->is_fdb_nh) {
+ NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop");
+ err = -EINVAL;
+ goto out;
+ }
+
if (nh->is_group) {
struct nh_group *nhg;
bool do_flush = false;
struct fib_info *fi;
+ call_nexthop_notifiers(net, NEXTHOP_EVENT_DEL, nh);
+
list_for_each_entry(fi, &nh->fi_list, nh_list) {
fi->fib_flags |= RTNH_F_DEAD;
do_flush = true;
nh_group_rebalance(nhg);
}
+ if (cfg->nh_fdb)
+ nh->is_fdb_nh = 1;
+
rcu_assign_pointer(nh->nh_grp, nhg);
return nh;
.fc_encap = cfg->nh_encap,
.fc_encap_type = cfg->nh_encap_type,
};
- u32 tb_id = l3mdev_fib_table(cfg->dev);
+ u32 tb_id = (cfg->dev ? l3mdev_fib_table(cfg->dev) : RT_TABLE_MAIN);
int err;
err = fib_nh_init(net, fib_nh, &fib_cfg, 1, extack);
goto out;
}
+ if (nh->is_fdb_nh)
+ goto out;
+
/* sets nh_dev if successful */
err = fib_check_nh(net, fib_nh, tb_id, 0, extack);
if (!err) {
.fc_flags = cfg->nh_flags,
.fc_encap = cfg->nh_encap,
.fc_encap_type = cfg->nh_encap_type,
+ .fc_is_fdb = cfg->nh_fdb,
};
int err;
nhi->family = cfg->nh_family;
nhi->fib_nhc.nhc_scope = RT_SCOPE_LINK;
+ if (cfg->nh_fdb)
+ nh->is_fdb_nh = 1;
+
if (cfg->nh_blackhole) {
nhi->reject_nh = 1;
cfg->nh_ifindex = net->loopback_dev->ifindex;
}
/* add the entry to the device based hash */
- nexthop_devhash_add(net, nhi);
+ if (!nh->is_fdb_nh)
+ nexthop_devhash_add(net, nhi);
rcu_assign_pointer(nh->nh_info, nhi);
if (tb[NHA_ID])
cfg->nh_id = nla_get_u32(tb[NHA_ID]);
+ if (tb[NHA_FDB]) {
+ if (tb[NHA_OIF] || tb[NHA_BLACKHOLE] ||
+ tb[NHA_ENCAP] || tb[NHA_ENCAP_TYPE]) {
+ NL_SET_ERR_MSG(extack, "Fdb attribute can not be used with encap, oif or blackhole");
+ goto out;
+ }
+ if (nhm->nh_flags) {
+ NL_SET_ERR_MSG(extack, "Unsupported nexthop flags in ancillary header");
+ goto out;
+ }
+ cfg->nh_fdb = nla_get_flag(tb[NHA_FDB]);
+ }
+
if (tb[NHA_GROUP]) {
if (nhm->nh_family != AF_UNSPEC) {
NL_SET_ERR_MSG(extack, "Invalid family for group");
if (tb[NHA_BLACKHOLE]) {
if (tb[NHA_GATEWAY] || tb[NHA_OIF] ||
- tb[NHA_ENCAP] || tb[NHA_ENCAP_TYPE]) {
- NL_SET_ERR_MSG(extack, "Blackhole attribute can not be used with gateway or oif");
+ tb[NHA_ENCAP] || tb[NHA_ENCAP_TYPE] || tb[NHA_FDB]) {
+ NL_SET_ERR_MSG(extack, "Blackhole attribute can not be used with gateway, oif, encap or fdb");
goto out;
}
goto out;
}
- if (!tb[NHA_OIF]) {
- NL_SET_ERR_MSG(extack, "Device attribute required for non-blackhole nexthops");
+ if (!cfg->nh_fdb && !tb[NHA_OIF]) {
+ NL_SET_ERR_MSG(extack, "Device attribute required for non-blackhole and non-fdb nexthops");
goto out;
}
- cfg->nh_ifindex = nla_get_u32(tb[NHA_OIF]);
- if (cfg->nh_ifindex)
- cfg->dev = __dev_get_by_index(net, cfg->nh_ifindex);
+ if (!cfg->nh_fdb && tb[NHA_OIF]) {
+ cfg->nh_ifindex = nla_get_u32(tb[NHA_OIF]);
+ if (cfg->nh_ifindex)
+ cfg->dev = __dev_get_by_index(net, cfg->nh_ifindex);
- if (!cfg->dev) {
- NL_SET_ERR_MSG(extack, "Invalid device index");
- goto out;
- } else if (!(cfg->dev->flags & IFF_UP)) {
- NL_SET_ERR_MSG(extack, "Nexthop device is not up");
- err = -ENETDOWN;
- goto out;
- } else if (!netif_carrier_ok(cfg->dev)) {
- NL_SET_ERR_MSG(extack, "Carrier for nexthop device is down");
- err = -ENETDOWN;
- goto out;
+ if (!cfg->dev) {
+ NL_SET_ERR_MSG(extack, "Invalid device index");
+ goto out;
+ } else if (!(cfg->dev->flags & IFF_UP)) {
+ NL_SET_ERR_MSG(extack, "Nexthop device is not up");
+ err = -ENETDOWN;
+ goto out;
+ } else if (!netif_carrier_ok(cfg->dev)) {
+ NL_SET_ERR_MSG(extack, "Carrier for nexthop device is down");
+ err = -ENETDOWN;
+ goto out;
+ }
}
err = -EINVAL;
static int nh_valid_dump_req(const struct nlmsghdr *nlh, int *dev_idx,
int *master_idx, bool *group_filter,
- struct netlink_callback *cb)
+ bool *fdb_filter, struct netlink_callback *cb)
{
struct netlink_ext_ack *extack = cb->extack;
struct nlattr *tb[NHA_MAX + 1];
case NHA_GROUPS:
*group_filter = true;
break;
+ case NHA_FDB:
+ *fdb_filter = true;
+ break;
default:
NL_SET_ERR_MSG(extack, "Unsupported attribute in dump request");
return -EINVAL;
/* rtnl */
static int rtm_dump_nexthop(struct sk_buff *skb, struct netlink_callback *cb)
{
+ bool group_filter = false, fdb_filter = false;
struct nhmsg *nhm = nlmsg_data(cb->nlh);
int dev_filter_idx = 0, master_idx = 0;
struct net *net = sock_net(skb->sk);
struct rb_root *root = &net->nexthop.rb_root;
- bool group_filter = false;
struct rb_node *node;
int idx = 0, s_idx;
int err;
err = nh_valid_dump_req(cb->nlh, &dev_filter_idx, &master_idx,
- &group_filter, cb);
+ &group_filter, &fdb_filter, cb);
if (err < 0)
return err;
.notifier_call = nh_netdev_event,
};
+int register_nexthop_notifier(struct net *net, struct notifier_block *nb)
+{
+ return atomic_notifier_chain_register(&net->nexthop.notifier_chain, nb);
+}
+EXPORT_SYMBOL(register_nexthop_notifier);
+
+int unregister_nexthop_notifier(struct net *net, struct notifier_block *nb)
+{
+ return atomic_notifier_chain_unregister(&net->nexthop.notifier_chain,
+ nb);
+}
+EXPORT_SYMBOL(unregister_nexthop_notifier);
+
static void __net_exit nexthop_net_exit(struct net *net)
{
rtnl_lock();
net->nexthop.devhash = kzalloc(sz, GFP_KERNEL);
if (!net->nexthop.devhash)
return -ENOMEM;
+ ATOMIC_INIT_NOTIFIER_HEAD(&net->nexthop.notifier_chain);
return 0;
}
/*
* This does both peername and sockname.
*/
-
int inet6_getname(struct socket *sock, struct sockaddr *uaddr,
- int peer)
+ int peer)
{
struct sockaddr_in6 *sin = (struct sockaddr_in6 *)uaddr;
struct sock *sk = sock->sk;
sin->sin6_addr = np->saddr;
else
sin->sin6_addr = sk->sk_v6_rcv_saddr;
-
sin->sin6_port = inet->inet_sport;
}
+ if (cgroup_bpf_enabled)
+ BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin,
+ peer ? BPF_CGROUP_INET6_GETPEERNAME :
+ BPF_CGROUP_INET6_GETSOCKNAME,
+ NULL);
sin->sin6_scope_id = ipv6_iface_scope_id(&sin->sin6_addr,
sk->sk_bound_dev_if);
return sizeof(*sin);
struct ip6_tnl __rcu *collect_md_tun;
};
+static inline int ip6_tnl_mpls_supported(void)
+{
+ return IS_ENABLED(CONFIG_MPLS);
+}
+
static struct net_device_stats *ip6_get_stats(struct net_device *dev)
{
struct pcpu_sw_netstats tmp, sum = { 0 };
return 0;
}
+static int
+mplsip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+ u8 type, u8 code, int offset, __be32 info)
+{
+ __u32 rel_info = ntohl(info);
+ int err, rel_msg = 0;
+ u8 rel_type = type;
+ u8 rel_code = code;
+
+ err = ip6_tnl_err(skb, IPPROTO_MPLS, opt, &rel_type, &rel_code,
+ &rel_msg, &rel_info, offset);
+ return err;
+}
+
static int ip4ip6_dscp_ecn_decapsulate(const struct ip6_tnl *t,
const struct ipv6hdr *ipv6h,
struct sk_buff *skb)
return IP6_ECN_decapsulate(ipv6h, skb);
}
+static inline int mplsip6_dscp_ecn_decapsulate(const struct ip6_tnl *t,
+ const struct ipv6hdr *ipv6h,
+ struct sk_buff *skb)
+{
+ /* ECN is not supported in AF_MPLS */
+ return 0;
+}
+
__u32 ip6_tnl_get_cap(struct ip6_tnl *t,
const struct in6_addr *laddr,
const struct in6_addr *raddr)
.proto = htons(ETH_P_IP),
};
+static const struct tnl_ptk_info tpi_mpls = {
+ /* no tunnel info required for mplsip6. */
+ .proto = htons(ETH_P_MPLS_UC),
+};
+
static int ipxip6_rcv(struct sk_buff *skb, u8 ipproto,
const struct tnl_ptk_info *tpi,
int (*dscp_ecn_decapsulate)(const struct ip6_tnl *t,
ip6ip6_dscp_ecn_decapsulate);
}
+static int mplsip6_rcv(struct sk_buff *skb)
+{
+ return ipxip6_rcv(skb, IPPROTO_MPLS, &tpi_mpls,
+ mplsip6_dscp_ecn_decapsulate);
+}
+
struct ipv6_tel_txoption {
struct ipv6_txoptions ops;
__u8 dst_opt[8];
ipv6_push_frag_opts(skb, &opt.ops, &proto);
}
+ skb_set_inner_ipproto(skb, proto);
+
skb_push(skb, sizeof(struct ipv6hdr));
skb_reset_network_header(skb);
ipv6h = ipv6_hdr(skb);
EXPORT_SYMBOL(ip6_tnl_xmit);
static inline int
-ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
+ipxip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev,
+ u8 protocol)
{
struct ip6_tnl *t = netdev_priv(dev);
+ struct ipv6hdr *ipv6h;
const struct iphdr *iph;
int encap_limit = -1;
+ __u16 offset;
struct flowi6 fl6;
- __u8 dsfield;
+ __u8 dsfield, orig_dsfield;
__u32 mtu;
u8 tproto;
int err;
- iph = ip_hdr(skb);
- memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
-
tproto = READ_ONCE(t->parms.proto);
- if (tproto != IPPROTO_IPIP && tproto != 0)
+ if (tproto != protocol && tproto != 0)
return -1;
if (t->parms.collect_md) {
return -1;
key = &tun_info->key;
memset(&fl6, 0, sizeof(fl6));
- fl6.flowi6_proto = IPPROTO_IPIP;
+ fl6.flowi6_proto = protocol;
fl6.saddr = key->u.ipv6.src;
fl6.daddr = key->u.ipv6.dst;
fl6.flowlabel = key->label;
dsfield = key->tos;
+ switch (protocol) {
+ case IPPROTO_IPIP:
+ iph = ip_hdr(skb);
+ orig_dsfield = ipv4_get_dsfield(iph);
+ break;
+ case IPPROTO_IPV6:
+ ipv6h = ipv6_hdr(skb);
+ orig_dsfield = ipv6_get_dsfield(ipv6h);
+ break;
+ default:
+ orig_dsfield = dsfield;
+ break;
+ }
} else {
if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
encap_limit = t->parms.encap_limit;
+ if (protocol == IPPROTO_IPV6) {
+ offset = ip6_tnl_parse_tlv_enc_lim(skb,
+ skb_network_header(skb));
+ /* ip6_tnl_parse_tlv_enc_lim() might have
+ * reallocated skb->head
+ */
+ if (offset > 0) {
+ struct ipv6_tlv_tnl_enc_lim *tel;
- memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6));
- fl6.flowi6_proto = IPPROTO_IPIP;
-
- if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS)
- dsfield = ipv4_get_dsfield(iph);
- else
- dsfield = ip6_tclass(t->parms.flowinfo);
- if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK)
- fl6.flowi6_mark = skb->mark;
- else
- fl6.flowi6_mark = t->parms.fwmark;
- }
-
- fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL);
- dsfield = INET_ECN_encapsulate(dsfield, ipv4_get_dsfield(iph));
-
- if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6))
- return -1;
-
- skb_set_inner_ipproto(skb, IPPROTO_IPIP);
-
- err = ip6_tnl_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu,
- IPPROTO_IPIP);
- if (err != 0) {
- /* XXX: send ICMP error even if DF is not set. */
- if (err == -EMSGSIZE)
- icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
- htonl(mtu));
- return -1;
- }
-
- return 0;
-}
-
-static inline int
-ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
-{
- struct ip6_tnl *t = netdev_priv(dev);
- struct ipv6hdr *ipv6h;
- int encap_limit = -1;
- __u16 offset;
- struct flowi6 fl6;
- __u8 dsfield;
- __u32 mtu;
- u8 tproto;
- int err;
-
- ipv6h = ipv6_hdr(skb);
- tproto = READ_ONCE(t->parms.proto);
- if ((tproto != IPPROTO_IPV6 && tproto != 0) ||
- ip6_tnl_addr_conflict(t, ipv6h))
- return -1;
-
- if (t->parms.collect_md) {
- struct ip_tunnel_info *tun_info;
- const struct ip_tunnel_key *key;
-
- tun_info = skb_tunnel_info(skb);
- if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
- ip_tunnel_info_af(tun_info) != AF_INET6))
- return -1;
- key = &tun_info->key;
- memset(&fl6, 0, sizeof(fl6));
- fl6.flowi6_proto = IPPROTO_IPV6;
- fl6.saddr = key->u.ipv6.src;
- fl6.daddr = key->u.ipv6.dst;
- fl6.flowlabel = key->label;
- dsfield = key->tos;
- } else {
- offset = ip6_tnl_parse_tlv_enc_lim(skb, skb_network_header(skb));
- /* ip6_tnl_parse_tlv_enc_lim() might have reallocated skb->head */
- ipv6h = ipv6_hdr(skb);
- if (offset > 0) {
- struct ipv6_tlv_tnl_enc_lim *tel;
-
- tel = (void *)&skb_network_header(skb)[offset];
- if (tel->encap_limit == 0) {
- icmpv6_send(skb, ICMPV6_PARAMPROB,
- ICMPV6_HDR_FIELD, offset + 2);
- return -1;
+ tel = (void *)&skb_network_header(skb)[offset];
+ if (tel->encap_limit == 0) {
+ icmpv6_send(skb, ICMPV6_PARAMPROB,
+ ICMPV6_HDR_FIELD, offset + 2);
+ return -1;
+ }
+ encap_limit = tel->encap_limit - 1;
}
- encap_limit = tel->encap_limit - 1;
- } else if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) {
- encap_limit = t->parms.encap_limit;
}
memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6));
- fl6.flowi6_proto = IPPROTO_IPV6;
+ fl6.flowi6_proto = protocol;
- if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS)
- dsfield = ipv6_get_dsfield(ipv6h);
- else
- dsfield = ip6_tclass(t->parms.flowinfo);
- if (t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL)
- fl6.flowlabel |= ip6_flowlabel(ipv6h);
if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK)
fl6.flowi6_mark = skb->mark;
else
fl6.flowi6_mark = t->parms.fwmark;
+ switch (protocol) {
+ case IPPROTO_IPIP:
+ iph = ip_hdr(skb);
+ orig_dsfield = ipv4_get_dsfield(iph);
+ if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS)
+ dsfield = orig_dsfield;
+ else
+ dsfield = ip6_tclass(t->parms.flowinfo);
+ break;
+ case IPPROTO_IPV6:
+ ipv6h = ipv6_hdr(skb);
+ orig_dsfield = ipv6_get_dsfield(ipv6h);
+ if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS)
+ dsfield = orig_dsfield;
+ else
+ dsfield = ip6_tclass(t->parms.flowinfo);
+ if (t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL)
+ fl6.flowlabel |= ip6_flowlabel(ipv6h);
+ break;
+ default:
+ orig_dsfield = dsfield = ip6_tclass(t->parms.flowinfo);
+ break;
+ }
}
fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL);
- dsfield = INET_ECN_encapsulate(dsfield, ipv6_get_dsfield(ipv6h));
+ dsfield = INET_ECN_encapsulate(dsfield, orig_dsfield);
if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6))
return -1;
- skb_set_inner_ipproto(skb, IPPROTO_IPV6);
-
err = ip6_tnl_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu,
- IPPROTO_IPV6);
+ protocol);
if (err != 0) {
+ /* XXX: send ICMP error even if DF is not set. */
if (err == -EMSGSIZE)
- icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+ switch (protocol) {
+ case IPPROTO_IPIP:
+ icmp_send(skb, ICMP_DEST_UNREACH,
+ ICMP_FRAG_NEEDED, htonl(mtu));
+ break;
+ case IPPROTO_IPV6:
+ icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+ break;
+ default:
+ break;
+ }
return -1;
}
{
struct ip6_tnl *t = netdev_priv(dev);
struct net_device_stats *stats = &t->dev->stats;
+ u8 ipproto;
int ret;
if (!pskb_inet_may_pull(skb))
switch (skb->protocol) {
case htons(ETH_P_IP):
- ret = ip4ip6_tnl_xmit(skb, dev);
+ ipproto = IPPROTO_IPIP;
break;
case htons(ETH_P_IPV6):
- ret = ip6ip6_tnl_xmit(skb, dev);
+ if (ip6_tnl_addr_conflict(t, ipv6_hdr(skb)))
+ goto tx_err;
+ ipproto = IPPROTO_IPV6;
+ break;
+ case htons(ETH_P_MPLS_UC):
+ ipproto = IPPROTO_MPLS;
break;
default:
goto tx_err;
}
+ ret = ipxip6_tnl_xmit(skb, dev, ipproto);
if (ret < 0)
goto tx_err;
.priority = 1,
};
+static struct xfrm6_tunnel mplsip6_handler __read_mostly = {
+ .handler = mplsip6_rcv,
+ .err_handler = mplsip6_err,
+ .priority = 1,
+};
+
static void __net_exit ip6_tnl_destroy_tunnels(struct net *net, struct list_head *list)
{
struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);
pr_err("%s: can't register ip6ip6\n", __func__);
goto out_ip6ip6;
}
+
+ if (ip6_tnl_mpls_supported()) {
+ err = xfrm6_tunnel_register(&mplsip6_handler, AF_MPLS);
+ if (err < 0) {
+ pr_err("%s: can't register mplsip6\n", __func__);
+ goto out_mplsip6;
+ }
+ }
+
err = rtnl_link_register(&ip6_link_ops);
if (err < 0)
goto rtnl_link_failed;
return 0;
rtnl_link_failed:
+ if (ip6_tnl_mpls_supported())
+ xfrm6_tunnel_deregister(&mplsip6_handler, AF_MPLS);
+out_mplsip6:
xfrm6_tunnel_deregister(&ip6ip6_handler, AF_INET6);
out_ip6ip6:
xfrm6_tunnel_deregister(&ip4ip6_handler, AF_INET);
if (xfrm6_tunnel_deregister(&ip6ip6_handler, AF_INET6))
pr_info("%s: can't deregister ip6ip6\n", __func__);
+ if (ip6_tnl_mpls_supported() &&
+ xfrm6_tunnel_deregister(&mplsip6_handler, AF_MPLS))
+ pr_info("%s: can't deregister mplsip6\n", __func__);
unregister_pernet_device(&ip6_tnl_net_ops);
}
#ifdef CONFIG_IPV6_ROUTER_PREF
fib6_nh->last_probe = jiffies;
#endif
+ if (cfg->fc_is_fdb) {
+ fib6_nh->fib_nh_gw6 = cfg->fc_gateway;
+ fib6_nh->fib_nh_gw_family = AF_INET6;
+ return 0;
+ }
err = -ENODEV;
if (cfg->fc_ifindex) {
static struct xfrm6_tunnel __rcu *tunnel6_handlers __read_mostly;
static struct xfrm6_tunnel __rcu *tunnel46_handlers __read_mostly;
+static struct xfrm6_tunnel __rcu *tunnelmpls6_handlers __read_mostly;
static DEFINE_MUTEX(tunnel6_mutex);
+static inline int xfrm6_tunnel_mpls_supported(void)
+{
+ return IS_ENABLED(CONFIG_MPLS);
+}
+
int xfrm6_tunnel_register(struct xfrm6_tunnel *handler, unsigned short family)
{
struct xfrm6_tunnel __rcu **pprev;
mutex_lock(&tunnel6_mutex);
- for (pprev = (family == AF_INET6) ? &tunnel6_handlers : &tunnel46_handlers;
- (t = rcu_dereference_protected(*pprev,
+ switch (family) {
+ case AF_INET6:
+ pprev = &tunnel6_handlers;
+ break;
+ case AF_INET:
+ pprev = &tunnel46_handlers;
+ break;
+ case AF_MPLS:
+ pprev = &tunnelmpls6_handlers;
+ break;
+ default:
+ goto err;
+ }
+
+ for (; (t = rcu_dereference_protected(*pprev,
lockdep_is_held(&tunnel6_mutex))) != NULL;
pprev = &t->next) {
if (t->priority > priority)
mutex_lock(&tunnel6_mutex);
- for (pprev = (family == AF_INET6) ? &tunnel6_handlers : &tunnel46_handlers;
- (t = rcu_dereference_protected(*pprev,
+ switch (family) {
+ case AF_INET6:
+ pprev = &tunnel6_handlers;
+ break;
+ case AF_INET:
+ pprev = &tunnel46_handlers;
+ break;
+ case AF_MPLS:
+ pprev = &tunnelmpls6_handlers;
+ break;
+ default:
+ goto err;
+ }
+
+ for (; (t = rcu_dereference_protected(*pprev,
lockdep_is_held(&tunnel6_mutex))) != NULL;
pprev = &t->next) {
if (t == handler) {
}
}
+err:
mutex_unlock(&tunnel6_mutex);
synchronize_net();
handler != NULL; \
handler = rcu_dereference(handler->next)) \
+static int tunnelmpls6_rcv(struct sk_buff *skb)
+{
+ struct xfrm6_tunnel *handler;
+
+ if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
+ goto drop;
+
+ for_each_tunnel_rcu(tunnelmpls6_handlers, handler)
+ if (!handler->handler(skb))
+ return 0;
+
+ icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0);
+
+drop:
+ kfree_skb(skb);
+ return 0;
+}
+
static int tunnel6_rcv(struct sk_buff *skb)
{
struct xfrm6_tunnel *handler;
return -ENOENT;
}
+static int tunnelmpls6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+ u8 type, u8 code, int offset, __be32 info)
+{
+ struct xfrm6_tunnel *handler;
+
+ for_each_tunnel_rcu(tunnelmpls6_handlers, handler)
+ if (!handler->err_handler(skb, opt, type, code, offset, info))
+ return 0;
+
+ return -ENOENT;
+}
+
static const struct inet6_protocol tunnel6_protocol = {
.handler = tunnel6_rcv,
.err_handler = tunnel6_err,
.flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL,
};
+static const struct inet6_protocol tunnelmpls6_protocol = {
+ .handler = tunnelmpls6_rcv,
+ .err_handler = tunnelmpls6_err,
+ .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL,
+};
+
static int __init tunnel6_init(void)
{
if (inet6_add_protocol(&tunnel6_protocol, IPPROTO_IPV6)) {
inet6_del_protocol(&tunnel6_protocol, IPPROTO_IPV6);
return -EAGAIN;
}
+ if (xfrm6_tunnel_mpls_supported() &&
+ inet6_add_protocol(&tunnelmpls6_protocol, IPPROTO_MPLS)) {
+ pr_err("%s: can't add protocol\n", __func__);
+ inet6_del_protocol(&tunnel6_protocol, IPPROTO_IPV6);
+ inet6_del_protocol(&tunnel46_protocol, IPPROTO_IPIP);
+ return -EAGAIN;
+ }
return 0;
}
pr_err("%s: can't remove protocol\n", __func__);
if (inet6_del_protocol(&tunnel6_protocol, IPPROTO_IPV6))
pr_err("%s: can't remove protocol\n", __func__);
+ if (xfrm6_tunnel_mpls_supported() &&
+ inet6_del_protocol(&tunnelmpls6_protocol, IPPROTO_MPLS))
+ pr_err("%s: can't remove protocol\n", __func__);
}
module_init(tunnel6_init);
dev->type == ARPHRD_IPGRE ||
dev->type == ARPHRD_IP6GRE ||
dev->type == ARPHRD_SIT ||
- dev->type == ARPHRD_TUNNEL) {
+ dev->type == ARPHRD_TUNNEL ||
+ dev->type == ARPHRD_TUNNEL6) {
mdev = mpls_add_dev(dev);
if (IS_ERR(mdev))
return notifier_from_errno(PTR_ERR(mdev));
}
EXPORT_SYMBOL_GPL(psample_group_put);
+#ifdef CONFIG_INET
static int __psample_ip_tun_to_nlattr(struct sk_buff *skb,
struct ip_tunnel_info *tun_info)
{
return sum;
}
+#endif
void psample_sample_packet(struct psample_group *group, struct sk_buff *skb,
u32 trunc_size, int in_ifindex, int out_ifindex,
u32 sample_rate)
{
+#ifdef CONFIG_INET
struct ip_tunnel_info *tun_info;
+#endif
struct sk_buff *nl_skb;
int data_len;
int meta_len;
nla_total_size(sizeof(u32)) + /* group_num */
nla_total_size(sizeof(u32)); /* seq */
+#ifdef CONFIG_INET
tun_info = skb_tunnel_info(skb);
if (tun_info)
meta_len += psample_tunnel_meta_len(tun_info);
+#endif
data_len = min(skb->len, trunc_size);
if (meta_len + nla_total_size(data_len) > PSAMPLE_MAX_PACKET_SIZE)
goto error;
}
+#ifdef CONFIG_INET
if (tun_info) {
ret = psample_ip_tun_to_nlattr(nl_skb, tun_info);
if (unlikely(ret < 0))
goto error;
}
+#endif
genlmsg_end(nl_skb, data);
genlmsg_multicast_netns(&psample_nl_family, group->net, nl_skb, 0,
# SPDX-License-Identifier: GPL-2.0-only
-obj-$(CONFIG_XDP_SOCKETS) += xsk.o xdp_umem.o xsk_queue.o
+obj-$(CONFIG_XDP_SOCKETS) += xsk.o xdp_umem.o xsk_queue.o xskmap.o
+obj-$(CONFIG_XDP_SOCKETS) += xsk_buff_pool.o
obj-$(CONFIG_XDP_SOCKETS_DIAG) += xsk_diag.o
umem->zc = false;
}
-static void xdp_umem_unmap_pages(struct xdp_umem *umem)
-{
- unsigned int i;
-
- for (i = 0; i < umem->npgs; i++)
- if (PageHighMem(umem->pgs[i]))
- vunmap(umem->pages[i].addr);
-}
-
-static int xdp_umem_map_pages(struct xdp_umem *umem)
-{
- unsigned int i;
- void *addr;
-
- for (i = 0; i < umem->npgs; i++) {
- if (PageHighMem(umem->pgs[i]))
- addr = vmap(&umem->pgs[i], 1, VM_MAP, PAGE_KERNEL);
- else
- addr = page_address(umem->pgs[i]);
-
- if (!addr) {
- xdp_umem_unmap_pages(umem);
- return -ENOMEM;
- }
-
- umem->pages[i].addr = addr;
- }
-
- return 0;
-}
-
static void xdp_umem_unpin_pages(struct xdp_umem *umem)
{
unpin_user_pages_dirty_lock(umem->pgs, umem->npgs, true);
umem->cq = NULL;
}
- xsk_reuseq_destroy(umem);
-
- xdp_umem_unmap_pages(umem);
+ xp_destroy(umem->pool);
xdp_umem_unpin_pages(umem);
- kvfree(umem->pages);
- umem->pages = NULL;
-
xdp_umem_unaccount_pages(umem);
kfree(umem);
}
if (headroom >= chunk_size - XDP_PACKET_HEADROOM)
return -EINVAL;
- umem->chunk_mask = unaligned_chunks ? XSK_UNALIGNED_BUF_ADDR_MASK
- : ~((u64)chunk_size - 1);
umem->size = size;
umem->headroom = headroom;
- umem->chunk_size_nohr = chunk_size - headroom;
+ umem->chunk_size = chunk_size;
umem->npgs = size / PAGE_SIZE;
umem->pgs = NULL;
umem->user = NULL;
if (err)
goto out_account;
- umem->pages = kvcalloc(umem->npgs, sizeof(*umem->pages),
- GFP_KERNEL_ACCOUNT);
- if (!umem->pages) {
+ umem->pool = xp_create(umem->pgs, umem->npgs, chunks, chunk_size,
+ headroom, size, unaligned_chunks);
+ if (!umem->pool) {
err = -ENOMEM;
goto out_pin;
}
-
- err = xdp_umem_map_pages(umem);
- if (!err)
- return 0;
-
- kvfree(umem->pages);
+ return 0;
out_pin:
xdp_umem_unpin_pages(umem);
#ifndef XDP_UMEM_H_
#define XDP_UMEM_H_
-#include <net/xdp_sock.h>
+#include <net/xdp_sock_drv.h>
int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev,
u16 queue_id, u16 flags);
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/rculist.h>
-#include <net/xdp_sock.h>
+#include <net/xdp_sock_drv.h>
#include <net/xdp.h>
#include "xsk_queue.h"
READ_ONCE(xs->umem->fq);
}
-bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt)
-{
- return xskq_cons_has_entries(umem->fq, cnt);
-}
-EXPORT_SYMBOL(xsk_umem_has_addrs);
-
-bool xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr)
-{
- return xskq_cons_peek_addr(umem->fq, addr, umem);
-}
-EXPORT_SYMBOL(xsk_umem_peek_addr);
-
-void xsk_umem_release_addr(struct xdp_umem *umem)
-{
- xskq_cons_release(umem->fq);
-}
-EXPORT_SYMBOL(xsk_umem_release_addr);
-
void xsk_set_rx_need_wakeup(struct xdp_umem *umem)
{
if (umem->need_wakeup & XDP_WAKEUP_RX)
}
EXPORT_SYMBOL(xsk_umem_uses_need_wakeup);
-/* If a buffer crosses a page boundary, we need to do 2 memcpy's, one for
- * each page. This is only required in copy mode.
- */
-static void __xsk_rcv_memcpy(struct xdp_umem *umem, u64 addr, void *from_buf,
- u32 len, u32 metalen)
+void xp_release(struct xdp_buff_xsk *xskb)
{
- void *to_buf = xdp_umem_get_data(umem, addr);
-
- addr = xsk_umem_add_offset_to_addr(addr);
- if (xskq_cons_crosses_non_contig_pg(umem, addr, len + metalen)) {
- void *next_pg_addr = umem->pages[(addr >> PAGE_SHIFT) + 1].addr;
- u64 page_start = addr & ~(PAGE_SIZE - 1);
- u64 first_len = PAGE_SIZE - (addr - page_start);
-
- memcpy(to_buf, from_buf, first_len);
- memcpy(next_pg_addr, from_buf + first_len,
- len + metalen - first_len);
+ xskb->pool->free_heads[xskb->pool->free_heads_cnt++] = xskb;
+}
- return;
- }
+static u64 xp_get_handle(struct xdp_buff_xsk *xskb)
+{
+ u64 offset = xskb->xdp.data - xskb->xdp.data_hard_start;
- memcpy(to_buf, from_buf, len + metalen);
+ offset += xskb->pool->headroom;
+ if (!xskb->pool->unaligned)
+ return xskb->orig_addr + offset;
+ return xskb->orig_addr + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT);
}
-static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
+static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
{
- u64 offset = xs->umem->headroom;
- u64 addr, memcpy_addr;
- void *from_buf;
- u32 metalen;
+ struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp);
+ u64 addr;
int err;
- if (!xskq_cons_peek_addr(xs->umem->fq, &addr, xs->umem) ||
- len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) {
+ addr = xp_get_handle(xskb);
+ err = xskq_prod_reserve_desc(xs->rx, addr, len);
+ if (err) {
xs->rx_dropped++;
- return -ENOSPC;
+ return err;
}
- if (unlikely(xdp_data_meta_unsupported(xdp))) {
- from_buf = xdp->data;
- metalen = 0;
- } else {
- from_buf = xdp->data_meta;
- metalen = xdp->data - xdp->data_meta;
- }
+ xp_release(xskb);
+ return 0;
+}
- memcpy_addr = xsk_umem_adjust_offset(xs->umem, addr, offset);
- __xsk_rcv_memcpy(xs->umem, memcpy_addr, from_buf, len, metalen);
+static void xsk_copy_xdp(struct xdp_buff *to, struct xdp_buff *from, u32 len)
+{
+ void *from_buf, *to_buf;
+ u32 metalen;
- offset += metalen;
- addr = xsk_umem_adjust_offset(xs->umem, addr, offset);
- err = xskq_prod_reserve_desc(xs->rx, addr, len);
- if (!err) {
- xskq_cons_release(xs->umem->fq);
- xdp_return_buff(xdp);
- return 0;
+ if (unlikely(xdp_data_meta_unsupported(from))) {
+ from_buf = from->data;
+ to_buf = to->data;
+ metalen = 0;
+ } else {
+ from_buf = from->data_meta;
+ metalen = from->data - from->data_meta;
+ to_buf = to->data - metalen;
}
- xs->rx_dropped++;
- return err;
+ memcpy(to_buf, from_buf, len + metalen);
}
-static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
+static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len,
+ bool explicit_free)
{
- int err = xskq_prod_reserve_desc(xs->rx, xdp->handle, len);
+ struct xdp_buff *xsk_xdp;
+ int err;
- if (err)
+ if (len > xsk_umem_get_rx_frame_size(xs->umem)) {
xs->rx_dropped++;
+ return -ENOSPC;
+ }
- return err;
+ xsk_xdp = xsk_buff_alloc(xs->umem);
+ if (!xsk_xdp) {
+ xs->rx_dropped++;
+ return -ENOSPC;
+ }
+
+ xsk_copy_xdp(xsk_xdp, xdp, len);
+ err = __xsk_rcv_zc(xs, xsk_xdp, len);
+ if (err) {
+ xsk_buff_free(xsk_xdp);
+ return err;
+ }
+ if (explicit_free)
+ xdp_return_buff(xdp);
+ return 0;
}
static bool xsk_is_bound(struct xdp_sock *xs)
return false;
}
-static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
+static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp,
+ bool explicit_free)
{
u32 len;
len = xdp->data_end - xdp->data;
- return (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY) ?
- __xsk_rcv_zc(xs, xdp, len) : __xsk_rcv(xs, xdp, len);
+ return xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL ?
+ __xsk_rcv_zc(xs, xdp, len) :
+ __xsk_rcv(xs, xdp, len, explicit_free);
}
static void xsk_flush(struct xdp_sock *xs)
int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
{
- u32 metalen = xdp->data - xdp->data_meta;
- u32 len = xdp->data_end - xdp->data;
- u64 offset = xs->umem->headroom;
- void *buffer;
- u64 addr;
int err;
spin_lock_bh(&xs->rx_lock);
-
- if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index) {
- err = -EINVAL;
- goto out_unlock;
- }
-
- if (!xskq_cons_peek_addr(xs->umem->fq, &addr, xs->umem) ||
- len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) {
- err = -ENOSPC;
- goto out_drop;
- }
-
- addr = xsk_umem_adjust_offset(xs->umem, addr, offset);
- buffer = xdp_umem_get_data(xs->umem, addr);
- memcpy(buffer, xdp->data_meta, len + metalen);
-
- addr = xsk_umem_adjust_offset(xs->umem, addr, metalen);
- err = xskq_prod_reserve_desc(xs->rx, addr, len);
- if (err)
- goto out_drop;
-
- xskq_cons_release(xs->umem->fq);
- xskq_prod_submit(xs->rx);
-
- spin_unlock_bh(&xs->rx_lock);
-
- xs->sk.sk_data_ready(&xs->sk);
- return 0;
-
-out_drop:
- xs->rx_dropped++;
-out_unlock:
+ err = xsk_rcv(xs, xdp, false);
+ xsk_flush(xs);
spin_unlock_bh(&xs->rx_lock);
return err;
}
struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list);
int err;
- err = xsk_rcv(xs, xdp);
+ err = xsk_rcv(xs, xdp, true);
if (err)
return err;
skb_put(skb, len);
addr = desc.addr;
- buffer = xdp_umem_get_data(xs->umem, addr);
+ buffer = xsk_buff_raw_get_data(xs->umem, addr);
err = skb_store_bits(skb, 0, buffer, len);
/* This is the backpressure mechanism for the Tx path.
* Reserve space in the completion queue and only proceed
return sock;
}
-/* Check if umem pages are contiguous.
- * If zero-copy mode, use the DMA address to do the page contiguity check
- * For all other modes we use addr (kernel virtual address)
- * Store the result in the low bits of addr.
- */
-static void xsk_check_page_contiguity(struct xdp_umem *umem, u32 flags)
-{
- struct xdp_umem_page *pgs = umem->pages;
- int i, is_contig;
-
- for (i = 0; i < umem->npgs - 1; i++) {
- is_contig = (flags & XDP_ZEROCOPY) ?
- (pgs[i].dma + PAGE_SIZE == pgs[i + 1].dma) :
- (pgs[i].addr + PAGE_SIZE == pgs[i + 1].addr);
- pgs[i].addr += is_contig << XSK_NEXT_PG_CONTIG_SHIFT;
- }
-}
-
static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
{
struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr;
goto out_unlock;
} else {
/* This xsk has its own umem. */
- xskq_set_umem(xs->umem->fq, xs->umem->size,
- xs->umem->chunk_mask);
- xskq_set_umem(xs->umem->cq, xs->umem->size,
- xs->umem->chunk_mask);
-
err = xdp_umem_assign_dev(xs->umem, dev, qid, flags);
if (err)
goto out_unlock;
-
- xsk_check_page_contiguity(xs->umem, flags);
}
xs->dev = dev;
xs->zc = xs->umem->zc;
xs->queue_id = qid;
- xskq_set_umem(xs->rx, xs->umem->size, xs->umem->chunk_mask);
- xskq_set_umem(xs->tx, xs->umem->size, xs->umem->chunk_mask);
xdp_add_sk_umem(xs->umem, xs);
out_unlock:
q = (optname == XDP_UMEM_FILL_RING) ? &xs->umem->fq :
&xs->umem->cq;
err = xsk_init_queue(entries, q, true);
+ if (optname == XDP_UMEM_FILL_RING)
+ xp_set_fq(xs->umem->pool, *q);
mutex_unlock(&xs->mutex);
return err;
}
#ifndef XSK_H_
#define XSK_H_
+/* Masks for xdp_umem_page flags.
+ * The low 12-bits of the addr will be 0 since this is the page address, so we
+ * can use them for flags.
+ */
+#define XSK_NEXT_PG_CONTIG_SHIFT 0
+#define XSK_NEXT_PG_CONTIG_MASK BIT_ULL(XSK_NEXT_PG_CONTIG_SHIFT)
+
+/* Flags for the umem flags field.
+ *
+ * The NEED_WAKEUP flag is 1 due to the reuse of the flags field for public
+ * flags. See inlude/uapi/include/linux/if_xdp.h.
+ */
+#define XDP_UMEM_USES_NEED_WAKEUP BIT(1)
+
struct xdp_ring_offset_v1 {
__u64 producer;
__u64 consumer;
struct xdp_ring_offset_v1 cr;
};
+/* Nodes are linked in the struct xdp_sock map_list field, and used to
+ * track which maps a certain socket reside in.
+ */
+
+struct xsk_map_node {
+ struct list_head node;
+ struct xsk_map *map;
+ struct xdp_sock **map_entry;
+};
+
static inline struct xdp_sock *xdp_sk(struct sock *sk)
{
return (struct xdp_sock *)sk;
}
+bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs);
+void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs,
+ struct xdp_sock **map_entry);
+int xsk_map_inc(struct xsk_map *map);
+void xsk_map_put(struct xsk_map *map);
+
#endif /* XSK_H_ */
--- /dev/null
+// SPDX-License-Identifier: GPL-2.0
+
+#include <net/xsk_buff_pool.h>
+#include <net/xdp_sock.h>
+#include <linux/dma-direct.h>
+#include <linux/dma-noncoherent.h>
+#include <linux/swiotlb.h>
+
+#include "xsk_queue.h"
+
+static void xp_addr_unmap(struct xsk_buff_pool *pool)
+{
+ vunmap(pool->addrs);
+}
+
+static int xp_addr_map(struct xsk_buff_pool *pool,
+ struct page **pages, u32 nr_pages)
+{
+ pool->addrs = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
+ if (!pool->addrs)
+ return -ENOMEM;
+ return 0;
+}
+
+void xp_destroy(struct xsk_buff_pool *pool)
+{
+ if (!pool)
+ return;
+
+ xp_addr_unmap(pool);
+ kvfree(pool->heads);
+ kvfree(pool);
+}
+
+struct xsk_buff_pool *xp_create(struct page **pages, u32 nr_pages, u32 chunks,
+ u32 chunk_size, u32 headroom, u64 size,
+ bool unaligned)
+{
+ struct xsk_buff_pool *pool;
+ struct xdp_buff_xsk *xskb;
+ int err;
+ u32 i;
+
+ pool = kvzalloc(struct_size(pool, free_heads, chunks), GFP_KERNEL);
+ if (!pool)
+ goto out;
+
+ pool->heads = kvcalloc(chunks, sizeof(*pool->heads), GFP_KERNEL);
+ if (!pool->heads)
+ goto out;
+
+ pool->chunk_mask = ~((u64)chunk_size - 1);
+ pool->addrs_cnt = size;
+ pool->heads_cnt = chunks;
+ pool->free_heads_cnt = chunks;
+ pool->headroom = headroom;
+ pool->chunk_size = chunk_size;
+ pool->cheap_dma = true;
+ pool->unaligned = unaligned;
+ pool->frame_len = chunk_size - headroom - XDP_PACKET_HEADROOM;
+ INIT_LIST_HEAD(&pool->free_list);
+
+ for (i = 0; i < pool->free_heads_cnt; i++) {
+ xskb = &pool->heads[i];
+ xskb->pool = pool;
+ xskb->xdp.frame_sz = chunk_size - headroom;
+ pool->free_heads[i] = xskb;
+ }
+
+ err = xp_addr_map(pool, pages, nr_pages);
+ if (!err)
+ return pool;
+
+out:
+ xp_destroy(pool);
+ return NULL;
+}
+
+void xp_set_fq(struct xsk_buff_pool *pool, struct xsk_queue *fq)
+{
+ pool->fq = fq;
+}
+
+void xp_set_rxq_info(struct xsk_buff_pool *pool, struct xdp_rxq_info *rxq)
+{
+ u32 i;
+
+ for (i = 0; i < pool->heads_cnt; i++)
+ pool->heads[i].xdp.rxq = rxq;
+}
+EXPORT_SYMBOL(xp_set_rxq_info);
+
+void xp_dma_unmap(struct xsk_buff_pool *pool, unsigned long attrs)
+{
+ dma_addr_t *dma;
+ u32 i;
+
+ if (pool->dma_pages_cnt == 0)
+ return;
+
+ for (i = 0; i < pool->dma_pages_cnt; i++) {
+ dma = &pool->dma_pages[i];
+ if (*dma) {
+ dma_unmap_page_attrs(pool->dev, *dma, PAGE_SIZE,
+ DMA_BIDIRECTIONAL, attrs);
+ *dma = 0;
+ }
+ }
+
+ kvfree(pool->dma_pages);
+ pool->dma_pages_cnt = 0;
+ pool->dev = NULL;
+}
+EXPORT_SYMBOL(xp_dma_unmap);
+
+static void xp_check_dma_contiguity(struct xsk_buff_pool *pool)
+{
+ u32 i;
+
+ for (i = 0; i < pool->dma_pages_cnt - 1; i++) {
+ if (pool->dma_pages[i] + PAGE_SIZE == pool->dma_pages[i + 1])
+ pool->dma_pages[i] |= XSK_NEXT_PG_CONTIG_MASK;
+ else
+ pool->dma_pages[i] &= ~XSK_NEXT_PG_CONTIG_MASK;
+ }
+}
+
+static bool __maybe_unused xp_check_swiotlb_dma(struct xsk_buff_pool *pool)
+{
+#if defined(CONFIG_SWIOTLB)
+ phys_addr_t paddr;
+ u32 i;
+
+ for (i = 0; i < pool->dma_pages_cnt; i++) {
+ paddr = dma_to_phys(pool->dev, pool->dma_pages[i]);
+ if (is_swiotlb_buffer(paddr))
+ return false;
+ }
+#endif
+ return true;
+}
+
+static bool xp_check_cheap_dma(struct xsk_buff_pool *pool)
+{
+#if defined(CONFIG_HAS_DMA)
+ const struct dma_map_ops *ops = get_dma_ops(pool->dev);
+
+ if (ops) {
+ return !ops->sync_single_for_cpu &&
+ !ops->sync_single_for_device;
+ }
+
+ if (!dma_is_direct(ops))
+ return false;
+
+ if (!xp_check_swiotlb_dma(pool))
+ return false;
+
+ if (!dev_is_dma_coherent(pool->dev)) {
+#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \
+ defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) || \
+ defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE)
+ return false;
+#endif
+ }
+#endif
+ return true;
+}
+
+int xp_dma_map(struct xsk_buff_pool *pool, struct device *dev,
+ unsigned long attrs, struct page **pages, u32 nr_pages)
+{
+ dma_addr_t dma;
+ u32 i;
+
+ pool->dma_pages = kvcalloc(nr_pages, sizeof(*pool->dma_pages),
+ GFP_KERNEL);
+ if (!pool->dma_pages)
+ return -ENOMEM;
+
+ pool->dev = dev;
+ pool->dma_pages_cnt = nr_pages;
+
+ for (i = 0; i < pool->dma_pages_cnt; i++) {
+ dma = dma_map_page_attrs(dev, pages[i], 0, PAGE_SIZE,
+ DMA_BIDIRECTIONAL, attrs);
+ if (dma_mapping_error(dev, dma)) {
+ xp_dma_unmap(pool, attrs);
+ return -ENOMEM;
+ }
+ pool->dma_pages[i] = dma;
+ }
+
+ if (pool->unaligned)
+ xp_check_dma_contiguity(pool);
+
+ pool->dev = dev;
+ pool->cheap_dma = xp_check_cheap_dma(pool);
+ return 0;
+}
+EXPORT_SYMBOL(xp_dma_map);
+
+static bool xp_addr_crosses_non_contig_pg(struct xsk_buff_pool *pool,
+ u64 addr)
+{
+ return xp_desc_crosses_non_contig_pg(pool, addr, pool->chunk_size);
+}
+
+static bool xp_check_unaligned(struct xsk_buff_pool *pool, u64 *addr)
+{
+ *addr = xp_unaligned_extract_addr(*addr);
+ if (*addr >= pool->addrs_cnt ||
+ *addr + pool->chunk_size > pool->addrs_cnt ||
+ xp_addr_crosses_non_contig_pg(pool, *addr))
+ return false;
+ return true;
+}
+
+static bool xp_check_aligned(struct xsk_buff_pool *pool, u64 *addr)
+{
+ *addr = xp_aligned_extract_addr(pool, *addr);
+ return *addr < pool->addrs_cnt;
+}
+
+static struct xdp_buff_xsk *__xp_alloc(struct xsk_buff_pool *pool)
+{
+ struct xdp_buff_xsk *xskb;
+ u64 addr;
+ bool ok;
+
+ if (pool->free_heads_cnt == 0)
+ return NULL;
+
+ xskb = pool->free_heads[--pool->free_heads_cnt];
+
+ for (;;) {
+ if (!xskq_cons_peek_addr_unchecked(pool->fq, &addr)) {
+ xp_release(xskb);
+ return NULL;
+ }
+
+ ok = pool->unaligned ? xp_check_unaligned(pool, &addr) :
+ xp_check_aligned(pool, &addr);
+ if (!ok) {
+ pool->fq->invalid_descs++;
+ xskq_cons_release(pool->fq);
+ continue;
+ }
+ break;
+ }
+ xskq_cons_release(pool->fq);
+
+ xskb->orig_addr = addr;
+ xskb->xdp.data_hard_start = pool->addrs + addr + pool->headroom;
+ if (pool->dma_pages_cnt) {
+ xskb->frame_dma = (pool->dma_pages[addr >> PAGE_SHIFT] &
+ ~XSK_NEXT_PG_CONTIG_MASK) +
+ (addr & ~PAGE_MASK);
+ xskb->dma = xskb->frame_dma + pool->headroom +
+ XDP_PACKET_HEADROOM;
+ }
+ return xskb;
+}
+
+struct xdp_buff *xp_alloc(struct xsk_buff_pool *pool)
+{
+ struct xdp_buff_xsk *xskb;
+
+ if (!pool->free_list_cnt) {
+ xskb = __xp_alloc(pool);
+ if (!xskb)
+ return NULL;
+ } else {
+ pool->free_list_cnt--;
+ xskb = list_first_entry(&pool->free_list, struct xdp_buff_xsk,
+ free_list_node);
+ list_del(&xskb->free_list_node);
+ }
+
+ xskb->xdp.data = xskb->xdp.data_hard_start + XDP_PACKET_HEADROOM;
+ xskb->xdp.data_meta = xskb->xdp.data;
+
+ if (!pool->cheap_dma) {
+ dma_sync_single_range_for_device(pool->dev, xskb->dma, 0,
+ pool->frame_len,
+ DMA_BIDIRECTIONAL);
+ }
+ return &xskb->xdp;
+}
+EXPORT_SYMBOL(xp_alloc);
+
+bool xp_can_alloc(struct xsk_buff_pool *pool, u32 count)
+{
+ if (pool->free_list_cnt >= count)
+ return true;
+ return xskq_cons_has_entries(pool->fq, count - pool->free_list_cnt);
+}
+EXPORT_SYMBOL(xp_can_alloc);
+
+void xp_free(struct xdp_buff_xsk *xskb)
+{
+ xskb->pool->free_list_cnt++;
+ list_add(&xskb->free_list_node, &xskb->pool->free_list);
+}
+EXPORT_SYMBOL(xp_free);
+
+void *xp_raw_get_data(struct xsk_buff_pool *pool, u64 addr)
+{
+ addr = pool->unaligned ? xp_unaligned_add_offset_to_addr(addr) : addr;
+ return pool->addrs + addr;
+}
+EXPORT_SYMBOL(xp_raw_get_data);
+
+dma_addr_t xp_raw_get_dma(struct xsk_buff_pool *pool, u64 addr)
+{
+ addr = pool->unaligned ? xp_unaligned_add_offset_to_addr(addr) : addr;
+ return (pool->dma_pages[addr >> PAGE_SHIFT] &
+ ~XSK_NEXT_PG_CONTIG_MASK) +
+ (addr & ~PAGE_MASK);
+}
+EXPORT_SYMBOL(xp_raw_get_dma);
+
+void xp_dma_sync_for_cpu_slow(struct xdp_buff_xsk *xskb)
+{
+ dma_sync_single_range_for_cpu(xskb->pool->dev, xskb->dma, 0,
+ xskb->pool->frame_len, DMA_BIDIRECTIONAL);
+}
+EXPORT_SYMBOL(xp_dma_sync_for_cpu_slow);
+
+void xp_dma_sync_for_device_slow(struct xsk_buff_pool *pool, dma_addr_t dma,
+ size_t size)
+{
+ dma_sync_single_range_for_device(pool->dev, dma, 0,
+ size, DMA_BIDIRECTIONAL);
+}
+EXPORT_SYMBOL(xp_dma_sync_for_device_slow);
du.id = umem->id;
du.size = umem->size;
du.num_pages = umem->npgs;
- du.chunk_size = umem->chunk_size_nohr + umem->headroom;
+ du.chunk_size = umem->chunk_size;
du.headroom = umem->headroom;
du.ifindex = umem->dev ? umem->dev->ifindex : 0;
du.queue_id = umem->queue_id;
#include <linux/log2.h>
#include <linux/slab.h>
#include <linux/overflow.h>
+#include <net/xdp_sock_drv.h>
#include "xsk_queue.h"
-void xskq_set_umem(struct xsk_queue *q, u64 umem_size, u64 chunk_mask)
-{
- if (!q)
- return;
-
- q->umem_size = umem_size;
- q->chunk_mask = chunk_mask;
-}
-
static size_t xskq_get_ring_size(struct xsk_queue *q, bool umem_queue)
{
struct xdp_umem_ring *umem_ring;
page_frag_free(q->ring);
kfree(q);
}
-
-struct xdp_umem_fq_reuse *xsk_reuseq_prepare(u32 nentries)
-{
- struct xdp_umem_fq_reuse *newq;
-
- /* Check for overflow */
- if (nentries > (u32)roundup_pow_of_two(nentries))
- return NULL;
- nentries = roundup_pow_of_two(nentries);
-
- newq = kvmalloc(struct_size(newq, handles, nentries), GFP_KERNEL);
- if (!newq)
- return NULL;
- memset(newq, 0, offsetof(typeof(*newq), handles));
-
- newq->nentries = nentries;
- return newq;
-}
-EXPORT_SYMBOL_GPL(xsk_reuseq_prepare);
-
-struct xdp_umem_fq_reuse *xsk_reuseq_swap(struct xdp_umem *umem,
- struct xdp_umem_fq_reuse *newq)
-{
- struct xdp_umem_fq_reuse *oldq = umem->fq_reuse;
-
- if (!oldq) {
- umem->fq_reuse = newq;
- return NULL;
- }
-
- if (newq->nentries < oldq->length)
- return newq;
-
- memcpy(newq->handles, oldq->handles,
- array_size(oldq->length, sizeof(u64)));
- newq->length = oldq->length;
-
- umem->fq_reuse = newq;
- return oldq;
-}
-EXPORT_SYMBOL_GPL(xsk_reuseq_swap);
-
-void xsk_reuseq_free(struct xdp_umem_fq_reuse *rq)
-{
- kvfree(rq);
-}
-EXPORT_SYMBOL_GPL(xsk_reuseq_free);
-
-void xsk_reuseq_destroy(struct xdp_umem *umem)
-{
- xsk_reuseq_free(umem->fq_reuse);
- umem->fq_reuse = NULL;
-}
#include <linux/types.h>
#include <linux/if_xdp.h>
#include <net/xdp_sock.h>
+#include <net/xsk_buff_pool.h>
+
+#include "xsk.h"
struct xdp_ring {
u32 producer ____cacheline_aligned_in_smp;
};
struct xsk_queue {
- u64 chunk_mask;
- u64 umem_size;
u32 ring_mask;
u32 nentries;
u32 cached_prod;
/* Functions that read and validate content from consumer rings. */
-static inline bool xskq_cons_crosses_non_contig_pg(struct xdp_umem *umem,
- u64 addr,
- u64 length)
+static inline bool xskq_cons_read_addr_unchecked(struct xsk_queue *q, u64 *addr)
{
- bool cross_pg = (addr & (PAGE_SIZE - 1)) + length > PAGE_SIZE;
- bool next_pg_contig =
- (unsigned long)umem->pages[(addr >> PAGE_SHIFT)].addr &
- XSK_NEXT_PG_CONTIG_MASK;
-
- return cross_pg && !next_pg_contig;
-}
+ struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring;
-static inline bool xskq_cons_is_valid_unaligned(struct xsk_queue *q,
- u64 addr,
- u64 length,
- struct xdp_umem *umem)
-{
- u64 base_addr = xsk_umem_extract_addr(addr);
+ if (q->cached_cons != q->cached_prod) {
+ u32 idx = q->cached_cons & q->ring_mask;
- addr = xsk_umem_add_offset_to_addr(addr);
- if (base_addr >= q->umem_size || addr >= q->umem_size ||
- xskq_cons_crosses_non_contig_pg(umem, addr, length)) {
- q->invalid_descs++;
- return false;
+ *addr = ring->desc[idx];
+ return true;
}
- return true;
+ return false;
}
-static inline bool xskq_cons_is_valid_addr(struct xsk_queue *q, u64 addr)
+static inline bool xp_aligned_validate_desc(struct xsk_buff_pool *pool,
+ struct xdp_desc *desc)
{
- if (addr >= q->umem_size) {
- q->invalid_descs++;
+ u64 chunk, chunk_end;
+
+ chunk = xp_aligned_extract_addr(pool, desc->addr);
+ chunk_end = xp_aligned_extract_addr(pool, desc->addr + desc->len);
+ if (chunk != chunk_end)
+ return false;
+
+ if (chunk >= pool->addrs_cnt)
return false;
- }
+ if (desc->options)
+ return false;
return true;
}
-static inline bool xskq_cons_read_addr(struct xsk_queue *q, u64 *addr,
- struct xdp_umem *umem)
+static inline bool xp_unaligned_validate_desc(struct xsk_buff_pool *pool,
+ struct xdp_desc *desc)
{
- struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring;
-
- while (q->cached_cons != q->cached_prod) {
- u32 idx = q->cached_cons & q->ring_mask;
+ u64 addr, base_addr;
- *addr = ring->desc[idx] & q->chunk_mask;
+ base_addr = xp_unaligned_extract_addr(desc->addr);
+ addr = xp_unaligned_add_offset_to_addr(desc->addr);
- if (umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG) {
- if (xskq_cons_is_valid_unaligned(q, *addr,
- umem->chunk_size_nohr,
- umem))
- return true;
- goto out;
- }
+ if (desc->len > pool->chunk_size)
+ return false;
- if (xskq_cons_is_valid_addr(q, *addr))
- return true;
+ if (base_addr >= pool->addrs_cnt || addr >= pool->addrs_cnt ||
+ xp_desc_crosses_non_contig_pg(pool, addr, desc->len))
+ return false;
-out:
- q->cached_cons++;
- }
+ if (desc->options)
+ return false;
+ return true;
+}
- return false;
+static inline bool xp_validate_desc(struct xsk_buff_pool *pool,
+ struct xdp_desc *desc)
+{
+ return pool->unaligned ? xp_unaligned_validate_desc(pool, desc) :
+ xp_aligned_validate_desc(pool, desc);
}
static inline bool xskq_cons_is_valid_desc(struct xsk_queue *q,
struct xdp_desc *d,
struct xdp_umem *umem)
{
- if (umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG) {
- if (!xskq_cons_is_valid_unaligned(q, d->addr, d->len, umem))
- return false;
-
- if (d->len > umem->chunk_size_nohr || d->options) {
- q->invalid_descs++;
- return false;
- }
-
- return true;
- }
-
- if (!xskq_cons_is_valid_addr(q, d->addr))
- return false;
-
- if (((d->addr + d->len) & q->chunk_mask) != (d->addr & q->chunk_mask) ||
- d->options) {
+ if (!xp_validate_desc(umem->pool, d)) {
q->invalid_descs++;
return false;
}
-
return true;
}
return entries >= cnt;
}
-static inline bool xskq_cons_peek_addr(struct xsk_queue *q, u64 *addr,
- struct xdp_umem *umem)
+static inline bool xskq_cons_peek_addr_unchecked(struct xsk_queue *q, u64 *addr)
{
if (q->cached_prod == q->cached_cons)
xskq_cons_get_entries(q);
- return xskq_cons_read_addr(q, addr, umem);
+ return xskq_cons_read_addr_unchecked(q, addr);
}
static inline bool xskq_cons_peek_desc(struct xsk_queue *q,
return q ? q->invalid_descs : 0;
}
-void xskq_set_umem(struct xsk_queue *q, u64 umem_size, u64 chunk_mask);
struct xsk_queue *xskq_create(u32 nentries, bool umem_queue);
void xskq_destroy(struct xsk_queue *q_ops);
-/* Executed by the core when the entire UMEM gets freed */
-void xsk_reuseq_destroy(struct xdp_umem *umem);
-
#endif /* _LINUX_XSK_QUEUE_H */
--- /dev/null
+// SPDX-License-Identifier: GPL-2.0
+/* XSKMAP used for AF_XDP sockets
+ * Copyright(c) 2018 Intel Corporation.
+ */
+
+#include <linux/bpf.h>
+#include <linux/capability.h>
+#include <net/xdp_sock.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+
+#include "xsk.h"
+
+int xsk_map_inc(struct xsk_map *map)
+{
+ bpf_map_inc(&map->map);
+ return 0;
+}
+
+void xsk_map_put(struct xsk_map *map)
+{
+ bpf_map_put(&map->map);
+}
+
+static struct xsk_map_node *xsk_map_node_alloc(struct xsk_map *map,
+ struct xdp_sock **map_entry)
+{
+ struct xsk_map_node *node;
+ int err;
+
+ node = kzalloc(sizeof(*node), GFP_ATOMIC | __GFP_NOWARN);
+ if (!node)
+ return ERR_PTR(-ENOMEM);
+
+ err = xsk_map_inc(map);
+ if (err) {
+ kfree(node);
+ return ERR_PTR(err);
+ }
+
+ node->map = map;
+ node->map_entry = map_entry;
+ return node;
+}
+
+static void xsk_map_node_free(struct xsk_map_node *node)
+{
+ xsk_map_put(node->map);
+ kfree(node);
+}
+
+static void xsk_map_sock_add(struct xdp_sock *xs, struct xsk_map_node *node)
+{
+ spin_lock_bh(&xs->map_list_lock);
+ list_add_tail(&node->node, &xs->map_list);
+ spin_unlock_bh(&xs->map_list_lock);
+}
+
+static void xsk_map_sock_delete(struct xdp_sock *xs,
+ struct xdp_sock **map_entry)
+{
+ struct xsk_map_node *n, *tmp;
+
+ spin_lock_bh(&xs->map_list_lock);
+ list_for_each_entry_safe(n, tmp, &xs->map_list, node) {
+ if (map_entry == n->map_entry) {
+ list_del(&n->node);
+ xsk_map_node_free(n);
+ }
+ }
+ spin_unlock_bh(&xs->map_list_lock);
+}
+
+static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
+{
+ struct bpf_map_memory mem;
+ int err, numa_node;
+ struct xsk_map *m;
+ u64 size;
+
+ if (!capable(CAP_NET_ADMIN))
+ return ERR_PTR(-EPERM);
+
+ if (attr->max_entries == 0 || attr->key_size != 4 ||
+ attr->value_size != 4 ||
+ attr->map_flags & ~(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY))
+ return ERR_PTR(-EINVAL);
+
+ numa_node = bpf_map_attr_numa_node(attr);
+ size = struct_size(m, xsk_map, attr->max_entries);
+
+ err = bpf_map_charge_init(&mem, size);
+ if (err < 0)
+ return ERR_PTR(err);
+
+ m = bpf_map_area_alloc(size, numa_node);
+ if (!m) {
+ bpf_map_charge_finish(&mem);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ bpf_map_init_from_attr(&m->map, attr);
+ bpf_map_charge_move(&m->map.memory, &mem);
+ spin_lock_init(&m->lock);
+
+ return &m->map;
+}
+
+static void xsk_map_free(struct bpf_map *map)
+{
+ struct xsk_map *m = container_of(map, struct xsk_map, map);
+
+ bpf_clear_redirect_map(map);
+ synchronize_net();
+ bpf_map_area_free(m);
+}
+
+static int xsk_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+ struct xsk_map *m = container_of(map, struct xsk_map, map);
+ u32 index = key ? *(u32 *)key : U32_MAX;
+ u32 *next = next_key;
+
+ if (index >= m->map.max_entries) {
+ *next = 0;
+ return 0;
+ }
+
+ if (index == m->map.max_entries - 1)
+ return -ENOENT;
+ *next = index + 1;
+ return 0;
+}
+
+static u32 xsk_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
+{
+ const int ret = BPF_REG_0, mp = BPF_REG_1, index = BPF_REG_2;
+ struct bpf_insn *insn = insn_buf;
+
+ *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0);
+ *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5);
+ *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(sizeof(struct xsk_sock *)));
+ *insn++ = BPF_ALU64_IMM(BPF_ADD, mp, offsetof(struct xsk_map, xsk_map));
+ *insn++ = BPF_ALU64_REG(BPF_ADD, ret, mp);
+ *insn++ = BPF_LDX_MEM(BPF_SIZEOF(struct xsk_sock *), ret, ret, 0);
+ *insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
+ *insn++ = BPF_MOV64_IMM(ret, 0);
+ return insn - insn_buf;
+}
+
+static void *xsk_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ WARN_ON_ONCE(!rcu_read_lock_held());
+ return __xsk_map_lookup_elem(map, *(u32 *)key);
+}
+
+static void *xsk_map_lookup_elem_sys_only(struct bpf_map *map, void *key)
+{
+ return ERR_PTR(-EOPNOTSUPP);
+}
+
+static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
+{
+ struct xsk_map *m = container_of(map, struct xsk_map, map);
+ struct xdp_sock *xs, *old_xs, **map_entry;
+ u32 i = *(u32 *)key, fd = *(u32 *)value;
+ struct xsk_map_node *node;
+ struct socket *sock;
+ int err;
+
+ if (unlikely(map_flags > BPF_EXIST))
+ return -EINVAL;
+ if (unlikely(i >= m->map.max_entries))
+ return -E2BIG;
+
+ sock = sockfd_lookup(fd, &err);
+ if (!sock)
+ return err;
+
+ if (sock->sk->sk_family != PF_XDP) {
+ sockfd_put(sock);
+ return -EOPNOTSUPP;
+ }
+
+ xs = (struct xdp_sock *)sock->sk;
+
+ if (!xsk_is_setup_for_bpf_map(xs)) {
+ sockfd_put(sock);
+ return -EOPNOTSUPP;
+ }
+
+ map_entry = &m->xsk_map[i];
+ node = xsk_map_node_alloc(m, map_entry);
+ if (IS_ERR(node)) {
+ sockfd_put(sock);
+ return PTR_ERR(node);
+ }
+
+ spin_lock_bh(&m->lock);
+ old_xs = READ_ONCE(*map_entry);
+ if (old_xs == xs) {
+ err = 0;
+ goto out;
+ } else if (old_xs && map_flags == BPF_NOEXIST) {
+ err = -EEXIST;
+ goto out;
+ } else if (!old_xs && map_flags == BPF_EXIST) {
+ err = -ENOENT;
+ goto out;
+ }
+ xsk_map_sock_add(xs, node);
+ WRITE_ONCE(*map_entry, xs);
+ if (old_xs)
+ xsk_map_sock_delete(old_xs, map_entry);
+ spin_unlock_bh(&m->lock);
+ sockfd_put(sock);
+ return 0;
+
+out:
+ spin_unlock_bh(&m->lock);
+ sockfd_put(sock);
+ xsk_map_node_free(node);
+ return err;
+}
+
+static int xsk_map_delete_elem(struct bpf_map *map, void *key)
+{
+ struct xsk_map *m = container_of(map, struct xsk_map, map);
+ struct xdp_sock *old_xs, **map_entry;
+ int k = *(u32 *)key;
+
+ if (k >= map->max_entries)
+ return -EINVAL;
+
+ spin_lock_bh(&m->lock);
+ map_entry = &m->xsk_map[k];
+ old_xs = xchg(map_entry, NULL);
+ if (old_xs)
+ xsk_map_sock_delete(old_xs, map_entry);
+ spin_unlock_bh(&m->lock);
+
+ return 0;
+}
+
+void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs,
+ struct xdp_sock **map_entry)
+{
+ spin_lock_bh(&map->lock);
+ if (READ_ONCE(*map_entry) == xs) {
+ WRITE_ONCE(*map_entry, NULL);
+ xsk_map_sock_delete(xs, map_entry);
+ }
+ spin_unlock_bh(&map->lock);
+}
+
+const struct bpf_map_ops xsk_map_ops = {
+ .map_alloc = xsk_map_alloc,
+ .map_free = xsk_map_free,
+ .map_get_next_key = xsk_map_get_next_key,
+ .map_lookup_elem = xsk_map_lookup_elem,
+ .map_gen_lookup = xsk_map_gen_lookup,
+ .map_lookup_elem_sys_only = xsk_map_lookup_elem_sys_only,
+ .map_update_elem = xsk_map_update_elem,
+ .map_delete_elem = xsk_map_delete_elem,
+ .map_check_btf = map_check_no_btf,
+};
xdp_sample_pkts
xdp_tx_iptunnel
xdpsock
+testfile.img
fds_example-objs := fds_example.o
sockex1-objs := sockex1_user.o
sockex2-objs := sockex2_user.o
-sockex3-objs := bpf_load.o sockex3_user.o
-tracex1-objs := bpf_load.o tracex1_user.o $(TRACE_HELPERS)
-tracex2-objs := bpf_load.o tracex2_user.o
-tracex3-objs := bpf_load.o tracex3_user.o
-tracex4-objs := bpf_load.o tracex4_user.o
-tracex5-objs := bpf_load.o tracex5_user.o $(TRACE_HELPERS)
-tracex6-objs := bpf_load.o tracex6_user.o
-tracex7-objs := bpf_load.o tracex7_user.o
+sockex3-objs := sockex3_user.o
+tracex1-objs := tracex1_user.o $(TRACE_HELPERS)
+tracex2-objs := tracex2_user.o
+tracex3-objs := tracex3_user.o
+tracex4-objs := tracex4_user.o
+tracex5-objs := tracex5_user.o $(TRACE_HELPERS)
+tracex6-objs := tracex6_user.o
+tracex7-objs := tracex7_user.o
test_probe_write_user-objs := bpf_load.o test_probe_write_user_user.o
trace_output-objs := bpf_load.o trace_output_user.o $(TRACE_HELPERS)
lathist-objs := bpf_load.o lathist_user.o
#define MAX_IPS 8192
-struct bpf_map_def SEC("maps") ip_map = {
- .type = BPF_MAP_TYPE_HASH,
- .key_size = sizeof(u64),
- .value_size = sizeof(u32),
- .max_entries = MAX_IPS,
-};
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __type(key, u64);
+ __type(value, u32);
+ __uint(max_entries, MAX_IPS);
+} ip_map SEC(".maps");
SEC("perf_event")
int do_sample(struct bpf_perf_event_data *ctx)
#include "perf-sys.h"
#include "trace_helpers.h"
-#define __must_check
-#include <linux/err.h>
-
#define DEFAULT_FREQ 99
#define DEFAULT_SECS 5
#define MAX_IPS 8192
return 1;
}
links[i] = bpf_program__attach_perf_event(prog, pmu_fd);
- if (IS_ERR(links[i])) {
+ if (libbpf_get_error(links[i])) {
fprintf(stderr, "ERROR: Attach perf event\n");
links[i] = NULL;
close(pmu_fd);
snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
obj = bpf_object__open_file(filename, NULL);
- if (IS_ERR(obj)) {
+ if (libbpf_get_error(obj)) {
fprintf(stderr, "ERROR: opening BPF object file failed\n");
obj = NULL;
goto cleanup;
#define PROG(F) SEC("socket/"__stringify(F)) int bpf_func_##F
-struct bpf_map_def SEC("maps") jmp_table = {
- .type = BPF_MAP_TYPE_PROG_ARRAY,
- .key_size = sizeof(u32),
- .value_size = sizeof(u32),
- .max_entries = 8,
-};
+struct {
+ __uint(type, BPF_MAP_TYPE_PROG_ARRAY);
+ __uint(key_size, sizeof(u32));
+ __uint(value_size, sizeof(u32));
+ __uint(max_entries, 8);
+} jmp_table SEC(".maps");
#define PARSE_VLAN 1
#define PARSE_MPLS 2
struct flow_key_record flow;
};
-struct bpf_map_def SEC("maps") percpu_map = {
- .type = BPF_MAP_TYPE_ARRAY,
- .key_size = sizeof(__u32),
- .value_size = sizeof(struct globals),
- .max_entries = 32,
-};
+struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __type(key, __u32);
+ __type(value, struct globals);
+ __uint(max_entries, 32);
+} percpu_map SEC(".maps");
/* user poor man's per_cpu until native support is ready */
static struct globals *this_cpu_globals(void)
__u64 bytes;
};
-struct bpf_map_def SEC("maps") hash_map = {
- .type = BPF_MAP_TYPE_HASH,
- .key_size = sizeof(struct flow_key_record),
- .value_size = sizeof(struct pair),
- .max_entries = 1024,
-};
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __type(key, struct flow_key_record);
+ __type(value, struct pair);
+ __uint(max_entries, 1024);
+} hash_map SEC(".maps");
static void update_stats(struct __sk_buff *skb, struct globals *g)
{
// SPDX-License-Identifier: GPL-2.0
#include <stdio.h>
#include <assert.h>
-#include <linux/bpf.h>
#include <bpf/bpf.h>
-#include "bpf_load.h"
+#include <bpf/libbpf.h>
#include "sock_example.h"
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/resource.h>
-#define PARSE_IP 3
-#define PARSE_IP_PROG_FD (prog_fd[0])
-#define PROG_ARRAY_FD (map_fd[0])
-
struct flow_key_record {
__be32 src;
__be32 dst;
int main(int argc, char **argv)
{
+ int i, sock, key, fd, main_prog_fd, jmp_table_fd, hash_map_fd;
struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
+ struct bpf_program *prog;
+ struct bpf_object *obj;
char filename[256];
+ const char *title;
FILE *f;
- int i, sock, err, id, key = PARSE_IP;
- struct bpf_prog_info info = {};
- uint32_t info_len = sizeof(info);
snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
setrlimit(RLIMIT_MEMLOCK, &r);
- if (load_bpf_file(filename)) {
- printf("%s", bpf_log_buf);
- return 1;
+ obj = bpf_object__open_file(filename, NULL);
+ if (libbpf_get_error(obj)) {
+ fprintf(stderr, "ERROR: opening BPF object file failed\n");
+ return 0;
+ }
+
+ /* load BPF program */
+ if (bpf_object__load(obj)) {
+ fprintf(stderr, "ERROR: loading BPF object file failed\n");
+ goto cleanup;
+ }
+
+ jmp_table_fd = bpf_object__find_map_fd_by_name(obj, "jmp_table");
+ hash_map_fd = bpf_object__find_map_fd_by_name(obj, "hash_map");
+ if (jmp_table_fd < 0 || hash_map_fd < 0) {
+ fprintf(stderr, "ERROR: finding a map in obj file failed\n");
+ goto cleanup;
}
- /* Test fd array lookup which returns the id of the bpf_prog */
- err = bpf_obj_get_info_by_fd(PARSE_IP_PROG_FD, &info, &info_len);
- assert(!err);
- err = bpf_map_lookup_elem(PROG_ARRAY_FD, &key, &id);
- assert(!err);
- assert(id == info.id);
+ bpf_object__for_each_program(prog, obj) {
+ fd = bpf_program__fd(prog);
+
+ title = bpf_program__title(prog, false);
+ if (sscanf(title, "socket/%d", &key) != 1) {
+ fprintf(stderr, "ERROR: finding prog failed\n");
+ goto cleanup;
+ }
+
+ if (key == 0)
+ main_prog_fd = fd;
+ else
+ bpf_map_update_elem(jmp_table_fd, &key, &fd, BPF_ANY);
+ }
sock = open_raw_sock("lo");
- assert(setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &prog_fd[4],
+ /* attach BPF program to socket */
+ assert(setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &main_prog_fd,
sizeof(__u32)) == 0);
if (argc > 1)
sleep(1);
printf("IP src.port -> dst.port bytes packets\n");
- while (bpf_map_get_next_key(map_fd[2], &key, &next_key) == 0) {
- bpf_map_lookup_elem(map_fd[2], &next_key, &value);
+ while (bpf_map_get_next_key(hash_map_fd, &key, &next_key) == 0) {
+ bpf_map_lookup_elem(hash_map_fd, &next_key, &value);
printf("%s.%05d -> %s.%05d %12lld %12lld\n",
inet_ntoa((struct in_addr){htonl(next_key.src)}),
next_key.port16[0],
key = next_key;
}
}
+
+cleanup:
+ bpf_object__close(obj);
return 0;
}
--- /dev/null
+// SPDX-License-Identifier: GPL-2.0
+#ifndef __TRACE_COMMON_H
+#define __TRACE_COMMON_H
+
+#ifdef __x86_64__
+#define SYSCALL(SYS) "__x64_" __stringify(SYS)
+#elif defined(__s390x__)
+#define SYSCALL(SYS) "__s390x_" __stringify(SYS)
+#else
+#define SYSCALL(SYS) __stringify(SYS)
+#endif
+
+#endif
u32 userstack;
};
-struct bpf_map_def SEC("maps") counts = {
- .type = BPF_MAP_TYPE_HASH,
- .key_size = sizeof(struct key_t),
- .value_size = sizeof(u64),
- .max_entries = 10000,
-};
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __type(key, struct key_t);
+ __type(value, u64);
+ __uint(max_entries, 10000);
+} counts SEC(".maps");
-struct bpf_map_def SEC("maps") stackmap = {
- .type = BPF_MAP_TYPE_STACK_TRACE,
- .key_size = sizeof(u32),
- .value_size = PERF_MAX_STACK_DEPTH * sizeof(u64),
- .max_entries = 10000,
-};
+struct {
+ __uint(type, BPF_MAP_TYPE_STACK_TRACE);
+ __uint(key_size, sizeof(u32));
+ __uint(value_size, PERF_MAX_STACK_DEPTH * sizeof(u64));
+ __uint(max_entries, 10000);
+} stackmap SEC(".maps");
#define KERN_STACKID_FLAGS (0 | BPF_F_FAST_STACK_CMP)
#define USER_STACKID_FLAGS (0 | BPF_F_FAST_STACK_CMP | BPF_F_USER_STACK)
#include "perf-sys.h"
#include "trace_helpers.h"
-#define __must_check
-#include <linux/err.h>
-
#define SAMPLE_FREQ 50
static int pid;
goto all_cpu_err;
}
links[i] = bpf_program__attach_perf_event(prog, pmu_fd);
- if (IS_ERR(links[i])) {
+ if (libbpf_get_error(links[i])) {
printf("bpf_program__attach_perf_event failed\n");
links[i] = NULL;
close(pmu_fd);
goto err;
}
link = bpf_program__attach_perf_event(prog, pmu_fd);
- if (IS_ERR(link)) {
+ if (libbpf_get_error(link)) {
printf("bpf_program__attach_perf_event failed\n");
link = NULL;
close(pmu_fd);
}
obj = bpf_object__open_file(filename, NULL);
- if (IS_ERR(obj)) {
+ if (libbpf_get_error(obj)) {
printf("opening BPF object file failed\n");
obj = NULL;
goto cleanup;
// SPDX-License-Identifier: GPL-2.0
#include <stdio.h>
-#include <linux/bpf.h>
#include <unistd.h>
-#include <bpf/bpf.h>
-#include "bpf_load.h"
+#include <bpf/libbpf.h>
#include "trace_helpers.h"
int main(int ac, char **argv)
{
- FILE *f;
+ struct bpf_link *link = NULL;
+ struct bpf_program *prog;
+ struct bpf_object *obj;
char filename[256];
+ FILE *f;
snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+ obj = bpf_object__open_file(filename, NULL);
+ if (libbpf_get_error(obj)) {
+ fprintf(stderr, "ERROR: opening BPF object file failed\n");
+ return 0;
+ }
+
+ prog = bpf_object__find_program_by_name(obj, "bpf_prog1");
+ if (!prog) {
+ fprintf(stderr, "ERROR: finding a prog in obj file failed\n");
+ goto cleanup;
+ }
+
+ /* load BPF program */
+ if (bpf_object__load(obj)) {
+ fprintf(stderr, "ERROR: loading BPF object file failed\n");
+ goto cleanup;
+ }
- if (load_bpf_file(filename)) {
- printf("%s", bpf_log_buf);
- return 1;
+ link = bpf_program__attach(prog);
+ if (libbpf_get_error(link)) {
+ fprintf(stderr, "ERROR: bpf_program__attach failed\n");
+ link = NULL;
+ goto cleanup;
}
f = popen("taskset 1 ping -c5 localhost", "r");
read_trace_pipe();
+cleanup:
+ bpf_link__destroy(link);
+ bpf_object__close(obj);
return 0;
}
#include <uapi/linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
+#include "trace_common.h"
-struct bpf_map_def SEC("maps") my_map = {
- .type = BPF_MAP_TYPE_HASH,
- .key_size = sizeof(long),
- .value_size = sizeof(long),
- .max_entries = 1024,
-};
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __type(key, long);
+ __type(value, long);
+ __uint(max_entries, 1024);
+} my_map SEC(".maps");
/* kprobe is NOT a stable ABI. If kernel internals change this bpf+kprobe
* example will no longer be meaningful
u64 index;
};
-struct bpf_map_def SEC("maps") my_hist_map = {
- .type = BPF_MAP_TYPE_PERCPU_HASH,
- .key_size = sizeof(struct hist_key),
- .value_size = sizeof(long),
- .max_entries = 1024,
-};
+struct {
+ __uint(type, BPF_MAP_TYPE_PERCPU_HASH);
+ __uint(key_size, sizeof(struct hist_key));
+ __uint(value_size, sizeof(long));
+ __uint(max_entries, 1024);
+} my_hist_map SEC(".maps");
-SEC("kprobe/sys_write")
+SEC("kprobe/" SYSCALL(sys_write))
int bpf_prog3(struct pt_regs *ctx)
{
long write_size = PT_REGS_PARM3(ctx);
#include <unistd.h>
#include <stdlib.h>
#include <signal.h>
-#include <linux/bpf.h>
#include <string.h>
#include <sys/resource.h>
#include <bpf/bpf.h>
-#include "bpf_load.h"
+#include <bpf/libbpf.h>
#include "bpf_util.h"
#define MAX_INDEX 64
#define MAX_STARS 38
+/* my_map, my_hist_map */
+static int map_fd[2];
+
static void stars(char *str, long val, long max, int width)
{
int i;
int main(int ac, char **argv)
{
struct rlimit r = {1024*1024, RLIM_INFINITY};
- char filename[256];
long key, next_key, value;
+ struct bpf_link *links[2];
+ struct bpf_program *prog;
+ struct bpf_object *obj;
+ char filename[256];
+ int i, j = 0;
FILE *f;
- int i;
-
- snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
if (setrlimit(RLIMIT_MEMLOCK, &r)) {
perror("setrlimit(RLIMIT_MEMLOCK)");
return 1;
}
+ snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+ obj = bpf_object__open_file(filename, NULL);
+ if (libbpf_get_error(obj)) {
+ fprintf(stderr, "ERROR: opening BPF object file failed\n");
+ return 0;
+ }
+
+ /* load BPF program */
+ if (bpf_object__load(obj)) {
+ fprintf(stderr, "ERROR: loading BPF object file failed\n");
+ goto cleanup;
+ }
+
+ map_fd[0] = bpf_object__find_map_fd_by_name(obj, "my_map");
+ map_fd[1] = bpf_object__find_map_fd_by_name(obj, "my_hist_map");
+ if (map_fd[0] < 0 || map_fd[1] < 0) {
+ fprintf(stderr, "ERROR: finding a map in obj file failed\n");
+ goto cleanup;
+ }
+
signal(SIGINT, int_exit);
signal(SIGTERM, int_exit);
f = popen("dd if=/dev/zero of=/dev/null count=5000000", "r");
(void) f;
- if (load_bpf_file(filename)) {
- printf("%s", bpf_log_buf);
- return 1;
+ bpf_object__for_each_program(prog, obj) {
+ links[j] = bpf_program__attach(prog);
+ if (libbpf_get_error(links[j])) {
+ fprintf(stderr, "ERROR: bpf_program__attach failed\n");
+ links[j] = NULL;
+ goto cleanup;
+ }
+ j++;
}
for (i = 0; i < 5; i++) {
}
print_hist(map_fd[1]);
+cleanup:
+ for (j--; j >= 0; j--)
+ bpf_link__destroy(links[j]);
+
+ bpf_object__close(obj);
return 0;
}
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
-struct bpf_map_def SEC("maps") my_map = {
- .type = BPF_MAP_TYPE_HASH,
- .key_size = sizeof(long),
- .value_size = sizeof(u64),
- .max_entries = 4096,
-};
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __type(key, long);
+ __type(value, u64);
+ __uint(max_entries, 4096);
+} my_map SEC(".maps");
/* kprobe is NOT a stable ABI. If kernel internals change this bpf+kprobe
* example will no longer be meaningful
#define SLOTS 100
-struct bpf_map_def SEC("maps") lat_map = {
- .type = BPF_MAP_TYPE_PERCPU_ARRAY,
- .key_size = sizeof(u32),
- .value_size = sizeof(u64),
- .max_entries = SLOTS,
-};
+struct {
+ __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+ __uint(key_size, sizeof(u32));
+ __uint(value_size, sizeof(u64));
+ __uint(max_entries, SLOTS);
+} lat_map SEC(".maps");
SEC("kprobe/blk_account_io_completion")
int bpf_prog2(struct pt_regs *ctx)
#include <unistd.h>
#include <stdbool.h>
#include <string.h>
-#include <linux/bpf.h>
#include <sys/resource.h>
#include <bpf/bpf.h>
-#include "bpf_load.h"
+#include <bpf/libbpf.h>
#include "bpf_util.h"
#define SLOTS 100
int main(int ac, char **argv)
{
struct rlimit r = {1024*1024, RLIM_INFINITY};
+ struct bpf_link *links[2];
+ struct bpf_program *prog;
+ struct bpf_object *obj;
char filename[256];
- int i;
-
- snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
-
- if (setrlimit(RLIMIT_MEMLOCK, &r)) {
- perror("setrlimit(RLIMIT_MEMLOCK)");
- return 1;
- }
-
- if (load_bpf_file(filename)) {
- printf("%s", bpf_log_buf);
- return 1;
- }
+ int map_fd, i, j = 0;
for (i = 1; i < ac; i++) {
if (strcmp(argv[i], "-a") == 0) {
}
}
+ if (setrlimit(RLIMIT_MEMLOCK, &r)) {
+ perror("setrlimit(RLIMIT_MEMLOCK)");
+ return 1;
+ }
+
+ snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+ obj = bpf_object__open_file(filename, NULL);
+ if (libbpf_get_error(obj)) {
+ fprintf(stderr, "ERROR: opening BPF object file failed\n");
+ return 0;
+ }
+
+ /* load BPF program */
+ if (bpf_object__load(obj)) {
+ fprintf(stderr, "ERROR: loading BPF object file failed\n");
+ goto cleanup;
+ }
+
+ map_fd = bpf_object__find_map_fd_by_name(obj, "lat_map");
+ if (map_fd < 0) {
+ fprintf(stderr, "ERROR: finding a map in obj file failed\n");
+ goto cleanup;
+ }
+
+ bpf_object__for_each_program(prog, obj) {
+ links[j] = bpf_program__attach(prog);
+ if (libbpf_get_error(links[j])) {
+ fprintf(stderr, "ERROR: bpf_program__attach failed\n");
+ links[j] = NULL;
+ goto cleanup;
+ }
+ j++;
+ }
+
printf(" heatmap of IO latency\n");
if (text_only)
printf(" %s", sym[num_colors - 1]);
for (i = 0; ; i++) {
if (i % 20 == 0)
print_banner();
- print_hist(map_fd[1]);
+ print_hist(map_fd);
sleep(2);
}
+cleanup:
+ for (j--; j >= 0; j--)
+ bpf_link__destroy(links[j]);
+
+ bpf_object__close(obj);
return 0;
}
u64 ip;
};
-struct bpf_map_def SEC("maps") my_map = {
- .type = BPF_MAP_TYPE_HASH,
- .key_size = sizeof(long),
- .value_size = sizeof(struct pair),
- .max_entries = 1000000,
-};
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __type(key, long);
+ __type(value, struct pair);
+ __uint(max_entries, 1000000);
+} my_map SEC(".maps");
/* kprobe is NOT a stable ABI. If kernel internals change this bpf+kprobe
* example will no longer be meaningful
#include <stdbool.h>
#include <string.h>
#include <time.h>
-#include <linux/bpf.h>
#include <sys/resource.h>
#include <bpf/bpf.h>
-#include "bpf_load.h"
+#include <bpf/libbpf.h>
struct pair {
long long val;
key = write(1, "\e[1;1H\e[2J", 12); /* clear screen */
key = -1;
- while (bpf_map_get_next_key(map_fd[0], &key, &next_key) == 0) {
- bpf_map_lookup_elem(map_fd[0], &next_key, &v);
+ while (bpf_map_get_next_key(fd, &key, &next_key) == 0) {
+ bpf_map_lookup_elem(fd, &next_key, &v);
key = next_key;
if (val - v.val < 1000000000ll)
/* object was allocated more then 1 sec ago */
int main(int ac, char **argv)
{
struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
+ struct bpf_link *links[2];
+ struct bpf_program *prog;
+ struct bpf_object *obj;
char filename[256];
- int i;
-
- snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+ int map_fd, i, j = 0;
if (setrlimit(RLIMIT_MEMLOCK, &r)) {
perror("setrlimit(RLIMIT_MEMLOCK, RLIM_INFINITY)");
return 1;
}
- if (load_bpf_file(filename)) {
- printf("%s", bpf_log_buf);
- return 1;
+ snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+ obj = bpf_object__open_file(filename, NULL);
+ if (libbpf_get_error(obj)) {
+ fprintf(stderr, "ERROR: opening BPF object file failed\n");
+ return 0;
+ }
+
+ /* load BPF program */
+ if (bpf_object__load(obj)) {
+ fprintf(stderr, "ERROR: loading BPF object file failed\n");
+ goto cleanup;
+ }
+
+ map_fd = bpf_object__find_map_fd_by_name(obj, "my_map");
+ if (map_fd < 0) {
+ fprintf(stderr, "ERROR: finding a map in obj file failed\n");
+ goto cleanup;
+ }
+
+ bpf_object__for_each_program(prog, obj) {
+ links[j] = bpf_program__attach(prog);
+ if (libbpf_get_error(links[j])) {
+ fprintf(stderr, "ERROR: bpf_program__attach failed\n");
+ links[j] = NULL;
+ goto cleanup;
+ }
+ j++;
}
for (i = 0; ; i++) {
- print_old_objects(map_fd[1]);
+ print_old_objects(map_fd);
sleep(1);
}
+cleanup:
+ for (j--; j >= 0; j--)
+ bpf_link__destroy(links[j]);
+
+ bpf_object__close(obj);
return 0;
}
#define PROG(F) SEC("kprobe/"__stringify(F)) int bpf_func_##F
-struct bpf_map_def SEC("maps") progs = {
- .type = BPF_MAP_TYPE_PROG_ARRAY,
- .key_size = sizeof(u32),
- .value_size = sizeof(u32),
+struct {
+ __uint(type, BPF_MAP_TYPE_PROG_ARRAY);
+ __uint(key_size, sizeof(u32));
+ __uint(value_size, sizeof(u32));
#ifdef __mips__
- .max_entries = 6000, /* MIPS n64 syscalls start at 5000 */
+ __uint(max_entries, 6000); /* MIPS n64 syscalls start at 5000 */
#else
- .max_entries = 1024,
+ __uint(max_entries, 1024);
#endif
-};
+} progs SEC(".maps");
SEC("kprobe/__seccomp_filter")
int bpf_prog1(struct pt_regs *ctx)
// SPDX-License-Identifier: GPL-2.0
#include <stdio.h>
-#include <linux/bpf.h>
+#include <stdlib.h>
#include <unistd.h>
#include <linux/filter.h>
#include <linux/seccomp.h>
#include <sys/prctl.h>
#include <bpf/bpf.h>
-#include "bpf_load.h"
+#include <bpf/libbpf.h>
#include <sys/resource.h>
#include "trace_helpers.h"
+#ifdef __mips__
+#define MAX_ENTRIES 6000 /* MIPS n64 syscalls start at 5000 */
+#else
+#define MAX_ENTRIES 1024
+#endif
+
/* install fake seccomp program to enable seccomp code path inside the kernel,
* so that our kprobe attached to seccomp_phase1() can be triggered
*/
int main(int ac, char **argv)
{
- FILE *f;
- char filename[256];
struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
+ struct bpf_link *link = NULL;
+ struct bpf_program *prog;
+ struct bpf_object *obj;
+ int key, fd, progs_fd;
+ char filename[256];
+ const char *title;
+ FILE *f;
- snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
setrlimit(RLIMIT_MEMLOCK, &r);
- if (load_bpf_file(filename)) {
- printf("%s", bpf_log_buf);
- return 1;
+ snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+ obj = bpf_object__open_file(filename, NULL);
+ if (libbpf_get_error(obj)) {
+ fprintf(stderr, "ERROR: opening BPF object file failed\n");
+ return 0;
+ }
+
+ prog = bpf_object__find_program_by_name(obj, "bpf_prog1");
+ if (!prog) {
+ printf("finding a prog in obj file failed\n");
+ goto cleanup;
+ }
+
+ /* load BPF program */
+ if (bpf_object__load(obj)) {
+ fprintf(stderr, "ERROR: loading BPF object file failed\n");
+ goto cleanup;
+ }
+
+ link = bpf_program__attach(prog);
+ if (libbpf_get_error(link)) {
+ fprintf(stderr, "ERROR: bpf_program__attach failed\n");
+ link = NULL;
+ goto cleanup;
+ }
+
+ progs_fd = bpf_object__find_map_fd_by_name(obj, "progs");
+ if (progs_fd < 0) {
+ fprintf(stderr, "ERROR: finding a map in obj file failed\n");
+ goto cleanup;
+ }
+
+ bpf_object__for_each_program(prog, obj) {
+ title = bpf_program__title(prog, false);
+ /* register only syscalls to PROG_ARRAY */
+ if (sscanf(title, "kprobe/%d", &key) != 1)
+ continue;
+
+ fd = bpf_program__fd(prog);
+ bpf_map_update_elem(progs_fd, &key, &fd, BPF_ANY);
}
install_accept_all_seccomp();
read_trace_pipe();
+cleanup:
+ bpf_link__destroy(link);
+ bpf_object__close(obj);
return 0;
}
#include <uapi/linux/bpf.h>
#include <bpf/bpf_helpers.h>
-struct bpf_map_def SEC("maps") counters = {
- .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
- .key_size = sizeof(int),
- .value_size = sizeof(u32),
- .max_entries = 64,
-};
-struct bpf_map_def SEC("maps") values = {
- .type = BPF_MAP_TYPE_HASH,
- .key_size = sizeof(int),
- .value_size = sizeof(u64),
- .max_entries = 64,
-};
-struct bpf_map_def SEC("maps") values2 = {
- .type = BPF_MAP_TYPE_HASH,
- .key_size = sizeof(int),
- .value_size = sizeof(struct bpf_perf_event_value),
- .max_entries = 64,
-};
+struct {
+ __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
+ __uint(key_size, sizeof(int));
+ __uint(value_size, sizeof(u32));
+ __uint(max_entries, 64);
+} counters SEC(".maps");
+
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __type(key, int);
+ __type(value, u64);
+ __uint(max_entries, 64);
+} values SEC(".maps");
+
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __type(key, int);
+ __type(value, struct bpf_perf_event_value);
+ __uint(max_entries, 64);
+} values2 SEC(".maps");
SEC("kprobe/htab_map_get_next_key")
int bpf_prog1(struct pt_regs *ctx)
#include <assert.h>
#include <fcntl.h>
#include <linux/perf_event.h>
-#include <linux/bpf.h>
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/wait.h>
#include <unistd.h>
-#include "bpf_load.h"
#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
#include "perf-sys.h"
#define SAMPLE_PERIOD 0x7fffffffffffffffULL
+/* counters, values, values2 */
+static int map_fd[3];
+
static void check_on_cpu(int cpu, struct perf_event_attr *attr)
{
struct bpf_perf_event_value value2;
int main(int argc, char **argv)
{
struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
+ struct bpf_link *links[2];
+ struct bpf_program *prog;
+ struct bpf_object *obj;
char filename[256];
+ int i = 0;
+
+ setrlimit(RLIMIT_MEMLOCK, &r);
snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+ obj = bpf_object__open_file(filename, NULL);
+ if (libbpf_get_error(obj)) {
+ fprintf(stderr, "ERROR: opening BPF object file failed\n");
+ return 0;
+ }
- setrlimit(RLIMIT_MEMLOCK, &r);
- if (load_bpf_file(filename)) {
- printf("%s", bpf_log_buf);
- return 1;
+ /* load BPF program */
+ if (bpf_object__load(obj)) {
+ fprintf(stderr, "ERROR: loading BPF object file failed\n");
+ goto cleanup;
+ }
+
+ map_fd[0] = bpf_object__find_map_fd_by_name(obj, "counters");
+ map_fd[1] = bpf_object__find_map_fd_by_name(obj, "values");
+ map_fd[2] = bpf_object__find_map_fd_by_name(obj, "values2");
+ if (map_fd[0] < 0 || map_fd[1] < 0 || map_fd[2] < 0) {
+ fprintf(stderr, "ERROR: finding a map in obj file failed\n");
+ goto cleanup;
+ }
+
+ bpf_object__for_each_program(prog, obj) {
+ links[i] = bpf_program__attach(prog);
+ if (libbpf_get_error(links[i])) {
+ fprintf(stderr, "ERROR: bpf_program__attach failed\n");
+ links[i] = NULL;
+ goto cleanup;
+ }
+ i++;
}
test_bpf_perf_event();
+
+cleanup:
+ for (i--; i >= 0; i--)
+ bpf_link__destroy(links[i]);
+
+ bpf_object__close(obj);
return 0;
}
#define _GNU_SOURCE
#include <stdio.h>
-#include <linux/bpf.h>
#include <unistd.h>
-#include <bpf/bpf.h>
-#include "bpf_load.h"
+#include <bpf/libbpf.h>
int main(int argc, char **argv)
{
- FILE *f;
+ struct bpf_link *link = NULL;
+ struct bpf_program *prog;
+ struct bpf_object *obj;
char filename[256];
char command[256];
- int ret;
+ int ret = 0;
+ FILE *f;
snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+ obj = bpf_object__open_file(filename, NULL);
+ if (libbpf_get_error(obj)) {
+ fprintf(stderr, "ERROR: opening BPF object file failed\n");
+ return 0;
+ }
+
+ prog = bpf_object__find_program_by_name(obj, "bpf_prog1");
+ if (!prog) {
+ fprintf(stderr, "ERROR: finding a prog in obj file failed\n");
+ goto cleanup;
+ }
+
+ /* load BPF program */
+ if (bpf_object__load(obj)) {
+ fprintf(stderr, "ERROR: loading BPF object file failed\n");
+ goto cleanup;
+ }
- if (load_bpf_file(filename)) {
- printf("%s", bpf_log_buf);
- return 1;
+ link = bpf_program__attach(prog);
+ if (libbpf_get_error(link)) {
+ fprintf(stderr, "ERROR: bpf_program__attach failed\n");
+ link = NULL;
+ goto cleanup;
}
snprintf(command, 256, "mount %s tmpmnt/", argv[1]);
f = popen(command, "r");
ret = pclose(f);
+cleanup:
+ bpf_link__destroy(link);
+ bpf_object__close(obj);
return ret ? 0 : 1;
}
#include <time.h>
#include <linux/limits.h>
-#define __must_check
-#include <linux/err.h>
-
#include <arpa/inet.h>
#include <linux/if_link.h>
}
link = bpf_program__attach_tracepoint(prog, tp_category, tp_name);
- if (IS_ERR(link))
+ if (libbpf_get_error(link))
exit(EXIT_FAIL_BPF);
return link;
| *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* }
| *ATTACH_TYPE* := { **ingress** | **egress** | **sock_create** | **sock_ops** | **device** |
| **bind4** | **bind6** | **post_bind4** | **post_bind6** | **connect4** | **connect6** |
-| **sendmsg4** | **sendmsg6** | **recvmsg4** | **recvmsg6** | **sysctl** |
-| **getsockopt** | **setsockopt** }
+| **getpeername4** | **getpeername6** | **getsockname4** | **getsockname6** | **sendmsg4** |
+| **sendmsg6** | **recvmsg4** | **recvmsg6** | **sysctl** | **getsockopt** | **setsockopt** }
| *ATTACH_FLAGS* := { **multi** | **override** }
DESCRIPTION
an unconnected udp6 socket (since 5.2);
**sysctl** sysctl access (since 5.2);
**getsockopt** call to getsockopt (since 5.3);
- **setsockopt** call to setsockopt (since 5.3).
+ **setsockopt** call to setsockopt (since 5.3);
+ **getpeername4** call to getpeername(2) for an inet4 socket (since 5.8);
+ **getpeername6** call to getpeername(2) for an inet6 socket (since 5.8);
+ **getsockname4** call to getsockname(2) for an inet4 socket (since 5.8);
+ **getsockname6** call to getsockname(2) for an inet6 socket (since 5.8).
**bpftool cgroup detach** *CGROUP* *ATTACH_TYPE* *PROG*
Detach *PROG* from the cgroup *CGROUP* and attach type
| **cgroup/sock** | **cgroup/dev** | **lwt_in** | **lwt_out** | **lwt_xmit** |
| **lwt_seg6local** | **sockops** | **sk_skb** | **sk_msg** | **lirc_mode2** |
| **cgroup/bind4** | **cgroup/bind6** | **cgroup/post_bind4** | **cgroup/post_bind6** |
-| **cgroup/connect4** | **cgroup/connect6** | **cgroup/sendmsg4** | **cgroup/sendmsg6** |
+| **cgroup/connect4** | **cgroup/connect6** | **cgroup/getpeername4** | **cgroup/getpeername6** |
+| **cgroup/getsockname4** | **cgroup/getsockname6** | **cgroup/sendmsg4** | **cgroup/sendmsg6** |
| **cgroup/recvmsg4** | **cgroup/recvmsg6** | **cgroup/sysctl** |
| **cgroup/getsockopt** | **cgroup/setsockopt** |
| **struct_ops** | **fentry** | **fexit** | **freplace**
lwt_seg6local sockops sk_skb sk_msg \
lirc_mode2 cgroup/bind4 cgroup/bind6 \
cgroup/connect4 cgroup/connect6 \
+ cgroup/getpeername4 cgroup/getpeername6 \
+ cgroup/getsockname4 cgroup/getsockname6 \
cgroup/sendmsg4 cgroup/sendmsg6 \
cgroup/recvmsg4 cgroup/recvmsg6 \
cgroup/post_bind4 cgroup/post_bind6 \
;;
attach|detach)
local ATTACH_TYPES='ingress egress sock_create sock_ops \
- device bind4 bind6 post_bind4 post_bind6 connect4 \
- connect6 sendmsg4 sendmsg6 recvmsg4 recvmsg6 sysctl \
- getsockopt setsockopt'
+ device bind4 bind6 post_bind4 post_bind6 connect4 connect6 \
+ getpeername4 getpeername6 getsockname4 getsockname6 \
+ sendmsg4 sendmsg6 recvmsg4 recvmsg6 sysctl getsockopt \
+ setsockopt'
local ATTACH_FLAGS='multi override'
local PROG_TYPE='id pinned tag name'
case $prev in
return 0
;;
ingress|egress|sock_create|sock_ops|device|bind4|bind6|\
- post_bind4|post_bind6|connect4|connect6|sendmsg4|\
- sendmsg6|recvmsg4|recvmsg6|sysctl|getsockopt|\
- setsockopt)
+ post_bind4|post_bind6|connect4|connect6|getpeername4|\
+ getpeername6|getsockname4|getsockname6|sendmsg4|sendmsg6|\
+ recvmsg4|recvmsg6|sysctl|getsockopt|setsockopt)
COMPREPLY=( $( compgen -W "$PROG_TYPE" -- \
"$cur" ) )
return 0
" ATTACH_TYPE := { ingress | egress | sock_create |\n" \
" sock_ops | device | bind4 | bind6 |\n" \
" post_bind4 | post_bind6 | connect4 |\n" \
- " connect6 | sendmsg4 | sendmsg6 |\n" \
- " recvmsg4 | recvmsg6 | sysctl |\n" \
- " getsockopt | setsockopt }"
+ " connect6 | getpeername4 | getpeername6 |\n" \
+ " getsockname4 | getsockname6 | sendmsg4 |\n" \
+ " sendmsg6 | recvmsg4 | recvmsg6 |\n" \
+ " sysctl | getsockopt | setsockopt }"
static unsigned int query_flags;
[BPF_CGROUP_INET6_CONNECT] = "connect6",
[BPF_CGROUP_INET4_POST_BIND] = "post_bind4",
[BPF_CGROUP_INET6_POST_BIND] = "post_bind6",
+ [BPF_CGROUP_INET4_GETPEERNAME] = "getpeername4",
+ [BPF_CGROUP_INET6_GETPEERNAME] = "getpeername6",
+ [BPF_CGROUP_INET4_GETSOCKNAME] = "getsockname4",
+ [BPF_CGROUP_INET6_GETSOCKNAME] = "getsockname6",
[BPF_CGROUP_UDP4_SENDMSG] = "sendmsg4",
[BPF_CGROUP_UDP6_SENDMSG] = "sendmsg6",
[BPF_CGROUP_SYSCTL] = "sysctl",
" sk_reuseport | flow_dissector | cgroup/sysctl |\n"
" cgroup/bind4 | cgroup/bind6 | cgroup/post_bind4 |\n"
" cgroup/post_bind6 | cgroup/connect4 | cgroup/connect6 |\n"
- " cgroup/sendmsg4 | cgroup/sendmsg6 | cgroup/recvmsg4 |\n"
- " cgroup/recvmsg6 | cgroup/getsockopt | cgroup/setsockopt |\n"
+ " cgroup/getpeername4 | cgroup/getpeername6 |\n"
+ " cgroup/getsockname4 | cgroup/getsockname6 | cgroup/sendmsg4 |\n"
+ " cgroup/sendmsg6 | cgroup/recvmsg4 | cgroup/recvmsg6 |\n"
+ " cgroup/getsockopt | cgroup/setsockopt |\n"
" struct_ops | fentry | fexit | freplace }\n"
" ATTACH_TYPE := { msg_verdict | stream_verdict | stream_parser |\n"
" flow_dissector }\n"
/* Key of an a BPF_MAP_TYPE_LPM_TRIE entry */
struct bpf_lpm_trie_key {
__u32 prefixlen; /* up to 32 for AF_INET, 128 for AF_INET6 */
- __u8 data[]; /* Arbitrary size */
+ __u8 data[0]; /* Arbitrary size */
};
struct bpf_cgroup_storage_key {
BPF_MODIFY_RETURN,
BPF_LSM_MAC,
BPF_TRACE_ITER,
+ BPF_CGROUP_INET4_GETPEERNAME,
+ BPF_CGROUP_INET6_GETPEERNAME,
+ BPF_CGROUP_INET4_GETSOCKNAME,
+ BPF_CGROUP_INET6_GETSOCKNAME,
__MAX_BPF_ATTACH_TYPE
};
* int bpf_xdp_adjust_tail(struct xdp_buff *xdp_md, int delta)
* Description
* Adjust (move) *xdp_md*\ **->data_end** by *delta* bytes. It is
- * only possible to shrink the packet as of this writing,
- * therefore *delta* must be a negative integer.
+ * possible to both shrink and grow the packet tail.
+ * Shrink done via *delta* being a negative integer.
*
* A call to this helper is susceptible to change the underlying
* packet buffer. Therefore, at load time, all checks on pointers
void hashmap__clear(struct hashmap *map)
{
struct hashmap_entry *cur, *tmp;
- int bkt;
+ size_t bkt;
hashmap__for_each_entry_safe(map, cur, tmp, bkt) {
free(cur);
struct hashmap_entry **new_buckets;
struct hashmap_entry *cur, *tmp;
size_t new_cap_bits, new_cap;
- size_t h;
- int bkt;
+ size_t h, bkt;
new_cap_bits = map->cap_bits + 1;
if (new_cap_bits < HASHMAP_MIN_CAP_BITS)
#else
#include <bits/reg.h>
#endif
-#include "libbpf_internal.h"
static inline size_t hash_bits(size_t h, int bits)
{
BPF_CGROUP_UDP4_RECVMSG),
BPF_EAPROG_SEC("cgroup/recvmsg6", BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
BPF_CGROUP_UDP6_RECVMSG),
+ BPF_EAPROG_SEC("cgroup/getpeername4", BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
+ BPF_CGROUP_INET4_GETPEERNAME),
+ BPF_EAPROG_SEC("cgroup/getpeername6", BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
+ BPF_CGROUP_INET6_GETPEERNAME),
+ BPF_EAPROG_SEC("cgroup/getsockname4", BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
+ BPF_CGROUP_INET4_GETSOCKNAME),
+ BPF_EAPROG_SEC("cgroup/getsockname6", BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
+ BPF_CGROUP_INET6_GETSOCKNAME),
BPF_EAPROG_SEC("cgroup/sysctl", BPF_PROG_TYPE_CGROUP_SYSCTL,
BPF_CGROUP_SYSCTL),
BPF_EAPROG_SEC("cgroup/getsockopt", BPF_PROG_TYPE_CGROUP_SOCKOPT,
==================
BPF Selftest Notes
==================
+General instructions on running selftests can be found in
+`Documentation/bpf/bpf_devel_QA.rst`_.
Additional information about selftest failures are
documented here.
CONFIG_FTRACE_SYSCALLS=y
CONFIG_IPV6_TUNNEL=y
CONFIG_IPV6_GRE=y
+CONFIG_IPV6_SEG6_BPF=y
CONFIG_NET_FOU=m
CONFIG_NET_FOU_IP_TUNNELS=y
CONFIG_IPV6_FOU=m
CONFIG_BPF_JIT=y
CONFIG_BPF_LSM=y
CONFIG_SECURITY=y
+CONFIG_LIRC=y
#include <string.h>
#include <unistd.h>
+#include <arpa/inet.h>
+
#include <sys/epoll.h>
#include <linux/err.h>
.tcp.doff = 5,
};
-int start_server(int family, int type)
+int start_server_with_port(int family, int type, __u16 port)
{
struct sockaddr_storage addr = {};
socklen_t len;
struct sockaddr_in *sin = (void *)&addr;
sin->sin_family = AF_INET;
+ sin->sin_port = htons(port);
len = sizeof(*sin);
} else {
struct sockaddr_in6 *sin6 = (void *)&addr;
sin6->sin6_family = AF_INET6;
+ sin6->sin6_port = htons(port);
len = sizeof(*sin6);
}
return fd;
}
+int start_server(int family, int type)
+{
+ return start_server_with_port(family, type, 0);
+}
+
static const struct timeval timeo_sec = { .tv_sec = 3 };
static const size_t timeo_optlen = sizeof(timeo_sec);
extern struct ipv6_packet pkt_v6;
int start_server(int family, int type);
+int start_server_with_port(int family, int type, __u16 port);
int connect_to_fd(int family, int type, int server_fd);
int connect_fd_to_fd(int client_fd, int server_fd);
int connect_wait(int client_fd);
--- /dev/null
+// SPDX-License-Identifier: GPL-2.0
+#include <test_progs.h>
+
+#define MAX_INSNS 512
+#define MAX_MATCHES 16
+
+struct bpf_reg_match {
+ unsigned int line;
+ const char *match;
+};
+
+struct bpf_align_test {
+ const char *descr;
+ struct bpf_insn insns[MAX_INSNS];
+ enum {
+ UNDEF,
+ ACCEPT,
+ REJECT
+ } result;
+ enum bpf_prog_type prog_type;
+ /* Matches must be in order of increasing line */
+ struct bpf_reg_match matches[MAX_MATCHES];
+};
+
+static struct bpf_align_test tests[] = {
+ /* Four tests of known constants. These aren't staggeringly
+ * interesting since we track exact values now.
+ */
+ {
+ .descr = "mov",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_3, 2),
+ BPF_MOV64_IMM(BPF_REG_3, 4),
+ BPF_MOV64_IMM(BPF_REG_3, 8),
+ BPF_MOV64_IMM(BPF_REG_3, 16),
+ BPF_MOV64_IMM(BPF_REG_3, 32),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .matches = {
+ {1, "R1=ctx(id=0,off=0,imm=0)"},
+ {1, "R10=fp0"},
+ {1, "R3_w=inv2"},
+ {2, "R3_w=inv4"},
+ {3, "R3_w=inv8"},
+ {4, "R3_w=inv16"},
+ {5, "R3_w=inv32"},
+ },
+ },
+ {
+ .descr = "shift",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_3, 1),
+ BPF_ALU64_IMM(BPF_LSH, BPF_REG_3, 1),
+ BPF_ALU64_IMM(BPF_LSH, BPF_REG_3, 1),
+ BPF_ALU64_IMM(BPF_LSH, BPF_REG_3, 1),
+ BPF_ALU64_IMM(BPF_LSH, BPF_REG_3, 1),
+ BPF_ALU64_IMM(BPF_RSH, BPF_REG_3, 4),
+ BPF_MOV64_IMM(BPF_REG_4, 32),
+ BPF_ALU64_IMM(BPF_RSH, BPF_REG_4, 1),
+ BPF_ALU64_IMM(BPF_RSH, BPF_REG_4, 1),
+ BPF_ALU64_IMM(BPF_RSH, BPF_REG_4, 1),
+ BPF_ALU64_IMM(BPF_RSH, BPF_REG_4, 1),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .matches = {
+ {1, "R1=ctx(id=0,off=0,imm=0)"},
+ {1, "R10=fp0"},
+ {1, "R3_w=inv1"},
+ {2, "R3_w=inv2"},
+ {3, "R3_w=inv4"},
+ {4, "R3_w=inv8"},
+ {5, "R3_w=inv16"},
+ {6, "R3_w=inv1"},
+ {7, "R4_w=inv32"},
+ {8, "R4_w=inv16"},
+ {9, "R4_w=inv8"},
+ {10, "R4_w=inv4"},
+ {11, "R4_w=inv2"},
+ },
+ },
+ {
+ .descr = "addsub",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_3, 4),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, 4),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, 2),
+ BPF_MOV64_IMM(BPF_REG_4, 8),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 4),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .matches = {
+ {1, "R1=ctx(id=0,off=0,imm=0)"},
+ {1, "R10=fp0"},
+ {1, "R3_w=inv4"},
+ {2, "R3_w=inv8"},
+ {3, "R3_w=inv10"},
+ {4, "R4_w=inv8"},
+ {5, "R4_w=inv12"},
+ {6, "R4_w=inv14"},
+ },
+ },
+ {
+ .descr = "mul",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_3, 7),
+ BPF_ALU64_IMM(BPF_MUL, BPF_REG_3, 1),
+ BPF_ALU64_IMM(BPF_MUL, BPF_REG_3, 2),
+ BPF_ALU64_IMM(BPF_MUL, BPF_REG_3, 4),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .matches = {
+ {1, "R1=ctx(id=0,off=0,imm=0)"},
+ {1, "R10=fp0"},
+ {1, "R3_w=inv7"},
+ {2, "R3_w=inv7"},
+ {3, "R3_w=inv14"},
+ {4, "R3_w=inv56"},
+ },
+ },
+
+ /* Tests using unknown values */
+#define PREP_PKT_POINTERS \
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, \
+ offsetof(struct __sk_buff, data)), \
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, \
+ offsetof(struct __sk_buff, data_end))
+
+#define LOAD_UNKNOWN(DST_REG) \
+ PREP_PKT_POINTERS, \
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), \
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8), \
+ BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_0, 1), \
+ BPF_EXIT_INSN(), \
+ BPF_LDX_MEM(BPF_B, DST_REG, BPF_REG_2, 0)
+
+ {
+ .descr = "unknown shift",
+ .insns = {
+ LOAD_UNKNOWN(BPF_REG_3),
+ BPF_ALU64_IMM(BPF_LSH, BPF_REG_3, 1),
+ BPF_ALU64_IMM(BPF_LSH, BPF_REG_3, 1),
+ BPF_ALU64_IMM(BPF_LSH, BPF_REG_3, 1),
+ BPF_ALU64_IMM(BPF_LSH, BPF_REG_3, 1),
+ LOAD_UNKNOWN(BPF_REG_4),
+ BPF_ALU64_IMM(BPF_LSH, BPF_REG_4, 5),
+ BPF_ALU64_IMM(BPF_RSH, BPF_REG_4, 1),
+ BPF_ALU64_IMM(BPF_RSH, BPF_REG_4, 1),
+ BPF_ALU64_IMM(BPF_RSH, BPF_REG_4, 1),
+ BPF_ALU64_IMM(BPF_RSH, BPF_REG_4, 1),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .matches = {
+ {7, "R0_w=pkt(id=0,off=8,r=8,imm=0)"},
+ {7, "R3_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"},
+ {8, "R3_w=inv(id=0,umax_value=510,var_off=(0x0; 0x1fe))"},
+ {9, "R3_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
+ {10, "R3_w=inv(id=0,umax_value=2040,var_off=(0x0; 0x7f8))"},
+ {11, "R3_w=inv(id=0,umax_value=4080,var_off=(0x0; 0xff0))"},
+ {18, "R3=pkt_end(id=0,off=0,imm=0)"},
+ {18, "R4_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"},
+ {19, "R4_w=inv(id=0,umax_value=8160,var_off=(0x0; 0x1fe0))"},
+ {20, "R4_w=inv(id=0,umax_value=4080,var_off=(0x0; 0xff0))"},
+ {21, "R4_w=inv(id=0,umax_value=2040,var_off=(0x0; 0x7f8))"},
+ {22, "R4_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
+ {23, "R4_w=inv(id=0,umax_value=510,var_off=(0x0; 0x1fe))"},
+ },
+ },
+ {
+ .descr = "unknown mul",
+ .insns = {
+ LOAD_UNKNOWN(BPF_REG_3),
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_3),
+ BPF_ALU64_IMM(BPF_MUL, BPF_REG_4, 1),
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_3),
+ BPF_ALU64_IMM(BPF_MUL, BPF_REG_4, 2),
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_3),
+ BPF_ALU64_IMM(BPF_MUL, BPF_REG_4, 4),
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_3),
+ BPF_ALU64_IMM(BPF_MUL, BPF_REG_4, 8),
+ BPF_ALU64_IMM(BPF_MUL, BPF_REG_4, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .matches = {
+ {7, "R3_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"},
+ {8, "R4_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"},
+ {9, "R4_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"},
+ {10, "R4_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"},
+ {11, "R4_w=inv(id=0,umax_value=510,var_off=(0x0; 0x1fe))"},
+ {12, "R4_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"},
+ {13, "R4_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
+ {14, "R4_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"},
+ {15, "R4_w=inv(id=0,umax_value=2040,var_off=(0x0; 0x7f8))"},
+ {16, "R4_w=inv(id=0,umax_value=4080,var_off=(0x0; 0xff0))"},
+ },
+ },
+ {
+ .descr = "packet const offset",
+ .insns = {
+ PREP_PKT_POINTERS,
+ BPF_MOV64_REG(BPF_REG_5, BPF_REG_2),
+
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+
+ /* Skip over ethernet header. */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_5, 14),
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_5),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 4),
+ BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_4, 1),
+ BPF_EXIT_INSN(),
+
+ BPF_LDX_MEM(BPF_B, BPF_REG_4, BPF_REG_5, 0),
+ BPF_LDX_MEM(BPF_B, BPF_REG_4, BPF_REG_5, 1),
+ BPF_LDX_MEM(BPF_B, BPF_REG_4, BPF_REG_5, 2),
+ BPF_LDX_MEM(BPF_B, BPF_REG_4, BPF_REG_5, 3),
+ BPF_LDX_MEM(BPF_H, BPF_REG_4, BPF_REG_5, 0),
+ BPF_LDX_MEM(BPF_H, BPF_REG_4, BPF_REG_5, 2),
+ BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_5, 0),
+
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .matches = {
+ {4, "R5_w=pkt(id=0,off=0,r=0,imm=0)"},
+ {5, "R5_w=pkt(id=0,off=14,r=0,imm=0)"},
+ {6, "R4_w=pkt(id=0,off=14,r=0,imm=0)"},
+ {10, "R2=pkt(id=0,off=0,r=18,imm=0)"},
+ {10, "R5=pkt(id=0,off=14,r=18,imm=0)"},
+ {10, "R4_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"},
+ {14, "R4_w=inv(id=0,umax_value=65535,var_off=(0x0; 0xffff))"},
+ {15, "R4_w=inv(id=0,umax_value=65535,var_off=(0x0; 0xffff))"},
+ },
+ },
+ {
+ .descr = "packet variable offset",
+ .insns = {
+ LOAD_UNKNOWN(BPF_REG_6),
+ BPF_ALU64_IMM(BPF_LSH, BPF_REG_6, 2),
+
+ /* First, add a constant to the R5 packet pointer,
+ * then a variable with a known alignment.
+ */
+ BPF_MOV64_REG(BPF_REG_5, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_5, 14),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_5, BPF_REG_6),
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_5),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 4),
+ BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_4, 1),
+ BPF_EXIT_INSN(),
+ BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_5, 0),
+
+ /* Now, test in the other direction. Adding first
+ * the variable offset to R5, then the constant.
+ */
+ BPF_MOV64_REG(BPF_REG_5, BPF_REG_2),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_5, BPF_REG_6),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_5, 14),
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_5),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 4),
+ BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_4, 1),
+ BPF_EXIT_INSN(),
+ BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_5, 0),
+
+ /* Test multiple accumulations of unknown values
+ * into a packet pointer.
+ */
+ BPF_MOV64_REG(BPF_REG_5, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_5, 14),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_5, BPF_REG_6),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_5, 4),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_5, BPF_REG_6),
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_5),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 4),
+ BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_4, 1),
+ BPF_EXIT_INSN(),
+ BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_5, 0),
+
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .matches = {
+ /* Calculated offset in R6 has unknown value, but known
+ * alignment of 4.
+ */
+ {8, "R2_w=pkt(id=0,off=0,r=8,imm=0)"},
+ {8, "R6_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
+ /* Offset is added to packet pointer R5, resulting in
+ * known fixed offset, and variable offset from R6.
+ */
+ {11, "R5_w=pkt(id=1,off=14,r=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
+ /* At the time the word size load is performed from R5,
+ * it's total offset is NET_IP_ALIGN + reg->off (0) +
+ * reg->aux_off (14) which is 16. Then the variable
+ * offset is considered using reg->aux_off_align which
+ * is 4 and meets the load's requirements.
+ */
+ {15, "R4=pkt(id=1,off=18,r=18,umax_value=1020,var_off=(0x0; 0x3fc))"},
+ {15, "R5=pkt(id=1,off=14,r=18,umax_value=1020,var_off=(0x0; 0x3fc))"},
+ /* Variable offset is added to R5 packet pointer,
+ * resulting in auxiliary alignment of 4.
+ */
+ {18, "R5_w=pkt(id=2,off=0,r=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
+ /* Constant offset is added to R5, resulting in
+ * reg->off of 14.
+ */
+ {19, "R5_w=pkt(id=2,off=14,r=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
+ /* At the time the word size load is performed from R5,
+ * its total fixed offset is NET_IP_ALIGN + reg->off
+ * (14) which is 16. Then the variable offset is 4-byte
+ * aligned, so the total offset is 4-byte aligned and
+ * meets the load's requirements.
+ */
+ {23, "R4=pkt(id=2,off=18,r=18,umax_value=1020,var_off=(0x0; 0x3fc))"},
+ {23, "R5=pkt(id=2,off=14,r=18,umax_value=1020,var_off=(0x0; 0x3fc))"},
+ /* Constant offset is added to R5 packet pointer,
+ * resulting in reg->off value of 14.
+ */
+ {26, "R5_w=pkt(id=0,off=14,r=8"},
+ /* Variable offset is added to R5, resulting in a
+ * variable offset of (4n).
+ */
+ {27, "R5_w=pkt(id=3,off=14,r=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
+ /* Constant is added to R5 again, setting reg->off to 18. */
+ {28, "R5_w=pkt(id=3,off=18,r=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
+ /* And once more we add a variable; resulting var_off
+ * is still (4n), fixed offset is not changed.
+ * Also, we create a new reg->id.
+ */
+ {29, "R5_w=pkt(id=4,off=18,r=0,umax_value=2040,var_off=(0x0; 0x7fc)"},
+ /* At the time the word size load is performed from R5,
+ * its total fixed offset is NET_IP_ALIGN + reg->off (18)
+ * which is 20. Then the variable offset is (4n), so
+ * the total offset is 4-byte aligned and meets the
+ * load's requirements.
+ */
+ {33, "R4=pkt(id=4,off=22,r=22,umax_value=2040,var_off=(0x0; 0x7fc)"},
+ {33, "R5=pkt(id=4,off=18,r=22,umax_value=2040,var_off=(0x0; 0x7fc)"},
+ },
+ },
+ {
+ .descr = "packet variable offset 2",
+ .insns = {
+ /* Create an unknown offset, (4n+2)-aligned */
+ LOAD_UNKNOWN(BPF_REG_6),
+ BPF_ALU64_IMM(BPF_LSH, BPF_REG_6, 2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, 14),
+ /* Add it to the packet pointer */
+ BPF_MOV64_REG(BPF_REG_5, BPF_REG_2),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_5, BPF_REG_6),
+ /* Check bounds and perform a read */
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_5),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 4),
+ BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_4, 1),
+ BPF_EXIT_INSN(),
+ BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_5, 0),
+ /* Make a (4n) offset from the value we just read */
+ BPF_ALU64_IMM(BPF_AND, BPF_REG_6, 0xff),
+ BPF_ALU64_IMM(BPF_LSH, BPF_REG_6, 2),
+ /* Add it to the packet pointer */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_5, BPF_REG_6),
+ /* Check bounds and perform a read */
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_5),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 4),
+ BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_4, 1),
+ BPF_EXIT_INSN(),
+ BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_5, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .matches = {
+ /* Calculated offset in R6 has unknown value, but known
+ * alignment of 4.
+ */
+ {8, "R2_w=pkt(id=0,off=0,r=8,imm=0)"},
+ {8, "R6_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
+ /* Adding 14 makes R6 be (4n+2) */
+ {9, "R6_w=inv(id=0,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc))"},
+ /* Packet pointer has (4n+2) offset */
+ {11, "R5_w=pkt(id=1,off=0,r=0,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc)"},
+ {13, "R4=pkt(id=1,off=4,r=0,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc)"},
+ /* At the time the word size load is performed from R5,
+ * its total fixed offset is NET_IP_ALIGN + reg->off (0)
+ * which is 2. Then the variable offset is (4n+2), so
+ * the total offset is 4-byte aligned and meets the
+ * load's requirements.
+ */
+ {15, "R5=pkt(id=1,off=0,r=4,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc)"},
+ /* Newly read value in R6 was shifted left by 2, so has
+ * known alignment of 4.
+ */
+ {18, "R6_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
+ /* Added (4n) to packet pointer's (4n+2) var_off, giving
+ * another (4n+2).
+ */
+ {19, "R5_w=pkt(id=2,off=0,r=0,umin_value=14,umax_value=2054,var_off=(0x2; 0xffc)"},
+ {21, "R4=pkt(id=2,off=4,r=0,umin_value=14,umax_value=2054,var_off=(0x2; 0xffc)"},
+ /* At the time the word size load is performed from R5,
+ * its total fixed offset is NET_IP_ALIGN + reg->off (0)
+ * which is 2. Then the variable offset is (4n+2), so
+ * the total offset is 4-byte aligned and meets the
+ * load's requirements.
+ */
+ {23, "R5=pkt(id=2,off=0,r=4,umin_value=14,umax_value=2054,var_off=(0x2; 0xffc)"},
+ },
+ },
+ {
+ .descr = "dubious pointer arithmetic",
+ .insns = {
+ PREP_PKT_POINTERS,
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ /* (ptr - ptr) << 2 */
+ BPF_MOV64_REG(BPF_REG_5, BPF_REG_3),
+ BPF_ALU64_REG(BPF_SUB, BPF_REG_5, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_LSH, BPF_REG_5, 2),
+ /* We have a (4n) value. Let's make a packet offset
+ * out of it. First add 14, to make it a (4n+2)
+ */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_5, 14),
+ /* Then make sure it's nonnegative */
+ BPF_JMP_IMM(BPF_JSGE, BPF_REG_5, 0, 1),
+ BPF_EXIT_INSN(),
+ /* Add it to packet pointer */
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_2),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_6, BPF_REG_5),
+ /* Check bounds and perform a read */
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_6),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 4),
+ BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_4, 1),
+ BPF_EXIT_INSN(),
+ BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_6, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = REJECT,
+ .matches = {
+ {4, "R5_w=pkt_end(id=0,off=0,imm=0)"},
+ /* (ptr - ptr) << 2 == unknown, (4n) */
+ {6, "R5_w=inv(id=0,smax_value=9223372036854775804,umax_value=18446744073709551612,var_off=(0x0; 0xfffffffffffffffc)"},
+ /* (4n) + 14 == (4n+2). We blow our bounds, because
+ * the add could overflow.
+ */
+ {7, "R5_w=inv(id=0,smin_value=-9223372036854775806,smax_value=9223372036854775806,umin_value=2,umax_value=18446744073709551614,var_off=(0x2; 0xfffffffffffffffc)"},
+ /* Checked s>=0 */
+ {9, "R5=inv(id=0,umin_value=2,umax_value=9223372034707292158,var_off=(0x2; 0x7fffffff7ffffffc)"},
+ /* packet pointer + nonnegative (4n+2) */
+ {11, "R6_w=pkt(id=1,off=0,r=0,umin_value=2,umax_value=9223372034707292158,var_off=(0x2; 0x7fffffff7ffffffc)"},
+ {13, "R4_w=pkt(id=1,off=4,r=0,umin_value=2,umax_value=9223372034707292158,var_off=(0x2; 0x7fffffff7ffffffc)"},
+ /* NET_IP_ALIGN + (4n+2) == (4n), alignment is fine.
+ * We checked the bounds, but it might have been able
+ * to overflow if the packet pointer started in the
+ * upper half of the address space.
+ * So we did not get a 'range' on R6, and the access
+ * attempt will fail.
+ */
+ {15, "R6_w=pkt(id=1,off=0,r=0,umin_value=2,umax_value=9223372034707292158,var_off=(0x2; 0x7fffffff7ffffffc)"},
+ }
+ },
+ {
+ .descr = "variable subtraction",
+ .insns = {
+ /* Create an unknown offset, (4n+2)-aligned */
+ LOAD_UNKNOWN(BPF_REG_6),
+ BPF_MOV64_REG(BPF_REG_7, BPF_REG_6),
+ BPF_ALU64_IMM(BPF_LSH, BPF_REG_6, 2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, 14),
+ /* Create another unknown, (4n)-aligned, and subtract
+ * it from the first one
+ */
+ BPF_ALU64_IMM(BPF_LSH, BPF_REG_7, 2),
+ BPF_ALU64_REG(BPF_SUB, BPF_REG_6, BPF_REG_7),
+ /* Bounds-check the result */
+ BPF_JMP_IMM(BPF_JSGE, BPF_REG_6, 0, 1),
+ BPF_EXIT_INSN(),
+ /* Add it to the packet pointer */
+ BPF_MOV64_REG(BPF_REG_5, BPF_REG_2),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_5, BPF_REG_6),
+ /* Check bounds and perform a read */
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_5),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 4),
+ BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_4, 1),
+ BPF_EXIT_INSN(),
+ BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_5, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .matches = {
+ /* Calculated offset in R6 has unknown value, but known
+ * alignment of 4.
+ */
+ {7, "R2_w=pkt(id=0,off=0,r=8,imm=0)"},
+ {9, "R6_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
+ /* Adding 14 makes R6 be (4n+2) */
+ {10, "R6_w=inv(id=0,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc))"},
+ /* New unknown value in R7 is (4n) */
+ {11, "R7_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
+ /* Subtracting it from R6 blows our unsigned bounds */
+ {12, "R6=inv(id=0,smin_value=-1006,smax_value=1034,umin_value=2,umax_value=18446744073709551614,var_off=(0x2; 0xfffffffffffffffc)"},
+ /* Checked s>= 0 */
+ {14, "R6=inv(id=0,umin_value=2,umax_value=1034,var_off=(0x2; 0x7fc))"},
+ /* At the time the word size load is performed from R5,
+ * its total fixed offset is NET_IP_ALIGN + reg->off (0)
+ * which is 2. Then the variable offset is (4n+2), so
+ * the total offset is 4-byte aligned and meets the
+ * load's requirements.
+ */
+ {20, "R5=pkt(id=1,off=0,r=4,umin_value=2,umax_value=1034,var_off=(0x2; 0x7fc)"},
+
+ },
+ },
+ {
+ .descr = "pointer variable subtraction",
+ .insns = {
+ /* Create an unknown offset, (4n+2)-aligned and bounded
+ * to [14,74]
+ */
+ LOAD_UNKNOWN(BPF_REG_6),
+ BPF_MOV64_REG(BPF_REG_7, BPF_REG_6),
+ BPF_ALU64_IMM(BPF_AND, BPF_REG_6, 0xf),
+ BPF_ALU64_IMM(BPF_LSH, BPF_REG_6, 2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, 14),
+ /* Subtract it from the packet pointer */
+ BPF_MOV64_REG(BPF_REG_5, BPF_REG_2),
+ BPF_ALU64_REG(BPF_SUB, BPF_REG_5, BPF_REG_6),
+ /* Create another unknown, (4n)-aligned and >= 74.
+ * That in fact means >= 76, since 74 % 4 == 2
+ */
+ BPF_ALU64_IMM(BPF_LSH, BPF_REG_7, 2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, 76),
+ /* Add it to the packet pointer */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_5, BPF_REG_7),
+ /* Check bounds and perform a read */
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_5),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 4),
+ BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_4, 1),
+ BPF_EXIT_INSN(),
+ BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_5, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .matches = {
+ /* Calculated offset in R6 has unknown value, but known
+ * alignment of 4.
+ */
+ {7, "R2_w=pkt(id=0,off=0,r=8,imm=0)"},
+ {10, "R6_w=inv(id=0,umax_value=60,var_off=(0x0; 0x3c))"},
+ /* Adding 14 makes R6 be (4n+2) */
+ {11, "R6_w=inv(id=0,umin_value=14,umax_value=74,var_off=(0x2; 0x7c))"},
+ /* Subtracting from packet pointer overflows ubounds */
+ {13, "R5_w=pkt(id=1,off=0,r=8,umin_value=18446744073709551542,umax_value=18446744073709551602,var_off=(0xffffffffffffff82; 0x7c)"},
+ /* New unknown value in R7 is (4n), >= 76 */
+ {15, "R7_w=inv(id=0,umin_value=76,umax_value=1096,var_off=(0x0; 0x7fc))"},
+ /* Adding it to packet pointer gives nice bounds again */
+ {16, "R5_w=pkt(id=2,off=0,r=0,umin_value=2,umax_value=1082,var_off=(0x2; 0xfffffffc)"},
+ /* At the time the word size load is performed from R5,
+ * its total fixed offset is NET_IP_ALIGN + reg->off (0)
+ * which is 2. Then the variable offset is (4n+2), so
+ * the total offset is 4-byte aligned and meets the
+ * load's requirements.
+ */
+ {20, "R5=pkt(id=2,off=0,r=4,umin_value=2,umax_value=1082,var_off=(0x2; 0xfffffffc)"},
+ },
+ },
+};
+
+static int probe_filter_length(const struct bpf_insn *fp)
+{
+ int len;
+
+ for (len = MAX_INSNS - 1; len > 0; --len)
+ if (fp[len].code != 0 || fp[len].imm != 0)
+ break;
+ return len + 1;
+}
+
+static char bpf_vlog[32768];
+
+static int do_test_single(struct bpf_align_test *test)
+{
+ struct bpf_insn *prog = test->insns;
+ int prog_type = test->prog_type;
+ char bpf_vlog_copy[32768];
+ const char *line_ptr;
+ int cur_line = -1;
+ int prog_len, i;
+ int fd_prog;
+ int ret;
+
+ prog_len = probe_filter_length(prog);
+ fd_prog = bpf_verify_program(prog_type ? : BPF_PROG_TYPE_SOCKET_FILTER,
+ prog, prog_len, BPF_F_STRICT_ALIGNMENT,
+ "GPL", 0, bpf_vlog, sizeof(bpf_vlog), 2);
+ if (fd_prog < 0 && test->result != REJECT) {
+ printf("Failed to load program.\n");
+ printf("%s", bpf_vlog);
+ ret = 1;
+ } else if (fd_prog >= 0 && test->result == REJECT) {
+ printf("Unexpected success to load!\n");
+ printf("%s", bpf_vlog);
+ ret = 1;
+ close(fd_prog);
+ } else {
+ ret = 0;
+ /* We make a local copy so that we can strtok() it */
+ strncpy(bpf_vlog_copy, bpf_vlog, sizeof(bpf_vlog_copy));
+ line_ptr = strtok(bpf_vlog_copy, "\n");
+ for (i = 0; i < MAX_MATCHES; i++) {
+ struct bpf_reg_match m = test->matches[i];
+
+ if (!m.match)
+ break;
+ while (line_ptr) {
+ cur_line = -1;
+ sscanf(line_ptr, "%u: ", &cur_line);
+ if (cur_line == m.line)
+ break;
+ line_ptr = strtok(NULL, "\n");
+ }
+ if (!line_ptr) {
+ printf("Failed to find line %u for match: %s\n",
+ m.line, m.match);
+ ret = 1;
+ printf("%s", bpf_vlog);
+ break;
+ }
+ if (!strstr(line_ptr, m.match)) {
+ printf("Failed to find match %u: %s\n",
+ m.line, m.match);
+ ret = 1;
+ printf("%s", bpf_vlog);
+ break;
+ }
+ }
+ if (fd_prog >= 0)
+ close(fd_prog);
+ }
+ return ret;
+}
+
+void test_align(void)
+{
+ unsigned int i;
+
+ for (i = 0; i < ARRAY_SIZE(tests); i++) {
+ struct bpf_align_test *test = &tests[i];
+
+ if (!test__start_subtest(test->descr))
+ continue;
+
+ CHECK_FAIL(do_test_single(test));
+ }
+}
#include "cgroup_helpers.h"
#include "network_helpers.h"
-static int verify_port(int family, int fd, int expected)
+static int verify_ports(int family, int fd,
+ __u16 expected_local, __u16 expected_peer)
{
struct sockaddr_storage addr;
socklen_t len = sizeof(addr);
else
port = ((struct sockaddr_in6 *)&addr)->sin6_port;
- if (ntohs(port) != expected) {
- log_err("Unexpected port %d, expected %d", ntohs(port),
- expected);
+ if (ntohs(port) != expected_local) {
+ log_err("Unexpected local port %d, expected %d", ntohs(port),
+ expected_local);
+ return -1;
+ }
+
+ if (getpeername(fd, (struct sockaddr *)&addr, &len)) {
+ log_err("Failed to get peer addr");
+ return -1;
+ }
+
+ if (family == AF_INET)
+ port = ((struct sockaddr_in *)&addr)->sin_port;
+ else
+ port = ((struct sockaddr_in6 *)&addr)->sin6_port;
+
+ if (ntohs(port) != expected_peer) {
+ log_err("Unexpected peer port %d, expected %d", ntohs(port),
+ expected_peer);
return -1;
}
static int run_test(int cgroup_fd, int server_fd, int family, int type)
{
+ bool v4 = family == AF_INET;
+ __u16 expected_local_port = v4 ? 22222 : 22223;
+ __u16 expected_peer_port = 60000;
struct bpf_prog_load_attr attr = {
- .prog_type = BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
+ .file = v4 ? "./connect_force_port4.o" :
+ "./connect_force_port6.o",
};
+ struct bpf_program *prog;
struct bpf_object *obj;
- int expected_port;
- int prog_fd;
- int err;
- int fd;
-
- if (family == AF_INET) {
- attr.file = "./connect_force_port4.o";
- attr.expected_attach_type = BPF_CGROUP_INET4_CONNECT;
- expected_port = 22222;
- } else {
- attr.file = "./connect_force_port6.o";
- attr.expected_attach_type = BPF_CGROUP_INET6_CONNECT;
- expected_port = 22223;
- }
+ int xlate_fd, fd, err;
+ __u32 duration = 0;
- err = bpf_prog_load_xattr(&attr, &obj, &prog_fd);
+ err = bpf_prog_load_xattr(&attr, &obj, &xlate_fd);
if (err) {
log_err("Failed to load BPF object");
return -1;
}
- err = bpf_prog_attach(prog_fd, cgroup_fd, attr.expected_attach_type,
- 0);
+ prog = bpf_object__find_program_by_title(obj, v4 ?
+ "cgroup/connect4" :
+ "cgroup/connect6");
+ if (CHECK(!prog, "find_prog", "connect prog not found\n")) {
+ err = -EIO;
+ goto close_bpf_object;
+ }
+
+ err = bpf_prog_attach(bpf_program__fd(prog), cgroup_fd, v4 ?
+ BPF_CGROUP_INET4_CONNECT :
+ BPF_CGROUP_INET6_CONNECT, 0);
+ if (err) {
+ log_err("Failed to attach BPF program");
+ goto close_bpf_object;
+ }
+
+ prog = bpf_object__find_program_by_title(obj, v4 ?
+ "cgroup/getpeername4" :
+ "cgroup/getpeername6");
+ if (CHECK(!prog, "find_prog", "getpeername prog not found\n")) {
+ err = -EIO;
+ goto close_bpf_object;
+ }
+
+ err = bpf_prog_attach(bpf_program__fd(prog), cgroup_fd, v4 ?
+ BPF_CGROUP_INET4_GETPEERNAME :
+ BPF_CGROUP_INET6_GETPEERNAME, 0);
+ if (err) {
+ log_err("Failed to attach BPF program");
+ goto close_bpf_object;
+ }
+
+ prog = bpf_object__find_program_by_title(obj, v4 ?
+ "cgroup/getsockname4" :
+ "cgroup/getsockname6");
+ if (CHECK(!prog, "find_prog", "getsockname prog not found\n")) {
+ err = -EIO;
+ goto close_bpf_object;
+ }
+
+ err = bpf_prog_attach(bpf_program__fd(prog), cgroup_fd, v4 ?
+ BPF_CGROUP_INET4_GETSOCKNAME :
+ BPF_CGROUP_INET6_GETSOCKNAME, 0);
if (err) {
log_err("Failed to attach BPF program");
goto close_bpf_object;
goto close_bpf_object;
}
- err = verify_port(family, fd, expected_port);
-
+ err = verify_ports(family, fd, expected_local_port,
+ expected_peer_port);
close(fd);
close_bpf_object:
if (CHECK_FAIL(cgroup_fd < 0))
return;
- server_fd = start_server(AF_INET, SOCK_STREAM);
+ server_fd = start_server_with_port(AF_INET, SOCK_STREAM, 60123);
if (CHECK_FAIL(server_fd < 0))
goto close_cgroup_fd;
CHECK_FAIL(run_test(cgroup_fd, server_fd, AF_INET, SOCK_STREAM));
close(server_fd);
- server_fd = start_server(AF_INET6, SOCK_STREAM);
+ server_fd = start_server_with_port(AF_INET6, SOCK_STREAM, 60124);
if (CHECK_FAIL(server_fd < 0))
goto close_cgroup_fd;
CHECK_FAIL(run_test(cgroup_fd, server_fd, AF_INET6, SOCK_STREAM));
close(server_fd);
- server_fd = start_server(AF_INET, SOCK_DGRAM);
+ server_fd = start_server_with_port(AF_INET, SOCK_DGRAM, 60123);
if (CHECK_FAIL(server_fd < 0))
goto close_cgroup_fd;
CHECK_FAIL(run_test(cgroup_fd, server_fd, AF_INET, SOCK_DGRAM));
close(server_fd);
- server_fd = start_server(AF_INET6, SOCK_DGRAM);
+ server_fd = start_server_with_port(AF_INET6, SOCK_DGRAM, 60124);
if (CHECK_FAIL(server_fd < 0))
goto close_cgroup_fd;
CHECK_FAIL(run_test(cgroup_fd, server_fd, AF_INET6, SOCK_DGRAM));
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2020 Facebook */
+/* "undefine" structs in vmlinux.h, because we "override" them below */
+#define bpf_iter_meta bpf_iter_meta___not_used
+#define bpf_iter__bpf_map bpf_iter__bpf_map___not_used
#include "vmlinux.h"
+#undef bpf_iter_meta
+#undef bpf_iter__bpf_map
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
char _license[] SEC("license") = "GPL";
+struct bpf_iter_meta {
+ struct seq_file *seq;
+ __u64 session_id;
+ __u64 seq_num;
+} __attribute__((preserve_access_index));
+
+struct bpf_iter__bpf_map {
+ struct bpf_iter_meta *meta;
+ struct bpf_map *map;
+} __attribute__((preserve_access_index));
+
SEC("iter/bpf_map")
int dump_bpf_map(struct bpf_iter__bpf_map *ctx)
{
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2020 Facebook */
+/* "undefine" structs in vmlinux.h, because we "override" them below */
+#define bpf_iter_meta bpf_iter_meta___not_used
+#define bpf_iter__ipv6_route bpf_iter__ipv6_route___not_used
#include "vmlinux.h"
+#undef bpf_iter_meta
+#undef bpf_iter__ipv6_route
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
+struct bpf_iter_meta {
+ struct seq_file *seq;
+ __u64 session_id;
+ __u64 seq_num;
+} __attribute__((preserve_access_index));
+
+struct bpf_iter__ipv6_route {
+ struct bpf_iter_meta *meta;
+ struct fib6_info *rt;
+} __attribute__((preserve_access_index));
+
char _license[] SEC("license") = "GPL";
extern bool CONFIG_IPV6_SUBTREES __kconfig __weak;
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2020 Facebook */
+/* "undefine" structs in vmlinux.h, because we "override" them below */
+#define bpf_iter_meta bpf_iter_meta___not_used
+#define bpf_iter__netlink bpf_iter__netlink___not_used
#include "vmlinux.h"
+#undef bpf_iter_meta
+#undef bpf_iter__netlink
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#define sk_rmem_alloc sk_backlog.rmem_alloc
#define sk_refcnt __sk_common.skc_refcnt
+struct bpf_iter_meta {
+ struct seq_file *seq;
+ __u64 session_id;
+ __u64 seq_num;
+} __attribute__((preserve_access_index));
+
+struct bpf_iter__netlink {
+ struct bpf_iter_meta *meta;
+ struct netlink_sock *sk;
+} __attribute__((preserve_access_index));
+
static inline struct inode *SOCK_INODE(struct socket *socket)
{
return &container_of(socket, struct socket_alloc, socket)->vfs_inode;
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2020 Facebook */
+/* "undefine" structs in vmlinux.h, because we "override" them below */
+#define bpf_iter_meta bpf_iter_meta___not_used
+#define bpf_iter__task bpf_iter__task___not_used
#include "vmlinux.h"
+#undef bpf_iter_meta
+#undef bpf_iter__task
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
char _license[] SEC("license") = "GPL";
+struct bpf_iter_meta {
+ struct seq_file *seq;
+ __u64 session_id;
+ __u64 seq_num;
+} __attribute__((preserve_access_index));
+
+struct bpf_iter__task {
+ struct bpf_iter_meta *meta;
+ struct task_struct *task;
+} __attribute__((preserve_access_index));
+
SEC("iter/task")
int dump_task(struct bpf_iter__task *ctx)
{
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2020 Facebook */
+/* "undefine" structs in vmlinux.h, because we "override" them below */
+#define bpf_iter_meta bpf_iter_meta___not_used
+#define bpf_iter__task_file bpf_iter__task_file___not_used
#include "vmlinux.h"
+#undef bpf_iter_meta
+#undef bpf_iter__task_file
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
char _license[] SEC("license") = "GPL";
+struct bpf_iter_meta {
+ struct seq_file *seq;
+ __u64 session_id;
+ __u64 seq_num;
+} __attribute__((preserve_access_index));
+
+struct bpf_iter__task_file {
+ struct bpf_iter_meta *meta;
+ struct task_struct *task;
+ __u32 fd;
+ struct file *file;
+} __attribute__((preserve_access_index));
+
SEC("iter/task_file")
int dump_task_file(struct bpf_iter__task_file *ctx)
{
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2020 Facebook */
+#define bpf_iter_meta bpf_iter_meta___not_used
+#define bpf_iter__task bpf_iter__task___not_used
#include "vmlinux.h"
+#undef bpf_iter_meta
+#undef bpf_iter__task
#include <bpf/bpf_helpers.h>
char _license[] SEC("license") = "GPL";
+struct bpf_iter_meta {
+ struct seq_file *seq;
+ __u64 session_id;
+ __u64 seq_num;
+} __attribute__((preserve_access_index));
+
+struct bpf_iter__task {
+ struct bpf_iter_meta *meta;
+ struct task_struct *task;
+} __attribute__((preserve_access_index));
+
SEC("iter/task")
int dump_task(struct bpf_iter__task *ctx)
{
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2020 Facebook */
+#define bpf_iter_meta bpf_iter_meta___not_used
+#define bpf_iter__bpf_map bpf_iter__bpf_map___not_used
#include "vmlinux.h"
+#undef bpf_iter_meta
+#undef bpf_iter__bpf_map
#include <bpf/bpf_helpers.h>
char _license[] SEC("license") = "GPL";
+struct bpf_iter_meta {
+ struct seq_file *seq;
+ __u64 session_id;
+ __u64 seq_num;
+} __attribute__((preserve_access_index));
+
+struct bpf_iter__bpf_map {
+ struct bpf_iter_meta *meta;
+ struct bpf_map *map;
+} __attribute__((preserve_access_index));
+
__u32 map1_id = 0, map2_id = 0;
__u32 map1_accessed = 0, map2_accessed = 0;
__u64 map1_seqnum = 0, map2_seqnum1 = 0, map2_seqnum2 = 0;
/* SPDX-License-Identifier: GPL-2.0 */
/* Copyright (c) 2020 Facebook */
+/* "undefine" structs in vmlinux.h, because we "override" them below */
+#define bpf_iter_meta bpf_iter_meta___not_used
+#define bpf_iter__task bpf_iter__task___not_used
#include "vmlinux.h"
+#undef bpf_iter_meta
+#undef bpf_iter__task
#include <bpf/bpf_helpers.h>
char _license[] SEC("license") = "GPL";
int count = 0;
+struct bpf_iter_meta {
+ struct seq_file *seq;
+ __u64 session_id;
+ __u64 seq_num;
+} __attribute__((preserve_access_index));
+
+struct bpf_iter__task {
+ struct bpf_iter_meta *meta;
+ struct task_struct *task;
+} __attribute__((preserve_access_index));
+
SEC("iter/task")
int dump_task(struct bpf_iter__task *ctx)
{
// SPDX-License-Identifier: GPL-2.0
#include <string.h>
+#include <stdbool.h>
#include <linux/bpf.h>
#include <linux/in.h>
char _license[] SEC("license") = "GPL";
int _version SEC("version") = 1;
+struct svc_addr {
+ __be32 addr;
+ __be16 port;
+};
+
+struct {
+ __uint(type, BPF_MAP_TYPE_SK_STORAGE);
+ __uint(map_flags, BPF_F_NO_PREALLOC);
+ __type(key, int);
+ __type(value, struct svc_addr);
+} service_mapping SEC(".maps");
+
SEC("cgroup/connect4")
-int _connect4(struct bpf_sock_addr *ctx)
+int connect4(struct bpf_sock_addr *ctx)
{
struct sockaddr_in sa = {};
+ struct svc_addr *orig;
+ /* Force local address to 127.0.0.1:22222. */
sa.sin_family = AF_INET;
sa.sin_port = bpf_htons(22222);
- sa.sin_addr.s_addr = bpf_htonl(0x7f000001); /* 127.0.0.1 */
+ sa.sin_addr.s_addr = bpf_htonl(0x7f000001);
if (bpf_bind(ctx, (struct sockaddr *)&sa, sizeof(sa)) != 0)
return 0;
+ /* Rewire service 1.2.3.4:60000 to backend 127.0.0.1:60123. */
+ if (ctx->user_port == bpf_htons(60000)) {
+ orig = bpf_sk_storage_get(&service_mapping, ctx->sk, 0,
+ BPF_SK_STORAGE_GET_F_CREATE);
+ if (!orig)
+ return 0;
+
+ orig->addr = ctx->user_ip4;
+ orig->port = ctx->user_port;
+
+ ctx->user_ip4 = bpf_htonl(0x7f000001);
+ ctx->user_port = bpf_htons(60123);
+ }
+ return 1;
+}
+
+SEC("cgroup/getsockname4")
+int getsockname4(struct bpf_sock_addr *ctx)
+{
+ /* Expose local server as 1.2.3.4:60000 to client. */
+ if (ctx->user_port == bpf_htons(60123)) {
+ ctx->user_ip4 = bpf_htonl(0x01020304);
+ ctx->user_port = bpf_htons(60000);
+ }
+ return 1;
+}
+
+SEC("cgroup/getpeername4")
+int getpeername4(struct bpf_sock_addr *ctx)
+{
+ struct svc_addr *orig;
+
+ /* Expose service 1.2.3.4:60000 as peer instead of backend. */
+ if (ctx->user_port == bpf_htons(60123)) {
+ orig = bpf_sk_storage_get(&service_mapping, ctx->sk, 0, 0);
+ if (orig) {
+ ctx->user_ip4 = orig->addr;
+ ctx->user_port = orig->port;
+ }
+ }
return 1;
}
char _license[] SEC("license") = "GPL";
int _version SEC("version") = 1;
+struct svc_addr {
+ __be32 addr[4];
+ __be16 port;
+};
+
+struct {
+ __uint(type, BPF_MAP_TYPE_SK_STORAGE);
+ __uint(map_flags, BPF_F_NO_PREALLOC);
+ __type(key, int);
+ __type(value, struct svc_addr);
+} service_mapping SEC(".maps");
+
SEC("cgroup/connect6")
-int _connect6(struct bpf_sock_addr *ctx)
+int connect6(struct bpf_sock_addr *ctx)
{
struct sockaddr_in6 sa = {};
+ struct svc_addr *orig;
+ /* Force local address to [::1]:22223. */
sa.sin6_family = AF_INET6;
sa.sin6_port = bpf_htons(22223);
- sa.sin6_addr.s6_addr32[3] = bpf_htonl(1); /* ::1 */
+ sa.sin6_addr.s6_addr32[3] = bpf_htonl(1);
if (bpf_bind(ctx, (struct sockaddr *)&sa, sizeof(sa)) != 0)
return 0;
+ /* Rewire service [fc00::1]:60000 to backend [::1]:60124. */
+ if (ctx->user_port == bpf_htons(60000)) {
+ orig = bpf_sk_storage_get(&service_mapping, ctx->sk, 0,
+ BPF_SK_STORAGE_GET_F_CREATE);
+ if (!orig)
+ return 0;
+
+ orig->addr[0] = ctx->user_ip6[0];
+ orig->addr[1] = ctx->user_ip6[1];
+ orig->addr[2] = ctx->user_ip6[2];
+ orig->addr[3] = ctx->user_ip6[3];
+ orig->port = ctx->user_port;
+
+ ctx->user_ip6[0] = 0;
+ ctx->user_ip6[1] = 0;
+ ctx->user_ip6[2] = 0;
+ ctx->user_ip6[3] = bpf_htonl(1);
+ ctx->user_port = bpf_htons(60124);
+ }
+ return 1;
+}
+
+SEC("cgroup/getsockname6")
+int getsockname6(struct bpf_sock_addr *ctx)
+{
+ /* Expose local server as [fc00::1]:60000 to client. */
+ if (ctx->user_port == bpf_htons(60124)) {
+ ctx->user_ip6[0] = bpf_htonl(0xfc000000);
+ ctx->user_ip6[1] = 0;
+ ctx->user_ip6[2] = 0;
+ ctx->user_ip6[3] = bpf_htonl(1);
+ ctx->user_port = bpf_htons(60000);
+ }
+ return 1;
+}
+
+SEC("cgroup/getpeername6")
+int getpeername6(struct bpf_sock_addr *ctx)
+{
+ struct svc_addr *orig;
+
+ /* Expose service [fc00::1]:60000 as peer instead of backend. */
+ if (ctx->user_port == bpf_htons(60124)) {
+ orig = bpf_sk_storage_get(&service_mapping, ctx->sk, 0, 0);
+ if (orig) {
+ ctx->user_ip6[0] = orig->addr[0];
+ ctx->user_ip6[1] = orig->addr[1];
+ ctx->user_ip6[2] = orig->addr[2];
+ ctx->user_ip6[3] = orig->addr[3];
+ ctx->user_port = orig->port;
+ }
+ }
return 1;
}
tuple_len = ipv4 ? sizeof(tuple->ipv4) : sizeof(tuple->ipv6);
sk = bpf_sk_lookup_tcp(skb, tuple, tuple_len, BPF_F_CURRENT_NETNS, 0);
+ bpf_printk("sk=%d\n", sk ? 1 : 0);
if (sk)
bpf_sk_release(sk);
return sk ? TC_ACT_OK : TC_ACT_UNSPEC;
--- /dev/null
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2017-2018 Covalent IO, Inc. http://covalent.io */
+#include <stddef.h>
+#include <string.h>
+#include <linux/bpf.h>
+#include <linux/if_ether.h>
+#include <linux/if_packet.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/in.h>
+#include <linux/udp.h>
+#include <linux/tcp.h>
+#include <linux/pkt_cls.h>
+#include <sys/socket.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+
+/* Sockmap sample program connects a client and a backend together
+ * using cgroups.
+ *
+ * client:X <---> frontend:80 client:X <---> backend:80
+ *
+ * For simplicity we hard code values here and bind 1:1. The hard
+ * coded values are part of the setup in sockmap.sh script that
+ * is associated with this BPF program.
+ *
+ * The bpf_printk is verbose and prints information as connections
+ * are established and verdicts are decided.
+ */
+
+struct {
+ __uint(type, TEST_MAP_TYPE);
+ __uint(max_entries, 20);
+ __uint(key_size, sizeof(int));
+ __uint(value_size, sizeof(int));
+} sock_map SEC(".maps");
+
+struct {
+ __uint(type, TEST_MAP_TYPE);
+ __uint(max_entries, 20);
+ __uint(key_size, sizeof(int));
+ __uint(value_size, sizeof(int));
+} sock_map_txmsg SEC(".maps");
+
+struct {
+ __uint(type, TEST_MAP_TYPE);
+ __uint(max_entries, 20);
+ __uint(key_size, sizeof(int));
+ __uint(value_size, sizeof(int));
+} sock_map_redir SEC(".maps");
+
+struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __uint(max_entries, 1);
+ __type(key, int);
+ __type(value, int);
+} sock_apply_bytes SEC(".maps");
+
+struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __uint(max_entries, 1);
+ __type(key, int);
+ __type(value, int);
+} sock_cork_bytes SEC(".maps");
+
+struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __uint(max_entries, 6);
+ __type(key, int);
+ __type(value, int);
+} sock_bytes SEC(".maps");
+
+struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __uint(max_entries, 1);
+ __type(key, int);
+ __type(value, int);
+} sock_redir_flags SEC(".maps");
+
+struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __uint(max_entries, 1);
+ __type(key, int);
+ __type(value, int);
+} sock_skb_opts SEC(".maps");
+
+SEC("sk_skb1")
+int bpf_prog1(struct __sk_buff *skb)
+{
+ return skb->len;
+}
+
+SEC("sk_skb2")
+int bpf_prog2(struct __sk_buff *skb)
+{
+ __u32 lport = skb->local_port;
+ __u32 rport = skb->remote_port;
+ int len, *f, ret, zero = 0;
+ __u64 flags = 0;
+
+ if (lport == 10000)
+ ret = 10;
+ else
+ ret = 1;
+
+ len = (__u32)skb->data_end - (__u32)skb->data;
+ f = bpf_map_lookup_elem(&sock_skb_opts, &zero);
+ if (f && *f) {
+ ret = 3;
+ flags = *f;
+ }
+
+#ifdef SOCKMAP
+ return bpf_sk_redirect_map(skb, &sock_map, ret, flags);
+#else
+ return bpf_sk_redirect_hash(skb, &sock_map, &ret, flags);
+#endif
+
+}
+
+SEC("sockops")
+int bpf_sockmap(struct bpf_sock_ops *skops)
+{
+ __u32 lport, rport;
+ int op, err = 0, index, key, ret;
+
+
+ op = (int) skops->op;
+
+ switch (op) {
+ case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
+ lport = skops->local_port;
+ rport = skops->remote_port;
+
+ if (lport == 10000) {
+ ret = 1;
+#ifdef SOCKMAP
+ err = bpf_sock_map_update(skops, &sock_map, &ret,
+ BPF_NOEXIST);
+#else
+ err = bpf_sock_hash_update(skops, &sock_map, &ret,
+ BPF_NOEXIST);
+#endif
+ }
+ break;
+ case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
+ lport = skops->local_port;
+ rport = skops->remote_port;
+
+ if (bpf_ntohl(rport) == 10001) {
+ ret = 10;
+#ifdef SOCKMAP
+ err = bpf_sock_map_update(skops, &sock_map, &ret,
+ BPF_NOEXIST);
+#else
+ err = bpf_sock_hash_update(skops, &sock_map, &ret,
+ BPF_NOEXIST);
+#endif
+ }
+ break;
+ default:
+ break;
+ }
+
+ return 0;
+}
+
+SEC("sk_msg1")
+int bpf_prog4(struct sk_msg_md *msg)
+{
+ int *bytes, zero = 0, one = 1, two = 2, three = 3, four = 4, five = 5;
+ int *start, *end, *start_push, *end_push, *start_pop, *pop;
+
+ bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero);
+ if (bytes)
+ bpf_msg_apply_bytes(msg, *bytes);
+ bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero);
+ if (bytes)
+ bpf_msg_cork_bytes(msg, *bytes);
+ start = bpf_map_lookup_elem(&sock_bytes, &zero);
+ end = bpf_map_lookup_elem(&sock_bytes, &one);
+ if (start && end)
+ bpf_msg_pull_data(msg, *start, *end, 0);
+ start_push = bpf_map_lookup_elem(&sock_bytes, &two);
+ end_push = bpf_map_lookup_elem(&sock_bytes, &three);
+ if (start_push && end_push)
+ bpf_msg_push_data(msg, *start_push, *end_push, 0);
+ start_pop = bpf_map_lookup_elem(&sock_bytes, &four);
+ pop = bpf_map_lookup_elem(&sock_bytes, &five);
+ if (start_pop && pop)
+ bpf_msg_pop_data(msg, *start_pop, *pop, 0);
+ return SK_PASS;
+}
+
+SEC("sk_msg2")
+int bpf_prog6(struct sk_msg_md *msg)
+{
+ int zero = 0, one = 1, two = 2, three = 3, four = 4, five = 5, key = 0;
+ int *bytes, *start, *end, *start_push, *end_push, *start_pop, *pop, *f;
+ __u64 flags = 0;
+
+ bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero);
+ if (bytes)
+ bpf_msg_apply_bytes(msg, *bytes);
+ bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero);
+ if (bytes)
+ bpf_msg_cork_bytes(msg, *bytes);
+
+ start = bpf_map_lookup_elem(&sock_bytes, &zero);
+ end = bpf_map_lookup_elem(&sock_bytes, &one);
+ if (start && end)
+ bpf_msg_pull_data(msg, *start, *end, 0);
+
+ start_push = bpf_map_lookup_elem(&sock_bytes, &two);
+ end_push = bpf_map_lookup_elem(&sock_bytes, &three);
+ if (start_push && end_push)
+ bpf_msg_push_data(msg, *start_push, *end_push, 0);
+
+ start_pop = bpf_map_lookup_elem(&sock_bytes, &four);
+ pop = bpf_map_lookup_elem(&sock_bytes, &five);
+ if (start_pop && pop)
+ bpf_msg_pop_data(msg, *start_pop, *pop, 0);
+
+ f = bpf_map_lookup_elem(&sock_redir_flags, &zero);
+ if (f && *f) {
+ key = 2;
+ flags = *f;
+ }
+#ifdef SOCKMAP
+ return bpf_msg_redirect_map(msg, &sock_map_redir, key, flags);
+#else
+ return bpf_msg_redirect_hash(msg, &sock_map_redir, &key, flags);
+#endif
+}
+
+SEC("sk_msg3")
+int bpf_prog8(struct sk_msg_md *msg)
+{
+ void *data_end = (void *)(long) msg->data_end;
+ void *data = (void *)(long) msg->data;
+ int ret = 0, *bytes, zero = 0;
+
+ bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero);
+ if (bytes) {
+ ret = bpf_msg_apply_bytes(msg, *bytes);
+ if (ret)
+ return SK_DROP;
+ } else {
+ return SK_DROP;
+ }
+ return SK_PASS;
+}
+SEC("sk_msg4")
+int bpf_prog9(struct sk_msg_md *msg)
+{
+ void *data_end = (void *)(long) msg->data_end;
+ void *data = (void *)(long) msg->data;
+ int ret = 0, *bytes, zero = 0;
+
+ bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero);
+ if (bytes) {
+ if (((__u64)data_end - (__u64)data) >= *bytes)
+ return SK_PASS;
+ ret = bpf_msg_cork_bytes(msg, *bytes);
+ if (ret)
+ return SK_DROP;
+ }
+ return SK_PASS;
+}
+
+SEC("sk_msg5")
+int bpf_prog10(struct sk_msg_md *msg)
+{
+ int *bytes, *start, *end, *start_push, *end_push, *start_pop, *pop;
+ int zero = 0, one = 1, two = 2, three = 3, four = 4, five = 5;
+
+ bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero);
+ if (bytes)
+ bpf_msg_apply_bytes(msg, *bytes);
+ bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero);
+ if (bytes)
+ bpf_msg_cork_bytes(msg, *bytes);
+ start = bpf_map_lookup_elem(&sock_bytes, &zero);
+ end = bpf_map_lookup_elem(&sock_bytes, &one);
+ if (start && end)
+ bpf_msg_pull_data(msg, *start, *end, 0);
+ start_push = bpf_map_lookup_elem(&sock_bytes, &two);
+ end_push = bpf_map_lookup_elem(&sock_bytes, &three);
+ if (start_push && end_push)
+ bpf_msg_push_data(msg, *start_push, *end_push, 0);
+ start_pop = bpf_map_lookup_elem(&sock_bytes, &four);
+ pop = bpf_map_lookup_elem(&sock_bytes, &five);
+ if (start_pop && pop)
+ bpf_msg_pop_data(msg, *start_pop, *pop, 0);
+ return SK_DROP;
+}
+
+int _version SEC("version") = 1;
+char _license[] SEC("license") = "GPL";
+++ /dev/null
-#include <asm/types.h>
-#include <linux/types.h>
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <errno.h>
-#include <string.h>
-#include <stddef.h>
-#include <stdbool.h>
-
-#include <linux/unistd.h>
-#include <linux/filter.h>
-#include <linux/bpf_perf_event.h>
-#include <linux/bpf.h>
-
-#include <bpf/bpf.h>
-
-#include "../../../include/linux/filter.h"
-#include "bpf_rlimit.h"
-#include "bpf_util.h"
-
-#define MAX_INSNS 512
-#define MAX_MATCHES 16
-
-struct bpf_reg_match {
- unsigned int line;
- const char *match;
-};
-
-struct bpf_align_test {
- const char *descr;
- struct bpf_insn insns[MAX_INSNS];
- enum {
- UNDEF,
- ACCEPT,
- REJECT
- } result;
- enum bpf_prog_type prog_type;
- /* Matches must be in order of increasing line */
- struct bpf_reg_match matches[MAX_MATCHES];
-};
-
-static struct bpf_align_test tests[] = {
- /* Four tests of known constants. These aren't staggeringly
- * interesting since we track exact values now.
- */
- {
- .descr = "mov",
- .insns = {
- BPF_MOV64_IMM(BPF_REG_3, 2),
- BPF_MOV64_IMM(BPF_REG_3, 4),
- BPF_MOV64_IMM(BPF_REG_3, 8),
- BPF_MOV64_IMM(BPF_REG_3, 16),
- BPF_MOV64_IMM(BPF_REG_3, 32),
- BPF_MOV64_IMM(BPF_REG_0, 0),
- BPF_EXIT_INSN(),
- },
- .prog_type = BPF_PROG_TYPE_SCHED_CLS,
- .matches = {
- {1, "R1=ctx(id=0,off=0,imm=0)"},
- {1, "R10=fp0"},
- {1, "R3_w=inv2"},
- {2, "R3_w=inv4"},
- {3, "R3_w=inv8"},
- {4, "R3_w=inv16"},
- {5, "R3_w=inv32"},
- },
- },
- {
- .descr = "shift",
- .insns = {
- BPF_MOV64_IMM(BPF_REG_3, 1),
- BPF_ALU64_IMM(BPF_LSH, BPF_REG_3, 1),
- BPF_ALU64_IMM(BPF_LSH, BPF_REG_3, 1),
- BPF_ALU64_IMM(BPF_LSH, BPF_REG_3, 1),
- BPF_ALU64_IMM(BPF_LSH, BPF_REG_3, 1),
- BPF_ALU64_IMM(BPF_RSH, BPF_REG_3, 4),
- BPF_MOV64_IMM(BPF_REG_4, 32),
- BPF_ALU64_IMM(BPF_RSH, BPF_REG_4, 1),
- BPF_ALU64_IMM(BPF_RSH, BPF_REG_4, 1),
- BPF_ALU64_IMM(BPF_RSH, BPF_REG_4, 1),
- BPF_ALU64_IMM(BPF_RSH, BPF_REG_4, 1),
- BPF_MOV64_IMM(BPF_REG_0, 0),
- BPF_EXIT_INSN(),
- },
- .prog_type = BPF_PROG_TYPE_SCHED_CLS,
- .matches = {
- {1, "R1=ctx(id=0,off=0,imm=0)"},
- {1, "R10=fp0"},
- {1, "R3_w=inv1"},
- {2, "R3_w=inv2"},
- {3, "R3_w=inv4"},
- {4, "R3_w=inv8"},
- {5, "R3_w=inv16"},
- {6, "R3_w=inv1"},
- {7, "R4_w=inv32"},
- {8, "R4_w=inv16"},
- {9, "R4_w=inv8"},
- {10, "R4_w=inv4"},
- {11, "R4_w=inv2"},
- },
- },
- {
- .descr = "addsub",
- .insns = {
- BPF_MOV64_IMM(BPF_REG_3, 4),
- BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, 4),
- BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, 2),
- BPF_MOV64_IMM(BPF_REG_4, 8),
- BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 4),
- BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 2),
- BPF_MOV64_IMM(BPF_REG_0, 0),
- BPF_EXIT_INSN(),
- },
- .prog_type = BPF_PROG_TYPE_SCHED_CLS,
- .matches = {
- {1, "R1=ctx(id=0,off=0,imm=0)"},
- {1, "R10=fp0"},
- {1, "R3_w=inv4"},
- {2, "R3_w=inv8"},
- {3, "R3_w=inv10"},
- {4, "R4_w=inv8"},
- {5, "R4_w=inv12"},
- {6, "R4_w=inv14"},
- },
- },
- {
- .descr = "mul",
- .insns = {
- BPF_MOV64_IMM(BPF_REG_3, 7),
- BPF_ALU64_IMM(BPF_MUL, BPF_REG_3, 1),
- BPF_ALU64_IMM(BPF_MUL, BPF_REG_3, 2),
- BPF_ALU64_IMM(BPF_MUL, BPF_REG_3, 4),
- BPF_MOV64_IMM(BPF_REG_0, 0),
- BPF_EXIT_INSN(),
- },
- .prog_type = BPF_PROG_TYPE_SCHED_CLS,
- .matches = {
- {1, "R1=ctx(id=0,off=0,imm=0)"},
- {1, "R10=fp0"},
- {1, "R3_w=inv7"},
- {2, "R3_w=inv7"},
- {3, "R3_w=inv14"},
- {4, "R3_w=inv56"},
- },
- },
-
- /* Tests using unknown values */
-#define PREP_PKT_POINTERS \
- BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, \
- offsetof(struct __sk_buff, data)), \
- BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, \
- offsetof(struct __sk_buff, data_end))
-
-#define LOAD_UNKNOWN(DST_REG) \
- PREP_PKT_POINTERS, \
- BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), \
- BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8), \
- BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_0, 1), \
- BPF_EXIT_INSN(), \
- BPF_LDX_MEM(BPF_B, DST_REG, BPF_REG_2, 0)
-
- {
- .descr = "unknown shift",
- .insns = {
- LOAD_UNKNOWN(BPF_REG_3),
- BPF_ALU64_IMM(BPF_LSH, BPF_REG_3, 1),
- BPF_ALU64_IMM(BPF_LSH, BPF_REG_3, 1),
- BPF_ALU64_IMM(BPF_LSH, BPF_REG_3, 1),
- BPF_ALU64_IMM(BPF_LSH, BPF_REG_3, 1),
- LOAD_UNKNOWN(BPF_REG_4),
- BPF_ALU64_IMM(BPF_LSH, BPF_REG_4, 5),
- BPF_ALU64_IMM(BPF_RSH, BPF_REG_4, 1),
- BPF_ALU64_IMM(BPF_RSH, BPF_REG_4, 1),
- BPF_ALU64_IMM(BPF_RSH, BPF_REG_4, 1),
- BPF_ALU64_IMM(BPF_RSH, BPF_REG_4, 1),
- BPF_MOV64_IMM(BPF_REG_0, 0),
- BPF_EXIT_INSN(),
- },
- .prog_type = BPF_PROG_TYPE_SCHED_CLS,
- .matches = {
- {7, "R0_w=pkt(id=0,off=8,r=8,imm=0)"},
- {7, "R3_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"},
- {8, "R3_w=inv(id=0,umax_value=510,var_off=(0x0; 0x1fe))"},
- {9, "R3_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
- {10, "R3_w=inv(id=0,umax_value=2040,var_off=(0x0; 0x7f8))"},
- {11, "R3_w=inv(id=0,umax_value=4080,var_off=(0x0; 0xff0))"},
- {18, "R3=pkt_end(id=0,off=0,imm=0)"},
- {18, "R4_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"},
- {19, "R4_w=inv(id=0,umax_value=8160,var_off=(0x0; 0x1fe0))"},
- {20, "R4_w=inv(id=0,umax_value=4080,var_off=(0x0; 0xff0))"},
- {21, "R4_w=inv(id=0,umax_value=2040,var_off=(0x0; 0x7f8))"},
- {22, "R4_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
- {23, "R4_w=inv(id=0,umax_value=510,var_off=(0x0; 0x1fe))"},
- },
- },
- {
- .descr = "unknown mul",
- .insns = {
- LOAD_UNKNOWN(BPF_REG_3),
- BPF_MOV64_REG(BPF_REG_4, BPF_REG_3),
- BPF_ALU64_IMM(BPF_MUL, BPF_REG_4, 1),
- BPF_MOV64_REG(BPF_REG_4, BPF_REG_3),
- BPF_ALU64_IMM(BPF_MUL, BPF_REG_4, 2),
- BPF_MOV64_REG(BPF_REG_4, BPF_REG_3),
- BPF_ALU64_IMM(BPF_MUL, BPF_REG_4, 4),
- BPF_MOV64_REG(BPF_REG_4, BPF_REG_3),
- BPF_ALU64_IMM(BPF_MUL, BPF_REG_4, 8),
- BPF_ALU64_IMM(BPF_MUL, BPF_REG_4, 2),
- BPF_MOV64_IMM(BPF_REG_0, 0),
- BPF_EXIT_INSN(),
- },
- .prog_type = BPF_PROG_TYPE_SCHED_CLS,
- .matches = {
- {7, "R3_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"},
- {8, "R4_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"},
- {9, "R4_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"},
- {10, "R4_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"},
- {11, "R4_w=inv(id=0,umax_value=510,var_off=(0x0; 0x1fe))"},
- {12, "R4_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"},
- {13, "R4_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
- {14, "R4_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"},
- {15, "R4_w=inv(id=0,umax_value=2040,var_off=(0x0; 0x7f8))"},
- {16, "R4_w=inv(id=0,umax_value=4080,var_off=(0x0; 0xff0))"},
- },
- },
- {
- .descr = "packet const offset",
- .insns = {
- PREP_PKT_POINTERS,
- BPF_MOV64_REG(BPF_REG_5, BPF_REG_2),
-
- BPF_MOV64_IMM(BPF_REG_0, 0),
-
- /* Skip over ethernet header. */
- BPF_ALU64_IMM(BPF_ADD, BPF_REG_5, 14),
- BPF_MOV64_REG(BPF_REG_4, BPF_REG_5),
- BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 4),
- BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_4, 1),
- BPF_EXIT_INSN(),
-
- BPF_LDX_MEM(BPF_B, BPF_REG_4, BPF_REG_5, 0),
- BPF_LDX_MEM(BPF_B, BPF_REG_4, BPF_REG_5, 1),
- BPF_LDX_MEM(BPF_B, BPF_REG_4, BPF_REG_5, 2),
- BPF_LDX_MEM(BPF_B, BPF_REG_4, BPF_REG_5, 3),
- BPF_LDX_MEM(BPF_H, BPF_REG_4, BPF_REG_5, 0),
- BPF_LDX_MEM(BPF_H, BPF_REG_4, BPF_REG_5, 2),
- BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_5, 0),
-
- BPF_MOV64_IMM(BPF_REG_0, 0),
- BPF_EXIT_INSN(),
- },
- .prog_type = BPF_PROG_TYPE_SCHED_CLS,
- .matches = {
- {4, "R5_w=pkt(id=0,off=0,r=0,imm=0)"},
- {5, "R5_w=pkt(id=0,off=14,r=0,imm=0)"},
- {6, "R4_w=pkt(id=0,off=14,r=0,imm=0)"},
- {10, "R2=pkt(id=0,off=0,r=18,imm=0)"},
- {10, "R5=pkt(id=0,off=14,r=18,imm=0)"},
- {10, "R4_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"},
- {14, "R4_w=inv(id=0,umax_value=65535,var_off=(0x0; 0xffff))"},
- {15, "R4_w=inv(id=0,umax_value=65535,var_off=(0x0; 0xffff))"},
- },
- },
- {
- .descr = "packet variable offset",
- .insns = {
- LOAD_UNKNOWN(BPF_REG_6),
- BPF_ALU64_IMM(BPF_LSH, BPF_REG_6, 2),
-
- /* First, add a constant to the R5 packet pointer,
- * then a variable with a known alignment.
- */
- BPF_MOV64_REG(BPF_REG_5, BPF_REG_2),
- BPF_ALU64_IMM(BPF_ADD, BPF_REG_5, 14),
- BPF_ALU64_REG(BPF_ADD, BPF_REG_5, BPF_REG_6),
- BPF_MOV64_REG(BPF_REG_4, BPF_REG_5),
- BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 4),
- BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_4, 1),
- BPF_EXIT_INSN(),
- BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_5, 0),
-
- /* Now, test in the other direction. Adding first
- * the variable offset to R5, then the constant.
- */
- BPF_MOV64_REG(BPF_REG_5, BPF_REG_2),
- BPF_ALU64_REG(BPF_ADD, BPF_REG_5, BPF_REG_6),
- BPF_ALU64_IMM(BPF_ADD, BPF_REG_5, 14),
- BPF_MOV64_REG(BPF_REG_4, BPF_REG_5),
- BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 4),
- BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_4, 1),
- BPF_EXIT_INSN(),
- BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_5, 0),
-
- /* Test multiple accumulations of unknown values
- * into a packet pointer.
- */
- BPF_MOV64_REG(BPF_REG_5, BPF_REG_2),
- BPF_ALU64_IMM(BPF_ADD, BPF_REG_5, 14),
- BPF_ALU64_REG(BPF_ADD, BPF_REG_5, BPF_REG_6),
- BPF_ALU64_IMM(BPF_ADD, BPF_REG_5, 4),
- BPF_ALU64_REG(BPF_ADD, BPF_REG_5, BPF_REG_6),
- BPF_MOV64_REG(BPF_REG_4, BPF_REG_5),
- BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 4),
- BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_4, 1),
- BPF_EXIT_INSN(),
- BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_5, 0),
-
- BPF_MOV64_IMM(BPF_REG_0, 0),
- BPF_EXIT_INSN(),
- },
- .prog_type = BPF_PROG_TYPE_SCHED_CLS,
- .matches = {
- /* Calculated offset in R6 has unknown value, but known
- * alignment of 4.
- */
- {8, "R2_w=pkt(id=0,off=0,r=8,imm=0)"},
- {8, "R6_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
- /* Offset is added to packet pointer R5, resulting in
- * known fixed offset, and variable offset from R6.
- */
- {11, "R5_w=pkt(id=1,off=14,r=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
- /* At the time the word size load is performed from R5,
- * it's total offset is NET_IP_ALIGN + reg->off (0) +
- * reg->aux_off (14) which is 16. Then the variable
- * offset is considered using reg->aux_off_align which
- * is 4 and meets the load's requirements.
- */
- {15, "R4=pkt(id=1,off=18,r=18,umax_value=1020,var_off=(0x0; 0x3fc))"},
- {15, "R5=pkt(id=1,off=14,r=18,umax_value=1020,var_off=(0x0; 0x3fc))"},
- /* Variable offset is added to R5 packet pointer,
- * resulting in auxiliary alignment of 4.
- */
- {18, "R5_w=pkt(id=2,off=0,r=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
- /* Constant offset is added to R5, resulting in
- * reg->off of 14.
- */
- {19, "R5_w=pkt(id=2,off=14,r=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
- /* At the time the word size load is performed from R5,
- * its total fixed offset is NET_IP_ALIGN + reg->off
- * (14) which is 16. Then the variable offset is 4-byte
- * aligned, so the total offset is 4-byte aligned and
- * meets the load's requirements.
- */
- {23, "R4=pkt(id=2,off=18,r=18,umax_value=1020,var_off=(0x0; 0x3fc))"},
- {23, "R5=pkt(id=2,off=14,r=18,umax_value=1020,var_off=(0x0; 0x3fc))"},
- /* Constant offset is added to R5 packet pointer,
- * resulting in reg->off value of 14.
- */
- {26, "R5_w=pkt(id=0,off=14,r=8"},
- /* Variable offset is added to R5, resulting in a
- * variable offset of (4n).
- */
- {27, "R5_w=pkt(id=3,off=14,r=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
- /* Constant is added to R5 again, setting reg->off to 18. */
- {28, "R5_w=pkt(id=3,off=18,r=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
- /* And once more we add a variable; resulting var_off
- * is still (4n), fixed offset is not changed.
- * Also, we create a new reg->id.
- */
- {29, "R5_w=pkt(id=4,off=18,r=0,umax_value=2040,var_off=(0x0; 0x7fc))"},
- /* At the time the word size load is performed from R5,
- * its total fixed offset is NET_IP_ALIGN + reg->off (18)
- * which is 20. Then the variable offset is (4n), so
- * the total offset is 4-byte aligned and meets the
- * load's requirements.
- */
- {33, "R4=pkt(id=4,off=22,r=22,umax_value=2040,var_off=(0x0; 0x7fc))"},
- {33, "R5=pkt(id=4,off=18,r=22,umax_value=2040,var_off=(0x0; 0x7fc))"},
- },
- },
- {
- .descr = "packet variable offset 2",
- .insns = {
- /* Create an unknown offset, (4n+2)-aligned */
- LOAD_UNKNOWN(BPF_REG_6),
- BPF_ALU64_IMM(BPF_LSH, BPF_REG_6, 2),
- BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, 14),
- /* Add it to the packet pointer */
- BPF_MOV64_REG(BPF_REG_5, BPF_REG_2),
- BPF_ALU64_REG(BPF_ADD, BPF_REG_5, BPF_REG_6),
- /* Check bounds and perform a read */
- BPF_MOV64_REG(BPF_REG_4, BPF_REG_5),
- BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 4),
- BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_4, 1),
- BPF_EXIT_INSN(),
- BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_5, 0),
- /* Make a (4n) offset from the value we just read */
- BPF_ALU64_IMM(BPF_AND, BPF_REG_6, 0xff),
- BPF_ALU64_IMM(BPF_LSH, BPF_REG_6, 2),
- /* Add it to the packet pointer */
- BPF_ALU64_REG(BPF_ADD, BPF_REG_5, BPF_REG_6),
- /* Check bounds and perform a read */
- BPF_MOV64_REG(BPF_REG_4, BPF_REG_5),
- BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 4),
- BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_4, 1),
- BPF_EXIT_INSN(),
- BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_5, 0),
- BPF_MOV64_IMM(BPF_REG_0, 0),
- BPF_EXIT_INSN(),
- },
- .prog_type = BPF_PROG_TYPE_SCHED_CLS,
- .matches = {
- /* Calculated offset in R6 has unknown value, but known
- * alignment of 4.
- */
- {8, "R2_w=pkt(id=0,off=0,r=8,imm=0)"},
- {8, "R6_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
- /* Adding 14 makes R6 be (4n+2) */
- {9, "R6_w=inv(id=0,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc))"},
- /* Packet pointer has (4n+2) offset */
- {11, "R5_w=pkt(id=1,off=0,r=0,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc))"},
- {13, "R4=pkt(id=1,off=4,r=0,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc))"},
- /* At the time the word size load is performed from R5,
- * its total fixed offset is NET_IP_ALIGN + reg->off (0)
- * which is 2. Then the variable offset is (4n+2), so
- * the total offset is 4-byte aligned and meets the
- * load's requirements.
- */
- {15, "R5=pkt(id=1,off=0,r=4,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc))"},
- /* Newly read value in R6 was shifted left by 2, so has
- * known alignment of 4.
- */
- {18, "R6_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
- /* Added (4n) to packet pointer's (4n+2) var_off, giving
- * another (4n+2).
- */
- {19, "R5_w=pkt(id=2,off=0,r=0,umin_value=14,umax_value=2054,var_off=(0x2; 0xffc))"},
- {21, "R4=pkt(id=2,off=4,r=0,umin_value=14,umax_value=2054,var_off=(0x2; 0xffc))"},
- /* At the time the word size load is performed from R5,
- * its total fixed offset is NET_IP_ALIGN + reg->off (0)
- * which is 2. Then the variable offset is (4n+2), so
- * the total offset is 4-byte aligned and meets the
- * load's requirements.
- */
- {23, "R5=pkt(id=2,off=0,r=4,umin_value=14,umax_value=2054,var_off=(0x2; 0xffc))"},
- },
- },
- {
- .descr = "dubious pointer arithmetic",
- .insns = {
- PREP_PKT_POINTERS,
- BPF_MOV64_IMM(BPF_REG_0, 0),
- /* (ptr - ptr) << 2 */
- BPF_MOV64_REG(BPF_REG_5, BPF_REG_3),
- BPF_ALU64_REG(BPF_SUB, BPF_REG_5, BPF_REG_2),
- BPF_ALU64_IMM(BPF_LSH, BPF_REG_5, 2),
- /* We have a (4n) value. Let's make a packet offset
- * out of it. First add 14, to make it a (4n+2)
- */
- BPF_ALU64_IMM(BPF_ADD, BPF_REG_5, 14),
- /* Then make sure it's nonnegative */
- BPF_JMP_IMM(BPF_JSGE, BPF_REG_5, 0, 1),
- BPF_EXIT_INSN(),
- /* Add it to packet pointer */
- BPF_MOV64_REG(BPF_REG_6, BPF_REG_2),
- BPF_ALU64_REG(BPF_ADD, BPF_REG_6, BPF_REG_5),
- /* Check bounds and perform a read */
- BPF_MOV64_REG(BPF_REG_4, BPF_REG_6),
- BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 4),
- BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_4, 1),
- BPF_EXIT_INSN(),
- BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_6, 0),
- BPF_EXIT_INSN(),
- },
- .prog_type = BPF_PROG_TYPE_SCHED_CLS,
- .result = REJECT,
- .matches = {
- {4, "R5_w=pkt_end(id=0,off=0,imm=0)"},
- /* (ptr - ptr) << 2 == unknown, (4n) */
- {6, "R5_w=inv(id=0,smax_value=9223372036854775804,umax_value=18446744073709551612,var_off=(0x0; 0xfffffffffffffffc))"},
- /* (4n) + 14 == (4n+2). We blow our bounds, because
- * the add could overflow.
- */
- {7, "R5_w=inv(id=0,var_off=(0x2; 0xfffffffffffffffc))"},
- /* Checked s>=0 */
- {9, "R5=inv(id=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc))"},
- /* packet pointer + nonnegative (4n+2) */
- {11, "R6_w=pkt(id=1,off=0,r=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc))"},
- {13, "R4_w=pkt(id=1,off=4,r=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc))"},
- /* NET_IP_ALIGN + (4n+2) == (4n), alignment is fine.
- * We checked the bounds, but it might have been able
- * to overflow if the packet pointer started in the
- * upper half of the address space.
- * So we did not get a 'range' on R6, and the access
- * attempt will fail.
- */
- {15, "R6_w=pkt(id=1,off=0,r=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc))"},
- }
- },
- {
- .descr = "variable subtraction",
- .insns = {
- /* Create an unknown offset, (4n+2)-aligned */
- LOAD_UNKNOWN(BPF_REG_6),
- BPF_MOV64_REG(BPF_REG_7, BPF_REG_6),
- BPF_ALU64_IMM(BPF_LSH, BPF_REG_6, 2),
- BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, 14),
- /* Create another unknown, (4n)-aligned, and subtract
- * it from the first one
- */
- BPF_ALU64_IMM(BPF_LSH, BPF_REG_7, 2),
- BPF_ALU64_REG(BPF_SUB, BPF_REG_6, BPF_REG_7),
- /* Bounds-check the result */
- BPF_JMP_IMM(BPF_JSGE, BPF_REG_6, 0, 1),
- BPF_EXIT_INSN(),
- /* Add it to the packet pointer */
- BPF_MOV64_REG(BPF_REG_5, BPF_REG_2),
- BPF_ALU64_REG(BPF_ADD, BPF_REG_5, BPF_REG_6),
- /* Check bounds and perform a read */
- BPF_MOV64_REG(BPF_REG_4, BPF_REG_5),
- BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 4),
- BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_4, 1),
- BPF_EXIT_INSN(),
- BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_5, 0),
- BPF_EXIT_INSN(),
- },
- .prog_type = BPF_PROG_TYPE_SCHED_CLS,
- .matches = {
- /* Calculated offset in R6 has unknown value, but known
- * alignment of 4.
- */
- {7, "R2_w=pkt(id=0,off=0,r=8,imm=0)"},
- {9, "R6_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
- /* Adding 14 makes R6 be (4n+2) */
- {10, "R6_w=inv(id=0,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc))"},
- /* New unknown value in R7 is (4n) */
- {11, "R7_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
- /* Subtracting it from R6 blows our unsigned bounds */
- {12, "R6=inv(id=0,smin_value=-1006,smax_value=1034,var_off=(0x2; 0xfffffffffffffffc))"},
- /* Checked s>= 0 */
- {14, "R6=inv(id=0,umin_value=2,umax_value=1034,var_off=(0x2; 0x7fc))"},
- /* At the time the word size load is performed from R5,
- * its total fixed offset is NET_IP_ALIGN + reg->off (0)
- * which is 2. Then the variable offset is (4n+2), so
- * the total offset is 4-byte aligned and meets the
- * load's requirements.
- */
- {20, "R5=pkt(id=1,off=0,r=4,umin_value=2,umax_value=1034,var_off=(0x2; 0x7fc))"},
- },
- },
- {
- .descr = "pointer variable subtraction",
- .insns = {
- /* Create an unknown offset, (4n+2)-aligned and bounded
- * to [14,74]
- */
- LOAD_UNKNOWN(BPF_REG_6),
- BPF_MOV64_REG(BPF_REG_7, BPF_REG_6),
- BPF_ALU64_IMM(BPF_AND, BPF_REG_6, 0xf),
- BPF_ALU64_IMM(BPF_LSH, BPF_REG_6, 2),
- BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, 14),
- /* Subtract it from the packet pointer */
- BPF_MOV64_REG(BPF_REG_5, BPF_REG_2),
- BPF_ALU64_REG(BPF_SUB, BPF_REG_5, BPF_REG_6),
- /* Create another unknown, (4n)-aligned and >= 74.
- * That in fact means >= 76, since 74 % 4 == 2
- */
- BPF_ALU64_IMM(BPF_LSH, BPF_REG_7, 2),
- BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, 76),
- /* Add it to the packet pointer */
- BPF_ALU64_REG(BPF_ADD, BPF_REG_5, BPF_REG_7),
- /* Check bounds and perform a read */
- BPF_MOV64_REG(BPF_REG_4, BPF_REG_5),
- BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 4),
- BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_4, 1),
- BPF_EXIT_INSN(),
- BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_5, 0),
- BPF_EXIT_INSN(),
- },
- .prog_type = BPF_PROG_TYPE_SCHED_CLS,
- .matches = {
- /* Calculated offset in R6 has unknown value, but known
- * alignment of 4.
- */
- {7, "R2_w=pkt(id=0,off=0,r=8,imm=0)"},
- {10, "R6_w=inv(id=0,umax_value=60,var_off=(0x0; 0x3c))"},
- /* Adding 14 makes R6 be (4n+2) */
- {11, "R6_w=inv(id=0,umin_value=14,umax_value=74,var_off=(0x2; 0x7c))"},
- /* Subtracting from packet pointer overflows ubounds */
- {13, "R5_w=pkt(id=1,off=0,r=8,umin_value=18446744073709551542,umax_value=18446744073709551602,var_off=(0xffffffffffffff82; 0x7c))"},
- /* New unknown value in R7 is (4n), >= 76 */
- {15, "R7_w=inv(id=0,umin_value=76,umax_value=1096,var_off=(0x0; 0x7fc))"},
- /* Adding it to packet pointer gives nice bounds again */
- {16, "R5_w=pkt(id=2,off=0,r=0,umin_value=2,umax_value=1082,var_off=(0x2; 0x7fc))"},
- /* At the time the word size load is performed from R5,
- * its total fixed offset is NET_IP_ALIGN + reg->off (0)
- * which is 2. Then the variable offset is (4n+2), so
- * the total offset is 4-byte aligned and meets the
- * load's requirements.
- */
- {20, "R5=pkt(id=2,off=0,r=4,umin_value=2,umax_value=1082,var_off=(0x2; 0x7fc))"},
- },
- },
-};
-
-static int probe_filter_length(const struct bpf_insn *fp)
-{
- int len;
-
- for (len = MAX_INSNS - 1; len > 0; --len)
- if (fp[len].code != 0 || fp[len].imm != 0)
- break;
- return len + 1;
-}
-
-static char bpf_vlog[32768];
-
-static int do_test_single(struct bpf_align_test *test)
-{
- struct bpf_insn *prog = test->insns;
- int prog_type = test->prog_type;
- char bpf_vlog_copy[32768];
- const char *line_ptr;
- int cur_line = -1;
- int prog_len, i;
- int fd_prog;
- int ret;
-
- prog_len = probe_filter_length(prog);
- fd_prog = bpf_verify_program(prog_type ? : BPF_PROG_TYPE_SOCKET_FILTER,
- prog, prog_len, BPF_F_STRICT_ALIGNMENT,
- "GPL", 0, bpf_vlog, sizeof(bpf_vlog), 2);
- if (fd_prog < 0 && test->result != REJECT) {
- printf("Failed to load program.\n");
- printf("%s", bpf_vlog);
- ret = 1;
- } else if (fd_prog >= 0 && test->result == REJECT) {
- printf("Unexpected success to load!\n");
- printf("%s", bpf_vlog);
- ret = 1;
- close(fd_prog);
- } else {
- ret = 0;
- /* We make a local copy so that we can strtok() it */
- strncpy(bpf_vlog_copy, bpf_vlog, sizeof(bpf_vlog_copy));
- line_ptr = strtok(bpf_vlog_copy, "\n");
- for (i = 0; i < MAX_MATCHES; i++) {
- struct bpf_reg_match m = test->matches[i];
-
- if (!m.match)
- break;
- while (line_ptr) {
- cur_line = -1;
- sscanf(line_ptr, "%u: ", &cur_line);
- if (cur_line == m.line)
- break;
- line_ptr = strtok(NULL, "\n");
- }
- if (!line_ptr) {
- printf("Failed to find line %u for match: %s\n",
- m.line, m.match);
- ret = 1;
- printf("%s", bpf_vlog);
- break;
- }
- if (!strstr(line_ptr, m.match)) {
- printf("Failed to find match %u: %s\n",
- m.line, m.match);
- ret = 1;
- printf("%s", bpf_vlog);
- break;
- }
- }
- if (fd_prog >= 0)
- close(fd_prog);
- }
- return ret;
-}
-
-static int do_test(unsigned int from, unsigned int to)
-{
- int all_pass = 0;
- int all_fail = 0;
- unsigned int i;
-
- for (i = from; i < to; i++) {
- struct bpf_align_test *test = &tests[i];
- int fail;
-
- printf("Test %3d: %s ... ",
- i, test->descr);
- fail = do_test_single(test);
- if (fail) {
- all_fail++;
- printf("FAIL\n");
- } else {
- all_pass++;
- printf("PASS\n");
- }
- }
- printf("Results: %d pass %d fail\n",
- all_pass, all_fail);
- return all_fail ? EXIT_FAILURE : EXIT_SUCCESS;
-}
-
-int main(int argc, char **argv)
-{
- unsigned int from = 0, to = ARRAY_SIZE(tests);
-
- if (argc == 3) {
- unsigned int l = atoi(argv[argc - 2]);
- unsigned int u = atoi(argv[argc - 1]);
-
- if (l < to && u < to) {
- from = l;
- to = u + 1;
- }
- } else if (argc == 2) {
- unsigned int t = atoi(argv[argc - 1]);
-
- if (t < to) {
- from = t;
- to = t + 1;
- }
- }
- return do_test(from, to);
-}
#define S1_PORT 10000
#define S2_PORT 10001
-#define BPF_SOCKMAP_FILENAME "test_sockmap_kern.o"
+#define BPF_SOCKMAP_FILENAME "test_sockmap_kern.o"
#define BPF_SOCKHASH_FILENAME "test_sockhash_kern.o"
#define CG_PATH "/sockmap"
int prog_fd[11];
int txmsg_pass;
-int txmsg_noisy;
int txmsg_redir;
-int txmsg_redir_noisy;
int txmsg_drop;
int txmsg_apply;
int txmsg_cork;
{"help", no_argument, NULL, 'h' },
{"cgroup", required_argument, NULL, 'c' },
{"rate", required_argument, NULL, 'r' },
- {"verbose", no_argument, NULL, 'v' },
+ {"verbose", optional_argument, NULL, 'v' },
{"iov_count", required_argument, NULL, 'i' },
{"length", required_argument, NULL, 'l' },
{"test", required_argument, NULL, 't' },
{"data_test", no_argument, NULL, 'd' },
{"txmsg", no_argument, &txmsg_pass, 1 },
- {"txmsg_noisy", no_argument, &txmsg_noisy, 1 },
{"txmsg_redir", no_argument, &txmsg_redir, 1 },
- {"txmsg_redir_noisy", no_argument, &txmsg_redir_noisy, 1},
{"txmsg_drop", no_argument, &txmsg_drop, 1 },
{"txmsg_apply", required_argument, NULL, 'a'},
{"txmsg_cork", required_argument, NULL, 'k'},
{"txmsg_skb", no_argument, &txmsg_skb, 1 },
{"ktls", no_argument, &ktls, 1 },
{"peek", no_argument, &peek_flag, 1 },
+ {"whitelist", required_argument, NULL, 'n' },
+ {"blacklist", required_argument, NULL, 'b' },
{0, 0, NULL, 0 }
};
+struct test_env {
+ const char *type;
+ const char *subtest;
+ const char *prepend;
+
+ int test_num;
+ int subtest_num;
+
+ int succ_cnt;
+ int fail_cnt;
+ int fail_last;
+};
+
+struct test_env env;
+
+struct sockmap_options {
+ int verbose;
+ bool base;
+ bool sendpage;
+ bool data_test;
+ bool drop_expected;
+ int iov_count;
+ int iov_length;
+ int rate;
+ char *map;
+ char *whitelist;
+ char *blacklist;
+ char *prepend;
+};
+
+struct _test {
+ char *title;
+ void (*tester)(int cg_fd, struct sockmap_options *opt);
+};
+
+static void test_start(void)
+{
+ env.subtest_num++;
+}
+
+static void test_fail(void)
+{
+ env.fail_cnt++;
+}
+
+static void test_pass(void)
+{
+ env.succ_cnt++;
+}
+
+static void test_reset(void)
+{
+ txmsg_start = txmsg_end = 0;
+ txmsg_start_pop = txmsg_pop = 0;
+ txmsg_start_push = txmsg_end_push = 0;
+ txmsg_pass = txmsg_drop = txmsg_redir = 0;
+ txmsg_apply = txmsg_cork = 0;
+ txmsg_ingress = txmsg_skb = 0;
+}
+
+static int test_start_subtest(const struct _test *t, struct sockmap_options *o)
+{
+ env.type = o->map;
+ env.subtest = t->title;
+ env.prepend = o->prepend;
+ env.test_num++;
+ env.subtest_num = 0;
+ env.fail_last = env.fail_cnt;
+ test_reset();
+ return 0;
+}
+
+static void test_end_subtest(void)
+{
+ int error = env.fail_cnt - env.fail_last;
+ int type = strcmp(env.type, BPF_SOCKMAP_FILENAME);
+
+ if (!error)
+ test_pass();
+
+ fprintf(stdout, "#%2d/%2d %8s:%s:%s:%s\n",
+ env.test_num, env.subtest_num,
+ !type ? "sockmap" : "sockhash",
+ env.prepend ? : "",
+ env.subtest, error ? "FAIL" : "OK");
+}
+
+static void test_print_results(void)
+{
+ fprintf(stdout, "Pass: %d Fail: %d\n",
+ env.succ_cnt, env.fail_cnt);
+}
+
static void usage(char *argv[])
{
int i;
return errno;
}
- if (verbose) {
+ if (verbose > 1) {
printf("connected sockets: c1 <-> p1, c2 <-> p2\n");
printf("cgroups binding: c1(%i) <-> s1(%i) - - - c2(%i) <-> s2(%i)\n",
c1, s1, c2, s2);
struct timespec end;
};
-struct sockmap_options {
- int verbose;
- bool base;
- bool sendpage;
- bool data_test;
- bool drop_expected;
- int iov_count;
- int iov_length;
- int rate;
-};
-
static int msg_loop_sendpage(int fd, int iov_length, int cnt,
struct msg_stats *s,
struct sockmap_options *opt)
clock_gettime(CLOCK_MONOTONIC, &s->start);
for (i = 0; i < cnt; i++) {
- int sent = sendfile(fd, fp, NULL, iov_length);
+ int sent;
+
+ errno = 0;
+ sent = sendfile(fd, fp, NULL, iov_length);
if (!drop && sent < 0) {
- perror("send loop error");
+ perror("sendpage loop error");
fclose(file);
return sent;
} else if (drop && sent >= 0) {
- printf("sendpage loop error expected: %i\n", sent);
+ printf("sendpage loop error expected: %i errno %i\n",
+ sent, errno);
fclose(file);
return -EIO;
}
if (tx) {
clock_gettime(CLOCK_MONOTONIC, &s->start);
for (i = 0; i < cnt; i++) {
- int sent = sendmsg(fd, &msg, flags);
+ int sent;
+
+ errno = 0;
+ sent = sendmsg(fd, &msg, flags);
if (!drop && sent < 0) {
- perror("send loop error");
+ perror("sendmsg loop error");
goto out_errno;
} else if (drop && sent >= 0) {
- printf("send loop error expected: %i\n", sent);
+ fprintf(stderr,
+ "sendmsg loop error expected: %i errno %i\n",
+ sent, errno);
errno = -EIO;
goto out_errno;
}
* paths.
*/
total_bytes = (float)iov_count * (float)iov_length * (float)cnt;
- txmsg_pop_total = txmsg_pop;
if (txmsg_apply)
- txmsg_pop_total *= (total_bytes / txmsg_apply);
+ txmsg_pop_total = txmsg_pop * (total_bytes / txmsg_apply);
+ else
+ txmsg_pop_total = txmsg_pop * cnt;
total_bytes -= txmsg_pop_total;
err = clock_gettime(CLOCK_MONOTONIC, &s->start);
if (err < 0)
rxpid = fork();
if (rxpid == 0) {
+ iov_buf -= (txmsg_pop - txmsg_start_pop + 1);
if (opt->drop_expected)
- exit(0);
+ _exit(0);
+
+ if (!iov_buf) /* zero bytes sent case */
+ _exit(0);
if (opt->sendpage)
iov_count = 1;
err = msg_loop(rx_fd, iov_count, iov_buf,
cnt, &s, false, opt);
- if (opt->verbose)
+ if (opt->verbose > 1)
fprintf(stderr,
"msg_loop_rx: iov_count %i iov_buf %i cnt %i err %i\n",
iov_count, iov_buf, cnt, err);
sent_Bps = sentBps(s);
recvd_Bps = recvdBps(s);
}
- if (opt->verbose)
+ if (opt->verbose > 1)
fprintf(stdout,
"rx_sendmsg: TX: %zuB %fB/s %fGB/s RX: %zuB %fB/s %fGB/s %s\n",
s.bytes_sent, sent_Bps, sent_Bps/giga,
sent_Bps = sentBps(s);
recvd_Bps = recvdBps(s);
}
- if (opt->verbose)
+ if (opt->verbose > 1)
fprintf(stdout,
"tx_sendmsg: TX: %zuB %fB/s %f GB/s RX: %zuB %fB/s %fGB/s\n",
s.bytes_sent, sent_Bps, sent_Bps/giga,
if (WIFEXITED(rx_status)) {
err = WEXITSTATUS(rx_status);
if (err) {
- fprintf(stderr, "rx thread exited with err %d. ", err);
+ fprintf(stderr, "rx thread exited with err %d.\n", err);
goto out;
}
}
if (WIFEXITED(tx_status)) {
err = WEXITSTATUS(tx_status);
if (err)
- fprintf(stderr, "tx thread exited with err %d. ", err);
+ fprintf(stderr, "tx thread exited with err %d.\n", err);
}
out:
return err;
}
enum {
+ SELFTESTS,
PING_PONG,
SENDMSG,
BASE,
/* Attach txmsg program to sockmap */
if (txmsg_pass)
tx_prog_fd = prog_fd[3];
- else if (txmsg_noisy)
- tx_prog_fd = prog_fd[4];
else if (txmsg_redir)
+ tx_prog_fd = prog_fd[4];
+ else if (txmsg_apply)
tx_prog_fd = prog_fd[5];
- else if (txmsg_redir_noisy)
+ else if (txmsg_cork)
tx_prog_fd = prog_fd[6];
else if (txmsg_drop)
- tx_prog_fd = prog_fd[9];
- /* apply and cork must be last */
- else if (txmsg_apply)
tx_prog_fd = prog_fd[7];
- else if (txmsg_cork)
- tx_prog_fd = prog_fd[8];
else
tx_prog_fd = 0;
goto out;
}
- if (txmsg_redir || txmsg_redir_noisy)
+ if (txmsg_redir)
redir_fd = c2;
else
redir_fd = c1;
if (txmsg_pass)
strncat(options, "pass,", OPTSTRING);
- if (txmsg_noisy)
- strncat(options, "pass_noisy,", OPTSTRING);
if (txmsg_redir)
strncat(options, "redir,", OPTSTRING);
- if (txmsg_redir_noisy)
- strncat(options, "redir_noisy,", OPTSTRING);
if (txmsg_drop)
strncat(options, "drop,", OPTSTRING);
if (txmsg_apply) {
test_options(options);
- fprintf(stdout,
- "[TEST %i]: (%i, %i, %i, %s, %s): ",
- test_cnt, opt->rate, opt->iov_count, opt->iov_length,
- test_to_str(test), options);
- fflush(stdout);
+ if (opt->verbose) {
+ fprintf(stdout,
+ " [TEST %i]: (%i, %i, %i, %s, %s): ",
+ test_cnt, opt->rate, opt->iov_count, opt->iov_length,
+ test_to_str(test), options);
+ fflush(stdout);
+ }
err = run_options(opt, cgrp, test);
- fprintf(stdout, "%s\n", !err ? "PASS" : "FAILED");
+ if (opt->verbose)
+ fprintf(stdout, " %s\n", !err ? "PASS" : "FAILED");
test_cnt++;
!err ? passed++ : failed++;
free(options);
return err;
}
-static int test_exec(int cgrp, struct sockmap_options *opt)
-{
- int err = __test_exec(cgrp, SENDMSG, opt);
-
- if (err)
- goto out;
-
- err = __test_exec(cgrp, SENDPAGE, opt);
-out:
- return err;
-}
-
-static int test_loop(int cgrp)
-{
- struct sockmap_options opt;
-
- int err, i, l, r;
-
- opt.verbose = 0;
- opt.base = false;
- opt.sendpage = false;
- opt.data_test = false;
- opt.drop_expected = false;
- opt.iov_count = 0;
- opt.iov_length = 0;
- opt.rate = 0;
-
- r = 1;
- for (i = 1; i < 100; i += 33) {
- for (l = 1; l < 100; l += 33) {
- opt.rate = r;
- opt.iov_count = i;
- opt.iov_length = l;
- err = test_exec(cgrp, &opt);
- if (err)
- goto out;
- }
- }
- sched_yield();
-out:
- return err;
-}
-
-static int test_txmsg(int cgrp)
+static void test_exec(int cgrp, struct sockmap_options *opt)
{
+ int type = strcmp(opt->map, BPF_SOCKMAP_FILENAME);
int err;
- txmsg_pass = txmsg_noisy = txmsg_redir_noisy = txmsg_drop = 0;
- txmsg_apply = txmsg_cork = 0;
- txmsg_ingress = txmsg_skb = 0;
-
- txmsg_pass = 1;
- err = test_loop(cgrp);
- txmsg_pass = 0;
- if (err)
- goto out;
-
- txmsg_redir = 1;
- err = test_loop(cgrp);
- txmsg_redir = 0;
- if (err)
- goto out;
-
- txmsg_drop = 1;
- err = test_loop(cgrp);
- txmsg_drop = 0;
- if (err)
- goto out;
-
- txmsg_redir = 1;
- txmsg_ingress = 1;
- err = test_loop(cgrp);
- txmsg_redir = 0;
- txmsg_ingress = 0;
- if (err)
- goto out;
-out:
- txmsg_pass = 0;
- txmsg_redir = 0;
- txmsg_drop = 0;
- return err;
+ if (type == 0) {
+ test_start();
+ err = __test_exec(cgrp, SENDMSG, opt);
+ if (err)
+ test_fail();
+ } else {
+ test_start();
+ err = __test_exec(cgrp, SENDPAGE, opt);
+ if (err)
+ test_fail();
+ }
}
-static int test_send(struct sockmap_options *opt, int cgrp)
+static void test_send_one(struct sockmap_options *opt, int cgrp)
{
- int err;
-
opt->iov_length = 1;
opt->iov_count = 1;
opt->rate = 1;
- err = test_exec(cgrp, opt);
- if (err)
- goto out;
+ test_exec(cgrp, opt);
opt->iov_length = 1;
opt->iov_count = 1024;
opt->rate = 1;
- err = test_exec(cgrp, opt);
- if (err)
- goto out;
+ test_exec(cgrp, opt);
opt->iov_length = 1024;
opt->iov_count = 1;
opt->rate = 1;
- err = test_exec(cgrp, opt);
- if (err)
- goto out;
+ test_exec(cgrp, opt);
- opt->iov_length = 1;
+}
+
+static void test_send_many(struct sockmap_options *opt, int cgrp)
+{
+ opt->iov_length = 3;
opt->iov_count = 1;
opt->rate = 512;
- err = test_exec(cgrp, opt);
- if (err)
- goto out;
-
- opt->iov_length = 256;
- opt->iov_count = 1024;
- opt->rate = 2;
- err = test_exec(cgrp, opt);
- if (err)
- goto out;
+ test_exec(cgrp, opt);
opt->rate = 100;
opt->iov_count = 1;
opt->iov_length = 5;
- err = test_exec(cgrp, opt);
- if (err)
- goto out;
-out:
- sched_yield();
- return err;
+ test_exec(cgrp, opt);
}
-static int test_mixed(int cgrp)
+static void test_send_large(struct sockmap_options *opt, int cgrp)
{
- struct sockmap_options opt = {0};
- int err;
+ opt->iov_length = 256;
+ opt->iov_count = 1024;
+ opt->rate = 2;
+ test_exec(cgrp, opt);
+}
- txmsg_pass = txmsg_noisy = txmsg_redir_noisy = txmsg_drop = 0;
- txmsg_apply = txmsg_cork = 0;
- txmsg_start = txmsg_end = 0;
- txmsg_start_push = txmsg_end_push = 0;
- txmsg_start_pop = txmsg_pop = 0;
+static void test_send(struct sockmap_options *opt, int cgrp)
+{
+ test_send_one(opt, cgrp);
+ test_send_many(opt, cgrp);
+ test_send_large(opt, cgrp);
+ sched_yield();
+}
+static void test_txmsg_pass(int cgrp, struct sockmap_options *opt)
+{
/* Test small and large iov_count values with pass/redir/apply/cork */
txmsg_pass = 1;
- txmsg_redir = 0;
- txmsg_apply = 1;
- txmsg_cork = 0;
- err = test_send(&opt, cgrp);
- if (err)
- goto out;
+ test_send(opt, cgrp);
+}
- txmsg_pass = 1;
- txmsg_redir = 0;
- txmsg_apply = 0;
- txmsg_cork = 1;
- err = test_send(&opt, cgrp);
- if (err)
- goto out;
+static void test_txmsg_redir(int cgrp, struct sockmap_options *opt)
+{
+ txmsg_redir = 1;
+ test_send(opt, cgrp);
+}
- txmsg_pass = 1;
- txmsg_redir = 0;
- txmsg_apply = 1;
- txmsg_cork = 1;
- err = test_send(&opt, cgrp);
- if (err)
- goto out;
+static void test_txmsg_drop(int cgrp, struct sockmap_options *opt)
+{
+ txmsg_drop = 1;
+ test_send(opt, cgrp);
+}
- txmsg_pass = 1;
- txmsg_redir = 0;
- txmsg_apply = 1024;
- txmsg_cork = 0;
- err = test_send(&opt, cgrp);
- if (err)
- goto out;
+static void test_txmsg_ingress_redir(int cgrp, struct sockmap_options *opt)
+{
+ txmsg_pass = txmsg_drop = 0;
+ txmsg_ingress = txmsg_redir = 1;
+ test_send(opt, cgrp);
+}
+/* Test cork with hung data. This tests poor usage patterns where
+ * cork can leave data on the ring if user program is buggy and
+ * doesn't flush them somehow. They do take some time however
+ * because they wait for a timeout. Test pass, redir and cork with
+ * apply logic. Use cork size of 4097 with send_large to avoid
+ * aligning cork size with send size.
+ */
+static void test_txmsg_cork_hangs(int cgrp, struct sockmap_options *opt)
+{
txmsg_pass = 1;
txmsg_redir = 0;
+ txmsg_cork = 4097;
+ txmsg_apply = 4097;
+ test_send_large(opt, cgrp);
+
+ txmsg_pass = 0;
+ txmsg_redir = 1;
txmsg_apply = 0;
- txmsg_cork = 1024;
- err = test_send(&opt, cgrp);
- if (err)
- goto out;
+ txmsg_cork = 4097;
+ test_send_large(opt, cgrp);
- txmsg_pass = 1;
+ txmsg_pass = 0;
+ txmsg_redir = 1;
+ txmsg_apply = 4097;
+ txmsg_cork = 4097;
+ test_send_large(opt, cgrp);
+}
+
+static void test_txmsg_pull(int cgrp, struct sockmap_options *opt)
+{
+ /* Test basic start/end */
+ txmsg_start = 1;
+ txmsg_end = 2;
+ test_send(opt, cgrp);
+
+ /* Test >4k pull */
+ txmsg_start = 4096;
+ txmsg_end = 9182;
+ test_send_large(opt, cgrp);
+
+ /* Test pull + redirect */
txmsg_redir = 0;
- txmsg_apply = 1024;
- txmsg_cork = 1024;
- err = test_send(&opt, cgrp);
- if (err)
- goto out;
+ txmsg_start = 1;
+ txmsg_end = 2;
+ test_send(opt, cgrp);
- txmsg_pass = 1;
+ /* Test pull + cork */
txmsg_redir = 0;
- txmsg_cork = 4096;
- txmsg_apply = 4096;
- err = test_send(&opt, cgrp);
- if (err)
- goto out;
+ txmsg_cork = 512;
+ txmsg_start = 1;
+ txmsg_end = 2;
+ test_send_many(opt, cgrp);
- txmsg_pass = 0;
+ /* Test pull + cork + redirect */
txmsg_redir = 1;
- txmsg_apply = 1;
- txmsg_cork = 0;
- err = test_send(&opt, cgrp);
- if (err)
- goto out;
+ txmsg_cork = 512;
+ txmsg_start = 1;
+ txmsg_end = 2;
+ test_send_many(opt, cgrp);
+}
- txmsg_pass = 0;
- txmsg_redir = 1;
- txmsg_apply = 0;
- txmsg_cork = 1;
- err = test_send(&opt, cgrp);
- if (err)
- goto out;
+static void test_txmsg_pop(int cgrp, struct sockmap_options *opt)
+{
+ /* Test basic pop */
+ txmsg_start_pop = 1;
+ txmsg_pop = 2;
+ test_send_many(opt, cgrp);
- txmsg_pass = 0;
- txmsg_redir = 1;
- txmsg_apply = 1024;
- txmsg_cork = 0;
- err = test_send(&opt, cgrp);
- if (err)
- goto out;
+ /* Test pop with >4k */
+ txmsg_start_pop = 4096;
+ txmsg_pop = 4096;
+ test_send_large(opt, cgrp);
- txmsg_pass = 0;
+ /* Test pop + redirect */
txmsg_redir = 1;
- txmsg_apply = 0;
- txmsg_cork = 1024;
- err = test_send(&opt, cgrp);
- if (err)
- goto out;
+ txmsg_start_pop = 1;
+ txmsg_pop = 2;
+ test_send_many(opt, cgrp);
- txmsg_pass = 0;
- txmsg_redir = 1;
- txmsg_apply = 1024;
- txmsg_cork = 1024;
- err = test_send(&opt, cgrp);
- if (err)
- goto out;
+ /* Test pop + cork */
+ txmsg_redir = 0;
+ txmsg_cork = 512;
+ txmsg_start_pop = 1;
+ txmsg_pop = 2;
+ test_send_many(opt, cgrp);
- txmsg_pass = 0;
+ /* Test pop + redirect + cork */
txmsg_redir = 1;
- txmsg_cork = 4096;
- txmsg_apply = 4096;
- err = test_send(&opt, cgrp);
- if (err)
- goto out;
-out:
- return err;
+ txmsg_cork = 4;
+ txmsg_start_pop = 1;
+ txmsg_pop = 2;
+ test_send_many(opt, cgrp);
}
-static int test_start_end(int cgrp)
+static void test_txmsg_push(int cgrp, struct sockmap_options *opt)
{
- struct sockmap_options opt = {0};
- int err, i;
+ /* Test basic push */
+ txmsg_start_push = 1;
+ txmsg_end_push = 1;
+ test_send(opt, cgrp);
- /* Test basic start/end with lots of iov_count and iov_lengths */
- txmsg_start = 1;
- txmsg_end = 2;
+ /* Test push 4kB >4k */
+ txmsg_start_push = 4096;
+ txmsg_end_push = 4096;
+ test_send_large(opt, cgrp);
+
+ /* Test push + redirect */
+ txmsg_redir = 1;
txmsg_start_push = 1;
txmsg_end_push = 2;
- txmsg_start_pop = 1;
- txmsg_pop = 1;
- err = test_txmsg(cgrp);
- if (err)
- goto out;
+ test_send_many(opt, cgrp);
- /* Cut a byte of pushed data but leave reamining in place */
- txmsg_start = 1;
- txmsg_end = 2;
+ /* Test push + cork */
+ txmsg_redir = 0;
+ txmsg_cork = 512;
txmsg_start_push = 1;
- txmsg_end_push = 3;
- txmsg_start_pop = 1;
- txmsg_pop = 1;
- err = test_txmsg(cgrp);
- if (err)
- goto out;
-
- /* Test start/end with cork */
- opt.rate = 16;
- opt.iov_count = 1;
- opt.iov_length = 100;
- txmsg_cork = 1600;
-
- txmsg_start_pop = 0;
- txmsg_pop = 0;
-
- for (i = 99; i <= 1600; i += 500) {
- txmsg_start = 0;
- txmsg_end = i;
- txmsg_start_push = 0;
- txmsg_end_push = i;
- err = test_exec(cgrp, &opt);
- if (err)
- goto out;
- }
-
- /* Test pop data in middle of cork */
- for (i = 99; i <= 1600; i += 500) {
- txmsg_start_pop = 10;
- txmsg_pop = i;
- err = test_exec(cgrp, &opt);
- if (err)
- goto out;
- }
- txmsg_start_pop = 0;
- txmsg_pop = 0;
-
- /* Test start/end with cork but pull data in middle */
- for (i = 199; i <= 1600; i += 500) {
- txmsg_start = 100;
- txmsg_end = i;
- txmsg_start_push = 100;
- txmsg_end_push = i;
- err = test_exec(cgrp, &opt);
- if (err)
- goto out;
- }
-
- /* Test start/end with cork pulling last sg entry */
- txmsg_start = 1500;
- txmsg_end = 1600;
- txmsg_start_push = 1500;
- txmsg_end_push = 1600;
- err = test_exec(cgrp, &opt);
- if (err)
- goto out;
+ txmsg_end_push = 2;
+ test_send_many(opt, cgrp);
+}
- /* Test pop with cork pulling last sg entry */
- txmsg_start_pop = 1500;
- txmsg_pop = 1600;
- err = test_exec(cgrp, &opt);
- if (err)
- goto out;
- txmsg_start_pop = 0;
- txmsg_pop = 0;
-
- /* Test start/end pull of single byte in last page */
- txmsg_start = 1111;
- txmsg_end = 1112;
- txmsg_start_push = 1111;
- txmsg_end_push = 1112;
- err = test_exec(cgrp, &opt);
- if (err)
- goto out;
+static void test_txmsg_push_pop(int cgrp, struct sockmap_options *opt)
+{
+ txmsg_start_push = 1;
+ txmsg_end_push = 10;
+ txmsg_start_pop = 5;
+ txmsg_pop = 4;
+ test_send_large(opt, cgrp);
+}
- /* Test pop of single byte in last page */
- txmsg_start_pop = 1111;
- txmsg_pop = 1112;
- err = test_exec(cgrp, &opt);
- if (err)
- goto out;
+static void test_txmsg_apply(int cgrp, struct sockmap_options *opt)
+{
+ txmsg_pass = 1;
+ txmsg_redir = 0;
+ txmsg_apply = 1;
+ txmsg_cork = 0;
+ test_send_one(opt, cgrp);
- /* Test start/end with end < start */
- txmsg_start = 1111;
- txmsg_end = 0;
- txmsg_start_push = 1111;
- txmsg_end_push = 0;
- err = test_exec(cgrp, &opt);
- if (err)
- goto out;
+ txmsg_pass = 0;
+ txmsg_redir = 1;
+ txmsg_apply = 1;
+ txmsg_cork = 0;
+ test_send_one(opt, cgrp);
- /* Test start/end with end > data */
- txmsg_start = 0;
- txmsg_end = 1601;
- txmsg_start_push = 0;
- txmsg_end_push = 1601;
- err = test_exec(cgrp, &opt);
- if (err)
- goto out;
+ txmsg_pass = 1;
+ txmsg_redir = 0;
+ txmsg_apply = 1024;
+ txmsg_cork = 0;
+ test_send_large(opt, cgrp);
- /* Test start/end with start > data */
- txmsg_start = 1601;
- txmsg_end = 1600;
- txmsg_start_push = 1601;
- txmsg_end_push = 1600;
- err = test_exec(cgrp, &opt);
- if (err)
- goto out;
+ txmsg_pass = 0;
+ txmsg_redir = 1;
+ txmsg_apply = 1024;
+ txmsg_cork = 0;
+ test_send_large(opt, cgrp);
+}
- /* Test pop with start > data */
- txmsg_start_pop = 1601;
- txmsg_pop = 1;
- err = test_exec(cgrp, &opt);
- if (err)
- goto out;
+static void test_txmsg_cork(int cgrp, struct sockmap_options *opt)
+{
+ txmsg_pass = 1;
+ txmsg_redir = 0;
+ txmsg_apply = 0;
+ txmsg_cork = 1;
+ test_send(opt, cgrp);
- /* Test pop with pop range > data */
- txmsg_start_pop = 1599;
- txmsg_pop = 10;
- err = test_exec(cgrp, &opt);
-out:
- txmsg_start = 0;
- txmsg_end = 0;
- sched_yield();
- return err;
+ txmsg_pass = 1;
+ txmsg_redir = 0;
+ txmsg_apply = 1;
+ txmsg_cork = 1;
+ test_send(opt, cgrp);
}
char *map_names[] = {
return 0;
}
-static int __test_suite(int cg_fd, char *bpf_file)
+struct _test test[] = {
+ {"txmsg test passthrough", test_txmsg_pass},
+ {"txmsg test redirect", test_txmsg_redir},
+ {"txmsg test drop", test_txmsg_drop},
+ {"txmsg test ingress redirect", test_txmsg_ingress_redir},
+ {"txmsg test apply", test_txmsg_apply},
+ {"txmsg test cork", test_txmsg_cork},
+ {"txmsg test hanging corks", test_txmsg_cork_hangs},
+ {"txmsg test push_data", test_txmsg_push},
+ {"txmsg test pull-data", test_txmsg_pull},
+ {"txmsg test pop-data", test_txmsg_pop},
+ {"txmsg test push/pop data", test_txmsg_push_pop},
+};
+
+static int check_whitelist(struct _test *t, struct sockmap_options *opt)
{
- int err, cleanup = cg_fd;
+ char *entry, *ptr;
+
+ if (!opt->whitelist)
+ return 0;
+ ptr = strdup(opt->whitelist);
+ if (!ptr)
+ return -ENOMEM;
+ entry = strtok(ptr, ",");
+ while (entry) {
+ if ((opt->prepend && strstr(opt->prepend, entry) != 0) ||
+ strstr(opt->map, entry) != 0 ||
+ strstr(t->title, entry) != 0)
+ return 0;
+ entry = strtok(NULL, ",");
+ }
+ return -EINVAL;
+}
- err = populate_progs(bpf_file);
+static int check_blacklist(struct _test *t, struct sockmap_options *opt)
+{
+ char *entry, *ptr;
+
+ if (!opt->blacklist)
+ return -EINVAL;
+ ptr = strdup(opt->blacklist);
+ if (!ptr)
+ return -ENOMEM;
+ entry = strtok(ptr, ",");
+ while (entry) {
+ if ((opt->prepend && strstr(opt->prepend, entry) != 0) ||
+ strstr(opt->map, entry) != 0 ||
+ strstr(t->title, entry) != 0)
+ return 0;
+ entry = strtok(NULL, ",");
+ }
+ return -EINVAL;
+}
+
+static int __test_selftests(int cg_fd, struct sockmap_options *opt)
+{
+ int i, err;
+
+ err = populate_progs(opt->map);
if (err < 0) {
fprintf(stderr, "ERROR: (%i) load bpf failed\n", err);
return err;
}
- if (cg_fd < 0) {
- if (setup_cgroup_environment()) {
- fprintf(stderr, "ERROR: cgroup env failed\n");
- return -EINVAL;
- }
+ /* Tests basic commands and APIs */
+ for (i = 0; i < sizeof(test)/sizeof(struct _test); i++) {
+ struct _test t = test[i];
- cg_fd = create_and_get_cgroup(CG_PATH);
- if (cg_fd < 0) {
- fprintf(stderr,
- "ERROR: (%i) open cg path failed: %s\n",
- cg_fd, optarg);
- return cg_fd;
- }
+ if (check_whitelist(&t, opt) != 0)
+ continue;
+ if (check_blacklist(&t, opt) == 0)
+ continue;
- if (join_cgroup(CG_PATH)) {
- fprintf(stderr, "ERROR: failed to join cgroup\n");
- return -EINVAL;
- }
+ test_start_subtest(&t, opt);
+ t.tester(cg_fd, opt);
+ test_end_subtest();
}
- /* Tests basic commands and APIs with range of iov values */
- txmsg_start = txmsg_end = txmsg_start_push = txmsg_end_push = 0;
- err = test_txmsg(cg_fd);
- if (err)
- goto out;
+ return err;
+}
- /* Tests interesting combinations of APIs used together */
- err = test_mixed(cg_fd);
- if (err)
- goto out;
+static void test_selftests_sockmap(int cg_fd, struct sockmap_options *opt)
+{
+ opt->map = BPF_SOCKMAP_FILENAME;
+ __test_selftests(cg_fd, opt);
+}
- /* Tests pull_data API using start/end API */
- err = test_start_end(cg_fd);
- if (err)
- goto out;
+static void test_selftests_sockhash(int cg_fd, struct sockmap_options *opt)
+{
+ opt->map = BPF_SOCKHASH_FILENAME;
+ __test_selftests(cg_fd, opt);
+}
-out:
- printf("Summary: %i PASSED %i FAILED\n", passed, failed);
- if (cleanup < 0) {
- cleanup_cgroup_environment();
- close(cg_fd);
- }
- return err;
+static void test_selftests_ktls(int cg_fd, struct sockmap_options *opt)
+{
+ opt->map = BPF_SOCKHASH_FILENAME;
+ opt->prepend = "ktls";
+ ktls = 1;
+ __test_selftests(cg_fd, opt);
+ ktls = 0;
}
-static int test_suite(int cg_fd)
+static int test_selftest(int cg_fd, struct sockmap_options *opt)
{
- int err;
- err = __test_suite(cg_fd, BPF_SOCKMAP_FILENAME);
- if (err)
- goto out;
- err = __test_suite(cg_fd, BPF_SOCKHASH_FILENAME);
-out:
- if (cg_fd > -1)
- close(cg_fd);
- return err;
+ test_selftests_sockmap(cg_fd, opt);
+ test_selftests_sockhash(cg_fd, opt);
+ test_selftests_ktls(cg_fd, opt);
+ test_print_results();
+ return 0;
}
int main(int argc, char **argv)
struct sockmap_options options = {0};
int opt, longindex, err, cg_fd = 0;
char *bpf_file = BPF_SOCKMAP_FILENAME;
- int test = PING_PONG;
+ int test = SELFTESTS;
+ bool cg_created = 0;
- if (argc < 2)
- return test_suite(-1);
-
- while ((opt = getopt_long(argc, argv, ":dhvc:r:i:l:t:p:q:",
+ while ((opt = getopt_long(argc, argv, ":dhv:c:r:i:l:t:p:q:n:b:",
long_options, &longindex)) != -1) {
switch (opt) {
case 's':
break;
case 'v':
options.verbose = 1;
+ if (optarg)
+ options.verbose = atoi(optarg);
break;
case 'i':
iov_count = atoi(optarg);
return -1;
}
break;
+ case 'n':
+ options.whitelist = strdup(optarg);
+ if (!options.whitelist)
+ return -ENOMEM;
+ break;
+ case 'b':
+ options.blacklist = strdup(optarg);
+ if (!options.blacklist)
+ return -ENOMEM;
case 0:
break;
case 'h':
}
}
- if (argc <= 3 && cg_fd)
- return test_suite(cg_fd);
-
if (!cg_fd) {
- fprintf(stderr, "%s requires cgroup option: --cgroup <path>\n",
- argv[0]);
- return -1;
+ if (setup_cgroup_environment()) {
+ fprintf(stderr, "ERROR: cgroup env failed\n");
+ return -EINVAL;
+ }
+
+ cg_fd = create_and_get_cgroup(CG_PATH);
+ if (cg_fd < 0) {
+ fprintf(stderr,
+ "ERROR: (%i) open cg path failed: %s\n",
+ cg_fd, strerror(errno));
+ return cg_fd;
+ }
+
+ if (join_cgroup(CG_PATH)) {
+ fprintf(stderr, "ERROR: failed to join cgroup\n");
+ return -EINVAL;
+ }
+ cg_created = 1;
+ }
+
+ if (test == SELFTESTS) {
+ err = test_selftest(cg_fd, &options);
+ goto out;
}
err = populate_progs(bpf_file);
options.rate = rate;
err = run_options(&options, cg_fd, test);
+out:
+ if (options.whitelist)
+ free(options.whitelist);
+ if (options.blacklist)
+ free(options.blacklist);
+ if (cg_created)
+ cleanup_cgroup_environment();
close(cg_fd);
return err;
}
+++ /dev/null
-/* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (c) 2017-2018 Covalent IO, Inc. http://covalent.io */
-#include <stddef.h>
-#include <string.h>
-#include <linux/bpf.h>
-#include <linux/if_ether.h>
-#include <linux/if_packet.h>
-#include <linux/ip.h>
-#include <linux/ipv6.h>
-#include <linux/in.h>
-#include <linux/udp.h>
-#include <linux/tcp.h>
-#include <linux/pkt_cls.h>
-#include <sys/socket.h>
-#include <bpf/bpf_helpers.h>
-#include <bpf/bpf_endian.h>
-
-/* Sockmap sample program connects a client and a backend together
- * using cgroups.
- *
- * client:X <---> frontend:80 client:X <---> backend:80
- *
- * For simplicity we hard code values here and bind 1:1. The hard
- * coded values are part of the setup in sockmap.sh script that
- * is associated with this BPF program.
- *
- * The bpf_printk is verbose and prints information as connections
- * are established and verdicts are decided.
- */
-
-struct {
- __uint(type, TEST_MAP_TYPE);
- __uint(max_entries, 20);
- __uint(key_size, sizeof(int));
- __uint(value_size, sizeof(int));
-} sock_map SEC(".maps");
-
-struct {
- __uint(type, TEST_MAP_TYPE);
- __uint(max_entries, 20);
- __uint(key_size, sizeof(int));
- __uint(value_size, sizeof(int));
-} sock_map_txmsg SEC(".maps");
-
-struct {
- __uint(type, TEST_MAP_TYPE);
- __uint(max_entries, 20);
- __uint(key_size, sizeof(int));
- __uint(value_size, sizeof(int));
-} sock_map_redir SEC(".maps");
-
-struct {
- __uint(type, BPF_MAP_TYPE_ARRAY);
- __uint(max_entries, 1);
- __type(key, int);
- __type(value, int);
-} sock_apply_bytes SEC(".maps");
-
-struct {
- __uint(type, BPF_MAP_TYPE_ARRAY);
- __uint(max_entries, 1);
- __type(key, int);
- __type(value, int);
-} sock_cork_bytes SEC(".maps");
-
-struct {
- __uint(type, BPF_MAP_TYPE_ARRAY);
- __uint(max_entries, 6);
- __type(key, int);
- __type(value, int);
-} sock_bytes SEC(".maps");
-
-struct {
- __uint(type, BPF_MAP_TYPE_ARRAY);
- __uint(max_entries, 1);
- __type(key, int);
- __type(value, int);
-} sock_redir_flags SEC(".maps");
-
-struct {
- __uint(type, BPF_MAP_TYPE_ARRAY);
- __uint(max_entries, 1);
- __type(key, int);
- __type(value, int);
-} sock_skb_opts SEC(".maps");
-
-SEC("sk_skb1")
-int bpf_prog1(struct __sk_buff *skb)
-{
- return skb->len;
-}
-
-SEC("sk_skb2")
-int bpf_prog2(struct __sk_buff *skb)
-{
- __u32 lport = skb->local_port;
- __u32 rport = skb->remote_port;
- int len, *f, ret, zero = 0;
- __u64 flags = 0;
-
- if (lport == 10000)
- ret = 10;
- else
- ret = 1;
-
- len = (__u32)skb->data_end - (__u32)skb->data;
- f = bpf_map_lookup_elem(&sock_skb_opts, &zero);
- if (f && *f) {
- ret = 3;
- flags = *f;
- }
-
- bpf_printk("sk_skb2: redirect(%iB) flags=%i\n",
- len, flags);
-#ifdef SOCKMAP
- return bpf_sk_redirect_map(skb, &sock_map, ret, flags);
-#else
- return bpf_sk_redirect_hash(skb, &sock_map, &ret, flags);
-#endif
-
-}
-
-SEC("sockops")
-int bpf_sockmap(struct bpf_sock_ops *skops)
-{
- __u32 lport, rport;
- int op, err = 0, index, key, ret;
-
-
- op = (int) skops->op;
-
- switch (op) {
- case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
- lport = skops->local_port;
- rport = skops->remote_port;
-
- if (lport == 10000) {
- ret = 1;
-#ifdef SOCKMAP
- err = bpf_sock_map_update(skops, &sock_map, &ret,
- BPF_NOEXIST);
-#else
- err = bpf_sock_hash_update(skops, &sock_map, &ret,
- BPF_NOEXIST);
-#endif
- bpf_printk("passive(%i -> %i) map ctx update err: %d\n",
- lport, bpf_ntohl(rport), err);
- }
- break;
- case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
- lport = skops->local_port;
- rport = skops->remote_port;
-
- if (bpf_ntohl(rport) == 10001) {
- ret = 10;
-#ifdef SOCKMAP
- err = bpf_sock_map_update(skops, &sock_map, &ret,
- BPF_NOEXIST);
-#else
- err = bpf_sock_hash_update(skops, &sock_map, &ret,
- BPF_NOEXIST);
-#endif
- bpf_printk("active(%i -> %i) map ctx update err: %d\n",
- lport, bpf_ntohl(rport), err);
- }
- break;
- default:
- break;
- }
-
- return 0;
-}
-
-SEC("sk_msg1")
-int bpf_prog4(struct sk_msg_md *msg)
-{
- int *bytes, zero = 0, one = 1, two = 2, three = 3, four = 4, five = 5;
- int *start, *end, *start_push, *end_push, *start_pop, *pop;
-
- bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero);
- if (bytes)
- bpf_msg_apply_bytes(msg, *bytes);
- bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero);
- if (bytes)
- bpf_msg_cork_bytes(msg, *bytes);
- start = bpf_map_lookup_elem(&sock_bytes, &zero);
- end = bpf_map_lookup_elem(&sock_bytes, &one);
- if (start && end)
- bpf_msg_pull_data(msg, *start, *end, 0);
- start_push = bpf_map_lookup_elem(&sock_bytes, &two);
- end_push = bpf_map_lookup_elem(&sock_bytes, &three);
- if (start_push && end_push)
- bpf_msg_push_data(msg, *start_push, *end_push, 0);
- start_pop = bpf_map_lookup_elem(&sock_bytes, &four);
- pop = bpf_map_lookup_elem(&sock_bytes, &five);
- if (start_pop && pop)
- bpf_msg_pop_data(msg, *start_pop, *pop, 0);
- return SK_PASS;
-}
-
-SEC("sk_msg2")
-int bpf_prog5(struct sk_msg_md *msg)
-{
- int zero = 0, one = 1, two = 2, three = 3, four = 4, five = 5;
- int *start, *end, *start_push, *end_push, *start_pop, *pop;
- int *bytes, len1, len2 = 0, len3, len4;
- int err1 = -1, err2 = -1;
-
- bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero);
- if (bytes)
- err1 = bpf_msg_apply_bytes(msg, *bytes);
- bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero);
- if (bytes)
- err2 = bpf_msg_cork_bytes(msg, *bytes);
- len1 = (__u64)msg->data_end - (__u64)msg->data;
- start = bpf_map_lookup_elem(&sock_bytes, &zero);
- end = bpf_map_lookup_elem(&sock_bytes, &one);
- if (start && end) {
- int err;
-
- bpf_printk("sk_msg2: pull(%i:%i)\n",
- start ? *start : 0, end ? *end : 0);
- err = bpf_msg_pull_data(msg, *start, *end, 0);
- if (err)
- bpf_printk("sk_msg2: pull_data err %i\n",
- err);
- len2 = (__u64)msg->data_end - (__u64)msg->data;
- bpf_printk("sk_msg2: length update %i->%i\n",
- len1, len2);
- }
-
- start_push = bpf_map_lookup_elem(&sock_bytes, &two);
- end_push = bpf_map_lookup_elem(&sock_bytes, &three);
- if (start_push && end_push) {
- int err;
-
- bpf_printk("sk_msg2: push(%i:%i)\n",
- start_push ? *start_push : 0,
- end_push ? *end_push : 0);
- err = bpf_msg_push_data(msg, *start_push, *end_push, 0);
- if (err)
- bpf_printk("sk_msg2: push_data err %i\n", err);
- len3 = (__u64)msg->data_end - (__u64)msg->data;
- bpf_printk("sk_msg2: length push_update %i->%i\n",
- len2 ? len2 : len1, len3);
- }
- start_pop = bpf_map_lookup_elem(&sock_bytes, &four);
- pop = bpf_map_lookup_elem(&sock_bytes, &five);
- if (start_pop && pop) {
- int err;
-
- bpf_printk("sk_msg2: pop(%i@%i)\n",
- start_pop, pop);
- err = bpf_msg_pop_data(msg, *start_pop, *pop, 0);
- if (err)
- bpf_printk("sk_msg2: pop_data err %i\n", err);
- len4 = (__u64)msg->data_end - (__u64)msg->data;
- bpf_printk("sk_msg2: length pop_data %i->%i\n",
- len1 ? len1 : 0, len4);
- }
-
- bpf_printk("sk_msg2: data length %i err1 %i err2 %i\n",
- len1, err1, err2);
- return SK_PASS;
-}
-
-SEC("sk_msg3")
-int bpf_prog6(struct sk_msg_md *msg)
-{
- int zero = 0, one = 1, two = 2, three = 3, four = 4, five = 5, key = 0;
- int *bytes, *start, *end, *start_push, *end_push, *start_pop, *pop, *f;
- __u64 flags = 0;
-
- bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero);
- if (bytes)
- bpf_msg_apply_bytes(msg, *bytes);
- bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero);
- if (bytes)
- bpf_msg_cork_bytes(msg, *bytes);
-
- start = bpf_map_lookup_elem(&sock_bytes, &zero);
- end = bpf_map_lookup_elem(&sock_bytes, &one);
- if (start && end)
- bpf_msg_pull_data(msg, *start, *end, 0);
-
- start_push = bpf_map_lookup_elem(&sock_bytes, &two);
- end_push = bpf_map_lookup_elem(&sock_bytes, &three);
- if (start_push && end_push)
- bpf_msg_push_data(msg, *start_push, *end_push, 0);
-
- start_pop = bpf_map_lookup_elem(&sock_bytes, &four);
- pop = bpf_map_lookup_elem(&sock_bytes, &five);
- if (start_pop && pop)
- bpf_msg_pop_data(msg, *start_pop, *pop, 0);
-
- f = bpf_map_lookup_elem(&sock_redir_flags, &zero);
- if (f && *f) {
- key = 2;
- flags = *f;
- }
-#ifdef SOCKMAP
- return bpf_msg_redirect_map(msg, &sock_map_redir, key, flags);
-#else
- return bpf_msg_redirect_hash(msg, &sock_map_redir, &key, flags);
-#endif
-}
-
-SEC("sk_msg4")
-int bpf_prog7(struct sk_msg_md *msg)
-{
- int *bytes, *start, *end, *start_push, *end_push, *start_pop, *pop, *f;
- int zero = 0, one = 1, two = 2, three = 3, four = 4, five = 5;
- int len1, len2 = 0, len3, len4;
- int err1 = 0, err2 = 0, key = 0;
- __u64 flags = 0;
-
- int err;
- bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero);
- if (bytes)
- err1 = bpf_msg_apply_bytes(msg, *bytes);
- bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero);
- if (bytes)
- err2 = bpf_msg_cork_bytes(msg, *bytes);
- len1 = (__u64)msg->data_end - (__u64)msg->data;
-
- start = bpf_map_lookup_elem(&sock_bytes, &zero);
- end = bpf_map_lookup_elem(&sock_bytes, &one);
- if (start && end) {
- bpf_printk("sk_msg2: pull(%i:%i)\n",
- start ? *start : 0, end ? *end : 0);
- err = bpf_msg_pull_data(msg, *start, *end, 0);
- if (err)
- bpf_printk("sk_msg2: pull_data err %i\n",
- err);
- len2 = (__u64)msg->data_end - (__u64)msg->data;
- bpf_printk("sk_msg2: length update %i->%i\n",
- len1, len2);
- }
-
- start_push = bpf_map_lookup_elem(&sock_bytes, &two);
- end_push = bpf_map_lookup_elem(&sock_bytes, &three);
- if (start_push && end_push) {
- bpf_printk("sk_msg4: push(%i:%i)\n",
- start_push ? *start_push : 0,
- end_push ? *end_push : 0);
- err = bpf_msg_push_data(msg, *start_push, *end_push, 0);
- if (err)
- bpf_printk("sk_msg4: push_data err %i\n",
- err);
- len3 = (__u64)msg->data_end - (__u64)msg->data;
- bpf_printk("sk_msg4: length push_update %i->%i\n",
- len2 ? len2 : len1, len3);
- }
-
- start_pop = bpf_map_lookup_elem(&sock_bytes, &four);
- pop = bpf_map_lookup_elem(&sock_bytes, &five);
- if (start_pop && pop) {
- int err;
-
- bpf_printk("sk_msg4: pop(%i@%i)\n",
- start_pop, pop);
- err = bpf_msg_pop_data(msg, *start_pop, *pop, 0);
- if (err)
- bpf_printk("sk_msg4: pop_data err %i\n", err);
- len4 = (__u64)msg->data_end - (__u64)msg->data;
- bpf_printk("sk_msg4: length pop_data %i->%i\n",
- len1 ? len1 : 0, len4);
- }
-
-
- f = bpf_map_lookup_elem(&sock_redir_flags, &zero);
- if (f && *f) {
- key = 2;
- flags = *f;
- }
- bpf_printk("sk_msg3: redirect(%iB) flags=%i err=%i\n",
- len1, flags, err1 ? err1 : err2);
-#ifdef SOCKMAP
- err = bpf_msg_redirect_map(msg, &sock_map_redir, key, flags);
-#else
- err = bpf_msg_redirect_hash(msg, &sock_map_redir, &key, flags);
-#endif
- bpf_printk("sk_msg3: err %i\n", err);
- return err;
-}
-
-SEC("sk_msg5")
-int bpf_prog8(struct sk_msg_md *msg)
-{
- void *data_end = (void *)(long) msg->data_end;
- void *data = (void *)(long) msg->data;
- int ret = 0, *bytes, zero = 0;
-
- bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero);
- if (bytes) {
- ret = bpf_msg_apply_bytes(msg, *bytes);
- if (ret)
- return SK_DROP;
- } else {
- return SK_DROP;
- }
- return SK_PASS;
-}
-SEC("sk_msg6")
-int bpf_prog9(struct sk_msg_md *msg)
-{
- void *data_end = (void *)(long) msg->data_end;
- void *data = (void *)(long) msg->data;
- int ret = 0, *bytes, zero = 0;
-
- bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero);
- if (bytes) {
- if (((__u64)data_end - (__u64)data) >= *bytes)
- return SK_PASS;
- ret = bpf_msg_cork_bytes(msg, *bytes);
- if (ret)
- return SK_DROP;
- }
- return SK_PASS;
-}
-
-SEC("sk_msg7")
-int bpf_prog10(struct sk_msg_md *msg)
-{
- int *bytes, *start, *end, *start_push, *end_push, *start_pop, *pop;
- int zero = 0, one = 1, two = 2, three = 3, four = 4, five = 5;
-
- bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero);
- if (bytes)
- bpf_msg_apply_bytes(msg, *bytes);
- bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero);
- if (bytes)
- bpf_msg_cork_bytes(msg, *bytes);
- start = bpf_map_lookup_elem(&sock_bytes, &zero);
- end = bpf_map_lookup_elem(&sock_bytes, &one);
- if (start && end)
- bpf_msg_pull_data(msg, *start, *end, 0);
- start_push = bpf_map_lookup_elem(&sock_bytes, &two);
- end_push = bpf_map_lookup_elem(&sock_bytes, &three);
- if (start_push && end_push)
- bpf_msg_push_data(msg, *start_push, *end_push, 0);
- start_pop = bpf_map_lookup_elem(&sock_bytes, &four);
- pop = bpf_map_lookup_elem(&sock_bytes, &five);
- if (start_pop && pop)
- bpf_msg_pop_data(msg, *start_pop, *pop, 0);
- bpf_printk("return sk drop\n");
- return SK_DROP;
-}
-
-int _version SEC("version") = 1;
-char _license[] SEC("license") = "GPL";
.result = REJECT,
.errstr = "invalid mem access",
},
+{
+ "reference tracking: branch tracking valid pointer null comparison",
+ .insns = {
+ BPF_SK_LOOKUP(sk_lookup_tcp),
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_0),
+ BPF_MOV64_IMM(BPF_REG_3, 1),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_6, 0, 1),
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 2),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+ BPF_EMIT_CALL(BPF_FUNC_sk_release),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+},
+{
+ "reference tracking: branch tracking valid pointer value comparison",
+ .insns = {
+ BPF_SK_LOOKUP(sk_lookup_tcp),
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_0),
+ BPF_MOV64_IMM(BPF_REG_3, 1),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 4),
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 1234, 2),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+ BPF_EMIT_CALL(BPF_FUNC_sk_release),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .errstr = "Unreleased reference",
+ .result = REJECT,
+},
.result_unpriv = REJECT,
.flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
},
+{
+ "map lookup and null branch prediction",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_1, 10),
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1, -8),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_0),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 2),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_6, 0, 1),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_10, 10),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_8b = { 4 },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+},
ksft_skip=4
# all tests in this script. Can be overridden with -t option
-IPV4_TESTS="ipv4_fcnal ipv4_grp_fcnal ipv4_withv6_fcnal ipv4_fcnal_runtime ipv4_compat_mode"
-IPV6_TESTS="ipv6_fcnal ipv6_grp_fcnal ipv6_fcnal_runtime ipv6_compat_mode"
+IPV4_TESTS="ipv4_fcnal ipv4_grp_fcnal ipv4_withv6_fcnal ipv4_fcnal_runtime ipv4_compat_mode ipv4_fdb_grp_fcnal"
+IPV6_TESTS="ipv6_fcnal ipv6_grp_fcnal ipv6_fcnal_runtime ipv6_compat_mode ipv6_fdb_grp_fcnal"
ALL_TESTS="basic ${IPV4_TESTS} ${IPV6_TESTS}"
TESTS="${ALL_TESTS}"
create_ns remote
IP="ip -netns me"
+ BRIDGE="bridge -netns me"
set -e
$IP li add veth1 type veth peer name veth2
$IP li set veth1 up
return $rc
}
+check_nexthop_fdb_support()
+{
+ $IP nexthop help 2>&1 | grep -q fdb
+ if [ $? -ne 0 ]; then
+ echo "SKIP: iproute2 too old, missing fdb nexthop support"
+ return $ksft_skip
+ fi
+}
+
+ipv6_fdb_grp_fcnal()
+{
+ local rc
+
+ echo
+ echo "IPv6 fdb groups functional"
+ echo "--------------------------"
+
+ check_nexthop_fdb_support
+ if [ $? -eq $ksft_skip ]; then
+ return $ksft_skip
+ fi
+
+ # create group with multiple nexthops
+ run_cmd "$IP nexthop add id 61 via 2001:db8:91::2 fdb"
+ run_cmd "$IP nexthop add id 62 via 2001:db8:91::3 fdb"
+ run_cmd "$IP nexthop add id 102 group 61/62 fdb"
+ check_nexthop "id 102" "id 102 group 61/62 fdb"
+ log_test $? 0 "Fdb Nexthop group with multiple nexthops"
+
+ ## get nexthop group
+ run_cmd "$IP nexthop get id 102"
+ check_nexthop "id 102" "id 102 group 61/62 fdb"
+ log_test $? 0 "Get Fdb nexthop group by id"
+
+ # fdb nexthop group can only contain fdb nexthops
+ run_cmd "$IP nexthop add id 63 via 2001:db8:91::4"
+ run_cmd "$IP nexthop add id 64 via 2001:db8:91::5"
+ run_cmd "$IP nexthop add id 103 group 63/64 fdb"
+ log_test $? 2 "Fdb Nexthop group with non-fdb nexthops"
+
+ # Non fdb nexthop group can not contain fdb nexthops
+ run_cmd "$IP nexthop add id 65 via 2001:db8:91::5 fdb"
+ run_cmd "$IP nexthop add id 66 via 2001:db8:91::6 fdb"
+ run_cmd "$IP nexthop add id 104 group 65/66"
+ log_test $? 2 "Non-Fdb Nexthop group with fdb nexthops"
+
+ # fdb nexthop cannot have blackhole
+ run_cmd "$IP nexthop add id 67 blackhole fdb"
+ log_test $? 2 "Fdb Nexthop with blackhole"
+
+ # fdb nexthop with oif
+ run_cmd "$IP nexthop add id 68 via 2001:db8:91::7 dev veth1 fdb"
+ log_test $? 2 "Fdb Nexthop with oif"
+
+ # fdb nexthop with onlink
+ run_cmd "$IP nexthop add id 68 via 2001:db8:91::7 onlink fdb"
+ log_test $? 2 "Fdb Nexthop with onlink"
+
+ # fdb nexthop with encap
+ run_cmd "$IP nexthop add id 69 encap mpls 101 via 2001:db8:91::8 dev veth1 fdb"
+ log_test $? 2 "Fdb Nexthop with encap"
+
+ run_cmd "$IP link add name vx10 type vxlan id 1010 local 2001:db8:91::9 remote 2001:db8:91::10 dstport 4789 nolearning noudpcsum tos inherit ttl 100"
+ run_cmd "$BRIDGE fdb add 02:02:00:00:00:13 dev vx10 nhid 102 self"
+ log_test $? 0 "Fdb mac add with nexthop group"
+
+ ## fdb nexthops can only reference nexthop groups and not nexthops
+ run_cmd "$BRIDGE fdb add 02:02:00:00:00:14 dev vx10 nhid 61 self"
+ log_test $? 255 "Fdb mac add with nexthop"
+
+ run_cmd "$IP -6 ro add 2001:db8:101::1/128 nhid 66"
+ log_test $? 2 "Route add with fdb nexthop"
+
+ run_cmd "$IP -6 ro add 2001:db8:101::1/128 nhid 103"
+ log_test $? 2 "Route add with fdb nexthop group"
+
+ run_cmd "$IP nexthop del id 102"
+ log_test $? 0 "Fdb nexthop delete"
+
+ $IP link del dev vx10
+}
+
+ipv4_fdb_grp_fcnal()
+{
+ local rc
+
+ echo
+ echo "IPv4 fdb groups functional"
+ echo "--------------------------"
+
+ check_nexthop_fdb_support
+ if [ $? -eq $ksft_skip ]; then
+ return $ksft_skip
+ fi
+
+ # create group with multiple nexthops
+ run_cmd "$IP nexthop add id 12 via 172.16.1.2 fdb"
+ run_cmd "$IP nexthop add id 13 via 172.16.1.3 fdb"
+ run_cmd "$IP nexthop add id 102 group 12/13 fdb"
+ check_nexthop "id 102" "id 102 group 12/13 fdb"
+ log_test $? 0 "Fdb Nexthop group with multiple nexthops"
+
+ # get nexthop group
+ run_cmd "$IP nexthop get id 102"
+ check_nexthop "id 102" "id 102 group 12/13 fdb"
+ log_test $? 0 "Get Fdb nexthop group by id"
+
+ # fdb nexthop group can only contain fdb nexthops
+ run_cmd "$IP nexthop add id 14 via 172.16.1.2"
+ run_cmd "$IP nexthop add id 15 via 172.16.1.3"
+ run_cmd "$IP nexthop add id 103 group 14/15 fdb"
+ log_test $? 2 "Fdb Nexthop group with non-fdb nexthops"
+
+ # Non fdb nexthop group can not contain fdb nexthops
+ run_cmd "$IP nexthop add id 16 via 172.16.1.2 fdb"
+ run_cmd "$IP nexthop add id 17 via 172.16.1.3 fdb"
+ run_cmd "$IP nexthop add id 104 group 14/15"
+ log_test $? 2 "Non-Fdb Nexthop group with fdb nexthops"
+
+ # fdb nexthop cannot have blackhole
+ run_cmd "$IP nexthop add id 18 blackhole fdb"
+ log_test $? 2 "Fdb Nexthop with blackhole"
+
+ # fdb nexthop with oif
+ run_cmd "$IP nexthop add id 16 via 172.16.1.2 dev veth1 fdb"
+ log_test $? 2 "Fdb Nexthop with oif"
+
+ # fdb nexthop with onlink
+ run_cmd "$IP nexthop add id 16 via 172.16.1.2 onlink fdb"
+ log_test $? 2 "Fdb Nexthop with onlink"
+
+ # fdb nexthop with encap
+ run_cmd "$IP nexthop add id 17 encap mpls 101 via 172.16.1.2 dev veth1 fdb"
+ log_test $? 2 "Fdb Nexthop with encap"
+
+ run_cmd "$IP link add name vx10 type vxlan id 1010 local 10.0.0.1 remote 10.0.0.2 dstport 4789 nolearning noudpcsum tos inherit ttl 100"
+ run_cmd "$BRIDGE fdb add 02:02:00:00:00:13 dev vx10 nhid 102 self"
+ log_test $? 0 "Fdb mac add with nexthop group"
+
+ # fdb nexthops can only reference nexthop groups and not nexthops
+ run_cmd "$BRIDGE fdb add 02:02:00:00:00:14 dev vx10 nhid 12 self"
+ log_test $? 255 "Fdb mac add with nexthop"
+
+ run_cmd "$IP ro add 172.16.0.0/22 nhid 15"
+ log_test $? 2 "Route add with fdb nexthop"
+
+ run_cmd "$IP ro add 172.16.0.0/22 nhid 103"
+ log_test $? 2 "Route add with fdb nexthop group"
+
+ run_cmd "$IP nexthop del id 102"
+ log_test $? 0 "Fdb nexthop delete"
+
+ $IP link del dev vx10
+}
+
################################################################################
# basic operations (add, delete, replace) on nexthops and nexthop groups
#