Merge branch '100GbE' of git://git.kernel.org/pub/scm/linux/kernel/git/jkirsher/next...
author    David S. Miller <davem@davemloft.net>
          Sat, 23 May 2020 23:51:26 +0000 (16:51 -0700)
committer David S. Miller <davem@davemloft.net>
          Sat, 23 May 2020 23:51:26 +0000 (16:51 -0700)
Jeff Kirsher says:

====================
100GbE Intel Wired LAN Driver Updates 2020-05-22

This series contains updates to virtchnl and the ice driver.

Geert Uytterhoeven fixes a data structure alignment issue in the
virtchnl structures.

Henry adds Flow Director support, which allows receive traffic to be
redirected based on ntuple rules, over six patches.  Henry first adds
the initial infrastructure for Flow Director, then adds IPv4 and IPv6
support, as well as the ability to display the ntuple rules; a rough
userspace sketch follows below.
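As a rough sketch of how such ntuple rules are typically added and
displayed from userspace with ethtool (the interface name, port and
queue number below are placeholders, not values from this series):

    # enable ntuple filters, then steer TCP/IPv4 traffic with
    # destination port 80 to receive queue 4
    ethtool -K eth0 ntuple on
    ethtool -N eth0 flow-type tcp4 dst-port 80 action 4
    # display the currently programmed ntuple rules
    ethtool -n eth0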

Bret adds Accelerated Receive Flow Steering (aRFS) support, which is
used to steer receive flows to a specific queue (see the sketch below).
He also fixes a transmit timeout that occurs when the VF link
transitions up/down/up, because the transmit and receive queue
interrupts are not enabled as part of the VF's link up.  Finally, he
fixes an issue where, after the default VF LAN address is changed, the
PF attempts to add the new MAC following a reset, which fails because
it already exists; this left the VF completely disabled until it was
removed and re-enabled via sysfs.
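For context, a minimal sketch of how aRFS is usually exercised from
userspace, assuming the device supports ntuple filtering; the interface
name and table sizes are only examples:

    # aRFS builds on ntuple filters and the RPS flow tables
    ethtool -K eth0 ntuple on
    echo 32768 > /proc/sys/net/core/rps_sock_flow_entries
    echo 2048 > /sys/class/net/eth0/queues/rx-0/rps_flow_cnt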

Anirudh (Ani) fixes an issue where the ice driver needs to call
set_mac_cfg to enable jumbo frames, ensuring it gets called during
initialization and after reset.  He also fixes bad register reads
during an ethtool register dump by removing the offending registers.
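For reference, jumbo frames are requested from userspace by raising the
MTU, and the register dump in question is the one produced by ethtool;
the interface name here is a placeholder:

    ip link set dev eth0 mtu 9000   # needs the driver to have enabled jumbo frames (set_mac_cfg)
    ethtool -d eth0                 # register dump that previously read the bad registers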

Paul fixes an issue where the receive Malicious Driver Detection (MDD)
auto reset message was not being logged because it occurred after the VF
reset.

Victor adds a check for compatibility between the Dynamic Device
Personalization (DDP) package and the NIC firmware to ensure that
everything aligns.

Jesse fixes an administrative queue string call to use the appropriate
error reporting variable.  He also fixes loop variables that compare or
assign signed values against unsigned ones.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
213 files changed:
Documentation/bpf/bpf_devel_QA.rst
Documentation/devicetree/bindings/arm/mediatek/mediatek,pericfg.txt [deleted file]
Documentation/devicetree/bindings/arm/mediatek/mediatek,pericfg.yaml [new file with mode: 0644]
Documentation/devicetree/bindings/net/mediatek,eth-mac.yaml [new file with mode: 0644]
MAINTAINERS
arch/arm64/boot/dts/mediatek/mt8516.dtsi
arch/arm64/boot/dts/mediatek/pumpkin-common.dtsi
drivers/net/ethernet/amazon/ena/ena_admin_defs.h
drivers/net/ethernet/amazon/ena/ena_com.c
drivers/net/ethernet/amazon/ena/ena_com.h
drivers/net/ethernet/amazon/ena/ena_common_defs.h
drivers/net/ethernet/amazon/ena/ena_eth_com.c
drivers/net/ethernet/amazon/ena/ena_eth_com.h
drivers/net/ethernet/amazon/ena/ena_eth_io_defs.h
drivers/net/ethernet/amazon/ena/ena_ethtool.c
drivers/net/ethernet/amazon/ena/ena_netdev.c
drivers/net/ethernet/amazon/ena/ena_netdev.h
drivers/net/ethernet/amazon/ena/ena_regs_defs.h
drivers/net/ethernet/aquantia/atlantic/aq_ethtool.c
drivers/net/ethernet/aquantia/atlantic/aq_filters.c
drivers/net/ethernet/aquantia/atlantic/aq_hw.h
drivers/net/ethernet/aquantia/atlantic/aq_hw_utils.c
drivers/net/ethernet/aquantia/atlantic/aq_hw_utils.h
drivers/net/ethernet/aquantia/atlantic/aq_macsec.c
drivers/net/ethernet/aquantia/atlantic/aq_main.c
drivers/net/ethernet/aquantia/atlantic/aq_nic.c
drivers/net/ethernet/aquantia/atlantic/aq_nic.h
drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c
drivers/net/ethernet/aquantia/atlantic/aq_ptp.c
drivers/net/ethernet/aquantia/atlantic/aq_ring.c
drivers/net/ethernet/aquantia/atlantic/aq_vec.c
drivers/net/ethernet/aquantia/atlantic/aq_vec.h
drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_a0.c
drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c
drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.h
drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0_internal.h
drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_llh.c
drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_llh.h
drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_llh_internal.h
drivers/net/ethernet/aquantia/atlantic/hw_atl2/hw_atl2.c
drivers/net/ethernet/aquantia/atlantic/hw_atl2/hw_atl2_internal.h
drivers/net/ethernet/aquantia/atlantic/hw_atl2/hw_atl2_llh.c
drivers/net/ethernet/aquantia/atlantic/hw_atl2/hw_atl2_llh.h
drivers/net/ethernet/aquantia/atlantic/hw_atl2/hw_atl2_llh_internal.h
drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.c
drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h
drivers/net/ethernet/intel/e1000/e1000_main.c
drivers/net/ethernet/intel/e1000e/ich8lan.c
drivers/net/ethernet/intel/e1000e/netdev.c
drivers/net/ethernet/intel/i40e/i40e_main.c
drivers/net/ethernet/intel/i40e/i40e_txrx.c
drivers/net/ethernet/intel/i40e/i40e_txrx.h
drivers/net/ethernet/intel/i40e/i40e_txrx_common.h
drivers/net/ethernet/intel/i40e/i40e_type.h
drivers/net/ethernet/intel/i40e/i40e_xsk.c
drivers/net/ethernet/intel/i40e/i40e_xsk.h
drivers/net/ethernet/intel/ice/ice_base.c
drivers/net/ethernet/intel/ice/ice_txrx.h
drivers/net/ethernet/intel/ice/ice_xsk.c
drivers/net/ethernet/intel/ice/ice_xsk.h
drivers/net/ethernet/intel/igb/igb_ethtool.c
drivers/net/ethernet/intel/igc/igc.h
drivers/net/ethernet/intel/igc/igc_defines.h
drivers/net/ethernet/intel/igc/igc_ethtool.c
drivers/net/ethernet/intel/igc/igc_mac.c
drivers/net/ethernet/intel/igc/igc_main.c
drivers/net/ethernet/intel/igc/igc_regs.h
drivers/net/ethernet/intel/ixgbe/ixgbe.h
drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
drivers/net/ethernet/intel/ixgbe/ixgbe_txrx_common.h
drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c
drivers/net/ethernet/marvell/mvneta.c
drivers/net/ethernet/mediatek/Kconfig
drivers/net/ethernet/mediatek/Makefile
drivers/net/ethernet/mediatek/mtk_star_emac.c [new file with mode: 0644]
drivers/net/ethernet/mellanox/mlx5/core/Kconfig
drivers/net/ethernet/mellanox/mlx5/core/Makefile
drivers/net/ethernet/mellanox/mlx5/core/en.h
drivers/net/ethernet/mellanox/mlx5/core/en/params.c
drivers/net/ethernet/mellanox/mlx5/core/en/rep/neigh.c [new file with mode: 0644]
drivers/net/ethernet/mellanox/mlx5/core/en/rep/neigh.h [new file with mode: 0644]
drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.c [new file with mode: 0644]
drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.h [new file with mode: 0644]
drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c
drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.h
drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_mplsoudp.c [new file with mode: 0644]
drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h
drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c
drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h
drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c
drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.h
drivers/net/ethernet/mellanox/mlx5/core/en/xsk/umem.c
drivers/net/ethernet/mellanox/mlx5/core/en_main.c
drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
drivers/net/ethernet/mellanox/mlx5/core/en_rep.h
drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
drivers/net/ethernet/mellanox/mlx5/core/en_tc.h
drivers/net/ethernet/mellanox/mlx5/core/esw/chains.h
drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
drivers/net/ethernet/mellanox/mlx5/core/lib/port_tun.c
drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c
drivers/net/hyperv/netvsc_bpf.c
drivers/net/phy/at803x.c
drivers/net/phy/dp83869.c
drivers/net/vxlan.c
include/linux/bpf-cgroup.h
include/net/bareudp.h
include/net/flow_offload.h
include/net/ip6_fib.h
include/net/netns/nexthop.h
include/net/nexthop.h
include/net/switchdev.h
include/net/vxlan.h
include/net/xdp.h
include/net/xdp_sock.h
include/net/xdp_sock_drv.h [new file with mode: 0644]
include/net/xsk_buff_pool.h [new file with mode: 0644]
include/trace/events/xdp.h
include/uapi/linux/bpf.h
include/uapi/linux/neighbour.h
include/uapi/linux/nexthop.h
kernel/bpf/Makefile
kernel/bpf/syscall.c
kernel/bpf/verifier.c
kernel/bpf/xskmap.c [deleted file]
net/bpf/test_run.c
net/bridge/br_mrp.c
net/core/filter.c
net/core/flow_offload.c
net/core/neighbour.c
net/core/xdp.c
net/ethtool/channels.c
net/ethtool/ioctl.c
net/ipv4/af_inet.c
net/ipv4/nexthop.c
net/ipv6/af_inet6.c
net/ipv6/ip6_tunnel.c
net/ipv6/route.c
net/ipv6/tunnel6.c
net/mpls/af_mpls.c
net/psample/psample.c
net/xdp/Makefile
net/xdp/xdp_umem.c
net/xdp/xdp_umem.h
net/xdp/xsk.c
net/xdp/xsk.h
net/xdp/xsk_buff_pool.c [new file with mode: 0644]
net/xdp/xsk_diag.c
net/xdp/xsk_queue.c
net/xdp/xsk_queue.h
net/xdp/xskmap.c [new file with mode: 0644]
samples/bpf/.gitignore
samples/bpf/Makefile
samples/bpf/sampleip_kern.c
samples/bpf/sampleip_user.c
samples/bpf/sockex3_kern.c
samples/bpf/sockex3_user.c
samples/bpf/trace_common.h [new file with mode: 0644]
samples/bpf/trace_event_kern.c
samples/bpf/trace_event_user.c
samples/bpf/tracex1_user.c
samples/bpf/tracex2_kern.c
samples/bpf/tracex2_user.c
samples/bpf/tracex3_kern.c
samples/bpf/tracex3_user.c
samples/bpf/tracex4_kern.c
samples/bpf/tracex4_user.c
samples/bpf/tracex5_kern.c
samples/bpf/tracex5_user.c
samples/bpf/tracex6_kern.c
samples/bpf/tracex6_user.c
samples/bpf/tracex7_user.c
samples/bpf/xdp_redirect_cpu_user.c
tools/bpf/bpftool/Documentation/bpftool-cgroup.rst
tools/bpf/bpftool/Documentation/bpftool-prog.rst
tools/bpf/bpftool/bash-completion/bpftool
tools/bpf/bpftool/cgroup.c
tools/bpf/bpftool/main.h
tools/bpf/bpftool/prog.c
tools/include/uapi/linux/bpf.h
tools/lib/bpf/hashmap.c
tools/lib/bpf/hashmap.h
tools/lib/bpf/libbpf.c
tools/testing/selftests/bpf/README.rst
tools/testing/selftests/bpf/config
tools/testing/selftests/bpf/network_helpers.c
tools/testing/selftests/bpf/network_helpers.h
tools/testing/selftests/bpf/prog_tests/align.c [new file with mode: 0644]
tools/testing/selftests/bpf/prog_tests/connect_force_port.c
tools/testing/selftests/bpf/progs/bpf_iter_bpf_map.c
tools/testing/selftests/bpf/progs/bpf_iter_ipv6_route.c
tools/testing/selftests/bpf/progs/bpf_iter_netlink.c
tools/testing/selftests/bpf/progs/bpf_iter_task.c
tools/testing/selftests/bpf/progs/bpf_iter_task_file.c
tools/testing/selftests/bpf/progs/bpf_iter_test_kern3.c
tools/testing/selftests/bpf/progs/bpf_iter_test_kern4.c
tools/testing/selftests/bpf/progs/bpf_iter_test_kern_common.h
tools/testing/selftests/bpf/progs/connect_force_port4.c
tools/testing/selftests/bpf/progs/connect_force_port6.c
tools/testing/selftests/bpf/progs/test_sk_lookup_kern.c
tools/testing/selftests/bpf/progs/test_sockmap_kern.h [new file with mode: 0644]
tools/testing/selftests/bpf/test_align.c [deleted file]
tools/testing/selftests/bpf/test_sockmap.c
tools/testing/selftests/bpf/test_sockmap_kern.h [deleted file]
tools/testing/selftests/bpf/verifier/ref_tracking.c
tools/testing/selftests/bpf/verifier/value_or_null.c
tools/testing/selftests/net/fib_nexthops.sh

index 38c15c6fcb144b8d3b1a6ba33437a4c7a909ca84..0b3db91dc10029f25e3b7c74f1b471899ee4310e 100644 (file)
@@ -437,6 +437,21 @@ needed::
 See the kernels selftest `Documentation/dev-tools/kselftest.rst`_
 document for further documentation.
 
+To maximize the number of tests passing, the .config of the kernel
+under test should match the config file fragment in
+tools/testing/selftests/bpf as closely as possible.
+
+Finally to ensure support for latest BPF Type Format features -
+discussed in `Documentation/bpf/btf.rst`_ - pahole version 1.16
+is required for kernels built with CONFIG_DEBUG_INFO_BTF=y.
+pahole is delivered in the dwarves package or can be built
+from source at
+
+https://github.com/acmel/dwarves
+
+Some distros have pahole version 1.16 packaged already, e.g.
+Fedora, Gentoo.
+
 Q: Which BPF kernel selftests version should I run my kernel against?
 ---------------------------------------------------------------------
 A: If you run a kernel ``xyz``, then always run the BPF kernel selftests
diff --git a/Documentation/devicetree/bindings/arm/mediatek/mediatek,pericfg.txt b/Documentation/devicetree/bindings/arm/mediatek/mediatek,pericfg.txt
deleted file mode 100644 (file)
index ecf027a..0000000
+++ /dev/null
@@ -1,36 +0,0 @@
-Mediatek pericfg controller
-===========================
-
-The Mediatek pericfg controller provides various clocks and reset
-outputs to the system.
-
-Required Properties:
-
-- compatible: Should be one of:
-       - "mediatek,mt2701-pericfg", "syscon"
-       - "mediatek,mt2712-pericfg", "syscon"
-       - "mediatek,mt7622-pericfg", "syscon"
-       - "mediatek,mt7623-pericfg", "mediatek,mt2701-pericfg", "syscon"
-       - "mediatek,mt7629-pericfg", "syscon"
-       - "mediatek,mt8135-pericfg", "syscon"
-       - "mediatek,mt8173-pericfg", "syscon"
-       - "mediatek,mt8183-pericfg", "syscon"
-- #clock-cells: Must be 1
-- #reset-cells: Must be 1
-
-The pericfg controller uses the common clk binding from
-Documentation/devicetree/bindings/clock/clock-bindings.txt
-The available clocks are defined in dt-bindings/clock/mt*-clk.h.
-Also it uses the common reset controller binding from
-Documentation/devicetree/bindings/reset/reset.txt.
-The available reset outputs are defined in
-dt-bindings/reset/mt*-resets.h
-
-Example:
-
-pericfg: power-controller@10003000 {
-       compatible = "mediatek,mt8173-pericfg", "syscon";
-       reg = <0 0x10003000 0 0x1000>;
-       #clock-cells = <1>;
-       #reset-cells = <1>;
-};
diff --git a/Documentation/devicetree/bindings/arm/mediatek/mediatek,pericfg.yaml b/Documentation/devicetree/bindings/arm/mediatek/mediatek,pericfg.yaml
new file mode 100644 (file)
index 0000000..55209a2
--- /dev/null
@@ -0,0 +1,64 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: "http://devicetree.org/schemas/arm/mediatek/mediatek,pericfg.yaml#"
+$schema: "http://devicetree.org/meta-schemas/core.yaml#"
+
+title: MediaTek Peripheral Configuration Controller
+
+maintainers:
+  - Bartosz Golaszewski <bgolaszewski@baylibre.com>
+
+description:
+  The Mediatek pericfg controller provides various clocks and reset outputs
+  to the system.
+
+properties:
+  compatible:
+    oneOf:
+      - items:
+        - enum:
+          - mediatek,mt2701-pericfg
+          - mediatek,mt2712-pericfg
+          - mediatek,mt7622-pericfg
+          - mediatek,mt7629-pericfg
+          - mediatek,mt8135-pericfg
+          - mediatek,mt8173-pericfg
+          - mediatek,mt8183-pericfg
+          - mediatek,mt8516-pericfg
+        - const: syscon
+      - items:
+        # Special case for mt7623 for backward compatibility
+        - const: mediatek,mt7623-pericfg
+        - const: mediatek,mt2701-pericfg
+        - const: syscon
+
+  reg:
+    maxItems: 1
+
+  '#clock-cells':
+    const: 1
+
+  '#reset-cells':
+    const: 1
+
+required:
+  - compatible
+  - reg
+
+examples:
+  - |
+    pericfg@10003000 {
+        compatible = "mediatek,mt8173-pericfg", "syscon";
+        reg = <0x10003000 0x1000>;
+        #clock-cells = <1>;
+        #reset-cells = <1>;
+    };
+
+  - |
+    pericfg@10003000 {
+        compatible =  "mediatek,mt7623-pericfg", "mediatek,mt2701-pericfg", "syscon";
+        reg = <0x10003000 0x1000>;
+        #clock-cells = <1>;
+        #reset-cells = <1>;
+    };
diff --git a/Documentation/devicetree/bindings/net/mediatek,eth-mac.yaml b/Documentation/devicetree/bindings/net/mediatek,eth-mac.yaml
new file mode 100644 (file)
index 0000000..f85d91a
--- /dev/null
@@ -0,0 +1,89 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/net/mediatek,eth-mac.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: MediaTek STAR Ethernet MAC Controller
+
+maintainers:
+  - Bartosz Golaszewski <bgolaszewski@baylibre.com>
+
+description:
+  This Ethernet MAC is used on the MT8* family of SoCs from MediaTek.
+  It's compliant with 802.3 standards and supports half- and full-duplex
+  modes with flow-control as well as CRC offloading and VLAN tags.
+
+allOf:
+  - $ref: "ethernet-controller.yaml#"
+
+properties:
+  compatible:
+    enum:
+      - mediatek,mt8516-eth
+      - mediatek,mt8518-eth
+      - mediatek,mt8175-eth
+
+  reg:
+    maxItems: 1
+
+  interrupts:
+    maxItems: 1
+
+  clocks:
+    minItems: 3
+    maxItems: 3
+
+  clock-names:
+    additionalItems: false
+    items:
+      - const: core
+      - const: reg
+      - const: trans
+
+  mediatek,pericfg:
+    $ref: /schemas/types.yaml#definitions/phandle
+    description:
+      Phandle to the device containing the PERICFG register range. This is used
+      to control the MII mode.
+
+  mdio:
+    type: object
+    description:
+      Creates and registers an MDIO bus.
+
+required:
+  - compatible
+  - reg
+  - interrupts
+  - clocks
+  - clock-names
+  - mediatek,pericfg
+  - phy-handle
+
+examples:
+  - |
+    #include <dt-bindings/interrupt-controller/arm-gic.h>
+    #include <dt-bindings/clock/mt8516-clk.h>
+
+    ethernet: ethernet@11180000 {
+        compatible = "mediatek,mt8516-eth";
+        reg = <0x11180000 0x1000>;
+        mediatek,pericfg = <&pericfg>;
+        interrupts = <GIC_SPI 111 IRQ_TYPE_LEVEL_LOW>;
+        clocks = <&topckgen CLK_TOP_RG_ETH>,
+                 <&topckgen CLK_TOP_66M_ETH>,
+                 <&topckgen CLK_TOP_133M_ETH>;
+        clock-names = "core", "reg", "trans";
+        phy-handle = <&eth_phy>;
+        phy-mode = "rmii";
+
+        mdio {
+            #address-cells = <1>;
+            #size-cells = <0>;
+
+            eth_phy: ethernet-phy@0 {
+                reg = <0>;
+            };
+        };
+    };
index b7844f6cfa4aa04f84020130b67a6e40ea1436ae..087e68b21f9f714bff2cad515613666039e768d4 100644 (file)
@@ -18443,8 +18443,12 @@ R:     Jonathan Lemon <jonathan.lemon@gmail.com>
 L:     netdev@vger.kernel.org
 L:     bpf@vger.kernel.org
 S:     Maintained
-F:     kernel/bpf/xskmap.c
+F:     include/net/xdp_sock*
+F:     include/net/xsk_buffer_pool.h
+F:     include/uapi/linux/if_xdp.h
 F:     net/xdp/
+F:     samples/bpf/xdpsock*
+F:     tools/lib/bpf/xsk*
 
 XEN BLOCK SUBSYSTEM
 M:     Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
index 2f8adf042195bfdabfdbb5c7f609ab5a5c44deab..89af661e7f6314b412cf457fe7e2ef4c70ea3475 100644 (file)
                        #clock-cells = <1>;
                };
 
+               pericfg: pericfg@10003050 {
+                       compatible = "mediatek,mt8516-pericfg", "syscon";
+                       reg = <0 0x10003050 0 0x1000>;
+               };
+
                apmixedsys: apmixedsys@10018000 {
                        compatible = "mediatek,mt8516-apmixedsys", "syscon";
                        reg = <0 0x10018000 0 0x710>;
                        status = "disabled";
                };
 
+               ethernet: ethernet@11180000 {
+                       compatible = "mediatek,mt8516-eth";
+                       reg = <0 0x11180000 0 0x1000>;
+                       mediatek,pericfg = <&pericfg>;
+                       interrupts = <GIC_SPI 111 IRQ_TYPE_LEVEL_LOW>;
+                       clocks = <&topckgen CLK_TOP_RG_ETH>,
+                                <&topckgen CLK_TOP_66M_ETH>,
+                                <&topckgen CLK_TOP_133M_ETH>;
+                       clock-names = "core", "reg", "trans";
+                       status = "disabled";
+               };
+
                rng: rng@1020c000 {
                        compatible = "mediatek,mt8516-rng",
                                     "mediatek,mt7623-rng";
index a31093d7142bca128f66152e9e8e58953f0b936e..dfceffe6950a4ca8888c61c3cc76b25d9503da44 100644 (file)
@@ -9,6 +9,7 @@
 / {
        aliases {
                serial0 = &uart0;
+               ethernet0 = &ethernet;
        };
 
        chosen {
        status = "okay";
 };
 
+&ethernet {
+       pinctrl-names = "default";
+       pinctrl-0 = <&ethernet_pins_default>;
+       phy-handle = <&eth_phy>;
+       phy-mode = "rmii";
+       mac-address = [00 00 00 00 00 00];
+       status = "okay";
+
+       mdio {
+               #address-cells = <1>;
+               #size-cells = <0>;
+
+               eth_phy: ethernet-phy@0 {
+                       reg = <0>;
+               };
+       };
+};
+
 &usb0 {
        status = "okay";
        dr_mode = "peripheral";
                        bias-pull-up;
                };
        };
+
+       ethernet_pins_default: ethernet {
+               pins_ethernet {
+                       pinmux = <MT8516_PIN_0_EINT0__FUNC_EXT_TXD0>,
+                                <MT8516_PIN_1_EINT1__FUNC_EXT_TXD1>,
+                                <MT8516_PIN_5_EINT5__FUNC_EXT_RXER>,
+                                <MT8516_PIN_6_EINT6__FUNC_EXT_RXC>,
+                                <MT8516_PIN_7_EINT7__FUNC_EXT_RXDV>,
+                                <MT8516_PIN_8_EINT8__FUNC_EXT_RXD0>,
+                                <MT8516_PIN_9_EINT9__FUNC_EXT_RXD1>,
+                                <MT8516_PIN_12_EINT12__FUNC_EXT_TXEN>,
+                                <MT8516_PIN_38_MRG_DI__FUNC_EXT_MDIO>,
+                                <MT8516_PIN_39_MRG_DO__FUNC_EXT_MDC>;
+               };
+       };
 };
index 7be3dcbf3d16b25abc9867457517e7e27b8ff8fa..336742f6e3c35e0711e4010d0eaa82b4978899e9 100644 (file)
@@ -768,8 +768,8 @@ enum ena_admin_os_type {
        ENA_ADMIN_OS_DPDK                           = 3,
        ENA_ADMIN_OS_FREEBSD                        = 4,
        ENA_ADMIN_OS_IPXE                           = 5,
-       ENA_ADMIN_OS_ESXI                           = 6,
-       ENA_ADMIN_OS_GROUPS_NUM                     = 6,
+       ENA_ADMIN_OS_ESXI                           = 6,
+       ENA_ADMIN_OS_GROUPS_NUM                     = 6,
 };
 
 struct ena_admin_host_info {
@@ -813,7 +813,8 @@ struct ena_admin_host_info {
 
        u16 reserved;
 
-       /* 1 :0 : reserved
+       /* 0 : reserved
+        * 1 : rx_offset
         * 2 : interrupt_moderation
         * 31:3 : reserved
         */
@@ -1124,6 +1125,8 @@ struct ena_admin_ena_mmio_req_read_less_resp {
 #define ENA_ADMIN_HOST_INFO_DEVICE_MASK                     GENMASK(7, 3)
 #define ENA_ADMIN_HOST_INFO_BUS_SHIFT                       8
 #define ENA_ADMIN_HOST_INFO_BUS_MASK                        GENMASK(15, 8)
+#define ENA_ADMIN_HOST_INFO_RX_OFFSET_SHIFT                 1
+#define ENA_ADMIN_HOST_INFO_RX_OFFSET_MASK                  BIT(1)
 #define ENA_ADMIN_HOST_INFO_INTERRUPT_MODERATION_SHIFT      2
 #define ENA_ADMIN_HOST_INFO_INTERRUPT_MODERATION_MASK       BIT(2)
 
@@ -1133,4 +1136,4 @@ struct ena_admin_ena_mmio_req_read_less_resp {
 /* aenq_link_change_desc */
 #define ENA_ADMIN_AENQ_LINK_CHANGE_DESC_LINK_STATUS_MASK    BIT(0)
 
-#endif /*_ENA_ADMIN_H_ */
+#endif /* _ENA_ADMIN_H_ */
index b51bf62af11bd3facee9c125c8ab3a7df58217a8..432f143559a1240d4ad942de831d7e4520629b4f 100644 (file)
@@ -62,7 +62,9 @@
 
 #define ENA_REGS_ADMIN_INTR_MASK 1
 
-#define ENA_POLL_MS    5
+#define ENA_MIN_ADMIN_POLL_US 100
+
+#define ENA_MAX_ADMIN_POLL_US 5000
 
 /*****************************************************************************/
 /*****************************************************************************/
@@ -200,17 +202,17 @@ static void comp_ctxt_release(struct ena_com_admin_queue *queue,
 static struct ena_comp_ctx *get_comp_ctxt(struct ena_com_admin_queue *queue,
                                          u16 command_id, bool capture)
 {
-       if (unlikely(!queue->comp_ctx)) {
-               pr_err("Completion context is NULL\n");
-               return NULL;
-       }
-
        if (unlikely(command_id >= queue->q_depth)) {
                pr_err("command id is larger than the queue size. cmd_id: %u queue size %d\n",
                       command_id, queue->q_depth);
                return NULL;
        }
 
+       if (unlikely(!queue->comp_ctx)) {
+               pr_err("Completion context is NULL\n");
+               return NULL;
+       }
+
        if (unlikely(queue->comp_ctx[command_id].occupied && capture)) {
                pr_err("Completion context is occupied\n");
                return NULL;
@@ -375,7 +377,7 @@ static int ena_com_init_io_sq(struct ena_com_dev *ena_dev,
                io_sq->bounce_buf_ctrl.next_to_use = 0;
 
                size = io_sq->bounce_buf_ctrl.buffer_size *
-                        io_sq->bounce_buf_ctrl.buffers_num;
+                       io_sq->bounce_buf_ctrl.buffers_num;
 
                dev_node = dev_to_node(ena_dev->dmadev);
                set_dev_node(ena_dev->dmadev, ctx->numa_node);
@@ -523,9 +525,6 @@ static int ena_com_comp_status_to_errno(u8 comp_status)
        if (unlikely(comp_status != 0))
                pr_err("admin command failed[%u]\n", comp_status);
 
-       if (unlikely(comp_status > ENA_ADMIN_UNKNOWN_ERROR))
-               return -EINVAL;
-
        switch (comp_status) {
        case ENA_ADMIN_SUCCESS:
                return 0;
@@ -540,7 +539,14 @@ static int ena_com_comp_status_to_errno(u8 comp_status)
                return -EINVAL;
        }
 
-       return 0;
+       return -EINVAL;
+}
+
+static void ena_delay_exponential_backoff_us(u32 exp, u32 delay_us)
+{
+       delay_us = max_t(u32, ENA_MIN_ADMIN_POLL_US, delay_us);
+       delay_us = min_t(u32, delay_us * (1U << exp), ENA_MAX_ADMIN_POLL_US);
+       usleep_range(delay_us, 2 * delay_us);
 }
 
 static int ena_com_wait_and_process_admin_cq_polling(struct ena_comp_ctx *comp_ctx,
@@ -549,6 +555,7 @@ static int ena_com_wait_and_process_admin_cq_polling(struct ena_comp_ctx *comp_c
        unsigned long flags = 0;
        unsigned long timeout;
        int ret;
+       u32 exp = 0;
 
        timeout = jiffies + usecs_to_jiffies(admin_queue->completion_timeout);
 
@@ -572,7 +579,8 @@ static int ena_com_wait_and_process_admin_cq_polling(struct ena_comp_ctx *comp_c
                        goto err;
                }
 
-               msleep(ENA_POLL_MS);
+               ena_delay_exponential_backoff_us(exp++,
+                                                admin_queue->ena_dev->ena_min_poll_delay_us);
        }
 
        if (unlikely(comp_ctx->status == ENA_CMD_ABORTED)) {
@@ -702,8 +710,7 @@ static int ena_com_config_llq_info(struct ena_com_dev *ena_dev,
                /* The desc list entry size should be whole multiply of 8
                 * This requirement comes from __iowrite64_copy()
                 */
-               pr_err("illegal entry size %d\n",
-                      llq_info->desc_list_entry_size);
+               pr_err("illegal entry size %d\n", llq_info->desc_list_entry_size);
                return -EINVAL;
        }
 
@@ -775,7 +782,7 @@ static int ena_com_wait_and_process_admin_cq_interrupts(struct ena_comp_ctx *com
                        if (admin_queue->auto_polling)
                                admin_queue->polling = true;
                } else {
-                       pr_err("The ena device doesn't send a completion for the admin cmd %d status %d\n",
+                       pr_err("The ena device didn't send a completion for the admin cmd %d status %d\n",
                               comp_ctx->cmd_opcode, comp_ctx->status);
                }
                /* Check if shifted to polling mode.
@@ -943,12 +950,13 @@ static void ena_com_io_queue_free(struct ena_com_dev *ena_dev,
 static int wait_for_reset_state(struct ena_com_dev *ena_dev, u32 timeout,
                                u16 exp_state)
 {
-       u32 val, i;
+       u32 val, exp = 0;
+       unsigned long timeout_stamp;
 
-       /* Convert timeout from resolution of 100ms to ENA_POLL_MS */
-       timeout = (timeout * 100) / ENA_POLL_MS;
+       /* Convert timeout from resolution of 100ms to us resolution. */
+       timeout_stamp = jiffies + usecs_to_jiffies(100 * 1000 * timeout);
 
-       for (i = 0; i < timeout; i++) {
+       while (1) {
                val = ena_com_reg_bar_read32(ena_dev, ENA_REGS_DEV_STS_OFF);
 
                if (unlikely(val == ENA_MMIO_READ_TIMEOUT)) {
@@ -960,10 +968,11 @@ static int wait_for_reset_state(struct ena_com_dev *ena_dev, u32 timeout,
                        exp_state)
                        return 0;
 
-               msleep(ENA_POLL_MS);
-       }
+               if (time_is_before_jiffies(timeout_stamp))
+                       return -ETIME;
 
-       return -ETIME;
+               ena_delay_exponential_backoff_us(exp++, ena_dev->ena_min_poll_delay_us);
+       }
 }
 
 static bool ena_com_check_supported_feature_id(struct ena_com_dev *ena_dev,
@@ -1284,13 +1293,9 @@ static int ena_com_ind_tbl_convert_to_device(struct ena_com_dev *ena_dev)
 static void ena_com_update_intr_delay_resolution(struct ena_com_dev *ena_dev,
                                                 u16 intr_delay_resolution)
 {
-       /* Initial value of intr_delay_resolution might be 0 */
-       u16 prev_intr_delay_resolution =
-               ena_dev->intr_delay_resolution ?
-               ena_dev->intr_delay_resolution :
-               ENA_DEFAULT_INTR_DELAY_RESOLUTION;
+       u16 prev_intr_delay_resolution = ena_dev->intr_delay_resolution;
 
-       if (!intr_delay_resolution) {
+       if (unlikely(!intr_delay_resolution)) {
                pr_err("Illegal intr_delay_resolution provided. Going to use default 1 usec resolution\n");
                intr_delay_resolution = ENA_DEFAULT_INTR_DELAY_RESOLUTION;
        }
@@ -1444,11 +1449,13 @@ void ena_com_wait_for_abort_completion(struct ena_com_dev *ena_dev)
 {
        struct ena_com_admin_queue *admin_queue = &ena_dev->admin_queue;
        unsigned long flags = 0;
+       u32 exp = 0;
 
        spin_lock_irqsave(&admin_queue->q_lock, flags);
        while (atomic_read(&admin_queue->outstanding_cmds) != 0) {
                spin_unlock_irqrestore(&admin_queue->q_lock, flags);
-               msleep(ENA_POLL_MS);
+               ena_delay_exponential_backoff_us(exp++,
+                                                ena_dev->ena_min_poll_delay_us);
                spin_lock_irqsave(&admin_queue->q_lock, flags);
        }
        spin_unlock_irqrestore(&admin_queue->q_lock, flags);
@@ -1796,6 +1803,7 @@ int ena_com_admin_init(struct ena_com_dev *ena_dev,
        if (ret)
                goto error;
 
+       admin_queue->ena_dev = ena_dev;
        admin_queue->running_state = true;
 
        return 0;
@@ -2003,7 +2011,7 @@ void ena_com_aenq_intr_handler(struct ena_com_dev *dev, void *data)
        struct ena_admin_aenq_entry *aenq_e;
        struct ena_admin_aenq_common_desc *aenq_common;
        struct ena_com_aenq *aenq  = &dev->aenq;
-       unsigned long long timestamp;
+       u64 timestamp;
        ena_aenq_handler handler_cb;
        u16 masked_head, processed = 0;
        u8 phase;
@@ -2021,9 +2029,8 @@ void ena_com_aenq_intr_handler(struct ena_com_dev *dev, void *data)
                 */
                dma_rmb();
 
-               timestamp =
-                       (unsigned long long)aenq_common->timestamp_low |
-                       ((unsigned long long)aenq_common->timestamp_high << 32);
+               timestamp = (u64)aenq_common->timestamp_low |
+                           ((u64)aenq_common->timestamp_high << 32);
                pr_debug("AENQ! Group[%x] Syndrom[%x] timestamp: [%llus]\n",
                         aenq_common->group, aenq_common->syndrom, timestamp);
 
@@ -2053,8 +2060,7 @@ void ena_com_aenq_intr_handler(struct ena_com_dev *dev, void *data)
 
        /* write the aenq doorbell after all AENQ descriptors were read */
        mb();
-       writel_relaxed((u32)aenq->head,
-                      dev->reg_bar + ENA_REGS_AENQ_HEAD_DB_OFF);
+       writel_relaxed((u32)aenq->head, dev->reg_bar + ENA_REGS_AENQ_HEAD_DB_OFF);
 }
 
 int ena_com_dev_reset(struct ena_com_dev *ena_dev,
@@ -2276,13 +2282,14 @@ int ena_com_fill_hash_function(struct ena_com_dev *ena_dev,
                               enum ena_admin_hash_functions func,
                               const u8 *key, u16 key_len, u32 init_val)
 {
-       struct ena_rss *rss = &ena_dev->rss;
+       struct ena_admin_feature_rss_flow_hash_control *hash_key;
        struct ena_admin_get_feat_resp get_resp;
-       struct ena_admin_feature_rss_flow_hash_control *hash_key =
-               rss->hash_key;
        enum ena_admin_hash_functions old_func;
+       struct ena_rss *rss = &ena_dev->rss;
        int rc;
 
+       hash_key = rss->hash_key;
+
        /* Make sure size is a mult of DWs */
        if (unlikely(key_len & 0x3))
                return -EINVAL;
@@ -2294,7 +2301,7 @@ int ena_com_fill_hash_function(struct ena_com_dev *ena_dev,
        if (unlikely(rc))
                return rc;
 
-       if (!((1 << func) & get_resp.u.flow_hash_func.supported_func)) {
+       if (!(BIT(func) & get_resp.u.flow_hash_func.supported_func)) {
                pr_err("Flow hash function %d isn't supported\n", func);
                return -EOPNOTSUPP;
        }
index 13a1b7812c46d4706336fd8463b802e92dc834fe..bc187adf54e4b7124e98941092b2315aaf7f4bb1 100644 (file)
@@ -77,6 +77,8 @@
 #define ENA_INTR_INITIAL_RX_INTERVAL_USECS 0
 #define ENA_DEFAULT_INTR_DELAY_RESOLUTION 1
 
+#define ENA_HASH_KEY_SIZE 40
+
 #define ENA_HW_HINTS_NO_TIMEOUT        0xFFFF
 
 #define ENA_FEATURE_MAX_QUEUE_EXT_VER 1
@@ -237,6 +239,7 @@ struct ena_com_stats_admin {
 
 struct ena_com_admin_queue {
        void *q_dmadev;
+       struct ena_com_dev *ena_dev;
        spinlock_t q_lock; /* spinlock for the admin queue */
 
        struct ena_comp_ctx *comp_ctx;
@@ -349,6 +352,8 @@ struct ena_com_dev {
        struct ena_intr_moder_entry *intr_moder_tbl;
 
        struct ena_com_llq_info llq_info;
+
+       u32 ena_min_poll_delay_us;
 };
 
 struct ena_com_dev_get_features_ctx {
@@ -393,7 +398,7 @@ struct ena_aenq_handlers {
  */
 int ena_com_mmio_reg_read_request_init(struct ena_com_dev *ena_dev);
 
-/* ena_com_set_mmio_read_mode - Enable/disable the mmio reg read mechanism
+/* ena_com_set_mmio_read_mode - Enable/disable the indirect mmio reg read mechanism
  * @ena_dev: ENA communication layer struct
  * @readless_supported: readless mode (enable/disable)
  */
@@ -515,7 +520,7 @@ void ena_com_set_admin_auto_polling_mode(struct ena_com_dev *ena_dev,
 /* ena_com_admin_q_comp_intr_handler - admin queue interrupt handler
  * @ena_dev: ENA communication layer struct
  *
- * This method go over the admin completion queue and wake up all the pending
+ * This method goes over the admin completion queue and wakes up all the pending
  * threads that wait on the commands wait event.
  *
  * @note: Should be called after MSI-X interrupt.
@@ -525,7 +530,7 @@ void ena_com_admin_q_comp_intr_handler(struct ena_com_dev *ena_dev);
 /* ena_com_aenq_intr_handler - AENQ interrupt handler
  * @ena_dev: ENA communication layer struct
  *
- * This method go over the async event notification queue and call the proper
+ * This method goes over the async event notification queue and calls the proper
  * aenq handler.
  */
 void ena_com_aenq_intr_handler(struct ena_com_dev *dev, void *data);
@@ -542,14 +547,14 @@ void ena_com_abort_admin_commands(struct ena_com_dev *ena_dev);
 /* ena_com_wait_for_abort_completion - Wait for admin commands abort.
  * @ena_dev: ENA communication layer struct
  *
- * This method wait until all the outstanding admin commands will be completed.
+ * This method waits until all the outstanding admin commands are completed.
  */
 void ena_com_wait_for_abort_completion(struct ena_com_dev *ena_dev);
 
 /* ena_com_validate_version - Validate the device parameters
  * @ena_dev: ENA communication layer struct
  *
- * This method validate the device parameters are the same as the saved
+ * This method verifies the device parameters are the same as the saved
  * parameters in ena_dev.
  * This method is useful after device reset, to validate the device mac address
  * and the device offloads are the same as before the reset.
@@ -689,7 +694,7 @@ int ena_com_set_hash_function(struct ena_com_dev *ena_dev);
  *
  * Retrieve the hash function from the device.
  *
- * @note: If the caller called ena_com_fill_hash_function but didn't flash
+ * @note: If the caller called ena_com_fill_hash_function but didn't flush
  * it to the device, the new configuration will be lost.
  *
  * @return: 0 on Success and negative value otherwise.
@@ -703,7 +708,7 @@ int ena_com_get_hash_function(struct ena_com_dev *ena_dev,
  *
  * Retrieve the hash key.
  *
- * @note: If the caller called ena_com_fill_hash_key but didn't flash
+ * @note: If the caller called ena_com_fill_hash_key but didn't flush
  * it to the device, the new configuration will be lost.
  *
  * @return: 0 on Success and negative value otherwise.
@@ -743,7 +748,7 @@ int ena_com_set_hash_ctrl(struct ena_com_dev *ena_dev);
  *
  * Retrieve the hash control from the device.
  *
- * @note, If the caller called ena_com_fill_hash_ctrl but didn't flash
+ * @note: If the caller called ena_com_fill_hash_ctrl but didn't flush
  * it to the device, the new configuration will be lost.
  *
  * @return: 0 on Success and negative value otherwise.
@@ -795,7 +800,7 @@ int ena_com_indirect_table_set(struct ena_com_dev *ena_dev);
  *
  * Retrieve the RSS indirection table from the device.
  *
- * @note: If the caller called ena_com_indirect_table_fill_entry but didn't flash
+ * @note: If the caller called ena_com_indirect_table_fill_entry but didn't flush
  * it to the device, the new configuration will be lost.
  *
  * @return: 0 on Success and negative value otherwise.
@@ -821,14 +826,14 @@ int ena_com_allocate_debug_area(struct ena_com_dev *ena_dev,
 /* ena_com_delete_debug_area - Free the debug area resources.
  * @ena_dev: ENA communication layer struct
  *
- * Free the allocate debug area.
+ * Free the allocated debug area.
  */
 void ena_com_delete_debug_area(struct ena_com_dev *ena_dev);
 
 /* ena_com_delete_host_info - Free the host info resources.
  * @ena_dev: ENA communication layer struct
  *
- * Free the allocate host info.
+ * Free the allocated host info.
  */
 void ena_com_delete_host_info(struct ena_com_dev *ena_dev);
 
@@ -869,9 +874,9 @@ int ena_com_destroy_io_cq(struct ena_com_dev *ena_dev,
  * @cmd_completion: command completion return value.
  * @cmd_comp_size: command completion size.
 
- * Submit an admin command and then wait until the device will return a
+ * Submit an admin command and then wait until the device returns a
  * completion.
- * The completion will be copyed into cmd_comp.
+ * The completion will be copied into cmd_comp.
  *
  * @return - 0 on success, negative value on failure.
  */
@@ -934,7 +939,7 @@ unsigned int ena_com_get_nonadaptive_moderation_interval_rx(struct ena_com_dev *
 /* ena_com_config_dev_mode - Configure the placement policy of the device.
  * @ena_dev: ENA communication layer struct
  * @llq_features: LLQ feature descriptor, retrieve via
- *                ena_com_get_dev_attr_feat.
+ *                ena_com_get_dev_attr_feat.
  * @ena_llq_config: The default driver LLQ parameters configurations
  */
 int ena_com_config_dev_mode(struct ena_com_dev *ena_dev,
@@ -960,7 +965,7 @@ static inline void ena_com_disable_adaptive_moderation(struct ena_com_dev *ena_d
  * @intr_reg: interrupt register to update.
  * @rx_delay_interval: Rx interval in usecs
  * @tx_delay_interval: Tx interval in usecs
- * @unmask: unask enable/disable
+ * @unmask: unmask enable/disable
  *
  * Prepare interrupt update register with the supplied parameters.
  */
index 23beb7e7ed7b3bca1084858df14eee6953a99c71..8a8ded0de9ac897e2927bd180494b5e65b46bc91 100644 (file)
@@ -45,4 +45,4 @@ struct ena_common_mem_addr {
        u16 reserved16;
 };
 
-#endif /*_ENA_COMMON_H_ */
+#endif /* _ENA_COMMON_H_ */
index 2845ac2777246e0dc8d9df26695ac4414d3923a8..ec8ea25e988de48f3615df9e07cac2ed50eaf279 100644 (file)
@@ -519,7 +519,7 @@ int ena_com_rx_pkt(struct ena_com_io_cq *io_cq,
        struct ena_eth_io_rx_cdesc_base *cdesc = NULL;
        u16 cdesc_idx = 0;
        u16 nb_hw_desc;
-       u16 i;
+       u16 i = 0;
 
        WARN(io_cq->direction != ENA_COM_IO_QUEUE_DIRECTION_RX, "wrong Q type");
 
@@ -538,13 +538,19 @@ int ena_com_rx_pkt(struct ena_com_io_cq *io_cq,
                return -ENOSPC;
        }
 
-       for (i = 0; i < nb_hw_desc; i++) {
+       cdesc = ena_com_rx_cdesc_idx_to_ptr(io_cq, cdesc_idx);
+       ena_rx_ctx->pkt_offset = cdesc->offset;
+
+       do {
+               ena_buf[i].len = cdesc->length;
+               ena_buf[i].req_id = cdesc->req_id;
+
+               if (++i >= nb_hw_desc)
+                       break;
+
                cdesc = ena_com_rx_cdesc_idx_to_ptr(io_cq, cdesc_idx + i);
 
-               ena_buf->len = cdesc->length;
-               ena_buf->req_id = cdesc->req_id;
-               ena_buf++;
-       }
+       } while (1);
 
        /* Update SQ head ptr */
        io_sq->next_to_comp += nb_hw_desc;
@@ -578,10 +584,10 @@ int ena_com_add_single_rx_desc(struct ena_com_io_sq *io_sq,
 
        desc->length = ena_buf->len;
 
-       desc->ctrl = ENA_ETH_IO_RX_DESC_FIRST_MASK;
-       desc->ctrl |= ENA_ETH_IO_RX_DESC_LAST_MASK;
-       desc->ctrl |= io_sq->phase & ENA_ETH_IO_RX_DESC_PHASE_MASK;
-       desc->ctrl |= ENA_ETH_IO_RX_DESC_COMP_REQ_MASK;
+       desc->ctrl = ENA_ETH_IO_RX_DESC_FIRST_MASK |
+               ENA_ETH_IO_RX_DESC_LAST_MASK |
+               (io_sq->phase & ENA_ETH_IO_RX_DESC_PHASE_MASK) |
+               ENA_ETH_IO_RX_DESC_COMP_REQ_MASK;
 
        desc->req_id = req_id;
 
index 77986c0ea52ca1873e299c54664540dbc30f53a5..8b1afd3b32f26c27987792a92219080a3981262b 100644 (file)
@@ -73,6 +73,7 @@ struct ena_com_rx_ctx {
        u32 hash;
        u16 descs;
        int max_bufs;
+       u8 pkt_offset;
 };
 
 int ena_com_prepare_tx(struct ena_com_io_sq *io_sq,
@@ -95,7 +96,7 @@ static inline void ena_com_unmask_intr(struct ena_com_io_cq *io_cq,
        writel(intr_reg->intr_control, io_cq->unmask_reg);
 }
 
-static inline int ena_com_free_desc(struct ena_com_io_sq *io_sq)
+static inline int ena_com_free_q_entries(struct ena_com_io_sq *io_sq)
 {
        u16 tail, next_to_comp, cnt;
 
@@ -113,7 +114,7 @@ static inline bool ena_com_sq_have_enough_space(struct ena_com_io_sq *io_sq,
        int temp;
 
        if (io_sq->mem_queue_type == ENA_ADMIN_PLACEMENT_POLICY_HOST)
-               return ena_com_free_desc(io_sq) >= required_buffers;
+               return ena_com_free_q_entries(io_sq) >= required_buffers;
 
        /* This calculation doesn't need to be 100% accurate. So to reduce
         * the calculation overhead just Subtract 2 lines from the free descs
@@ -122,7 +123,7 @@ static inline bool ena_com_sq_have_enough_space(struct ena_com_io_sq *io_sq,
         */
        temp = required_buffers / io_sq->llq_info.descs_per_entry + 2;
 
-       return ena_com_free_desc(io_sq) > temp;
+       return ena_com_free_q_entries(io_sq) > temp;
 }
 
 static inline bool ena_com_meta_desc_changed(struct ena_com_io_sq *io_sq,
index 00e0f056a741a7a2271802f59847bda96eb7703a..d105c9c56192df9447a764c04e3fd5b439e62dd3 100644 (file)
@@ -264,7 +264,9 @@ struct ena_eth_io_rx_cdesc_base {
 
        u16 sub_qid;
 
-       u16 reserved;
+       u8 offset;
+
+       u8 reserved;
 };
 
 /* 8-word format */
@@ -412,4 +414,4 @@ struct ena_eth_io_numa_node_cfg_reg {
 #define ENA_ETH_IO_NUMA_NODE_CFG_REG_ENABLED_SHIFT          31
 #define ENA_ETH_IO_NUMA_NODE_CFG_REG_ENABLED_MASK           BIT(31)
 
-#endif /*_ENA_ETH_IO_H_ */
+#endif /* _ENA_ETH_IO_H_ */
index 830d3711d6eef117baf3894de3e1582072b573b5..e340b65af08c52a701bcedb497b45f7debee5675 100644 (file)
@@ -206,7 +206,7 @@ int ena_get_sset_count(struct net_device *netdev, int sset)
        if (sset != ETH_SS_STATS)
                return -EOPNOTSUPP;
 
-       return  adapter->num_io_queues * (ENA_STATS_ARRAY_TX + ENA_STATS_ARRAY_RX)
+       return adapter->num_io_queues * (ENA_STATS_ARRAY_TX + ENA_STATS_ARRAY_RX)
                + ENA_STATS_ARRAY_GLOBAL + ENA_STATS_ARRAY_ENA_COM;
 }
 
@@ -260,7 +260,6 @@ static void ena_get_strings(struct net_device *netdev, u32 sset, u8 *data)
 
        for (i = 0; i < ENA_STATS_ARRAY_GLOBAL; i++) {
                ena_stats = &ena_stats_global_strings[i];
-
                memcpy(data, ena_stats->name, ETH_GSTRING_LEN);
                data += ETH_GSTRING_LEN;
        }
@@ -307,10 +306,8 @@ static int ena_get_coalesce(struct net_device *net_dev,
        struct ena_adapter *adapter = netdev_priv(net_dev);
        struct ena_com_dev *ena_dev = adapter->ena_dev;
 
-       if (!ena_com_interrupt_moderation_supported(ena_dev)) {
-               /* the devie doesn't support interrupt moderation */
+       if (!ena_com_interrupt_moderation_supported(ena_dev))
                return -EOPNOTSUPP;
-       }
 
        coalesce->tx_coalesce_usecs =
                ena_com_get_nonadaptive_moderation_interval_tx(ena_dev) *
@@ -326,7 +323,7 @@ static int ena_get_coalesce(struct net_device *net_dev,
        return 0;
 }
 
-static void ena_update_tx_rings_intr_moderation(struct ena_adapter *adapter)
+static void ena_update_tx_rings_nonadaptive_intr_moderation(struct ena_adapter *adapter)
 {
        unsigned int val;
        int i;
@@ -337,7 +334,7 @@ static void ena_update_tx_rings_intr_moderation(struct ena_adapter *adapter)
                adapter->tx_ring[i].smoothed_interval = val;
 }
 
-static void ena_update_rx_rings_intr_moderation(struct ena_adapter *adapter)
+static void ena_update_rx_rings_nonadaptive_intr_moderation(struct ena_adapter *adapter)
 {
        unsigned int val;
        int i;
@@ -355,24 +352,22 @@ static int ena_set_coalesce(struct net_device *net_dev,
        struct ena_com_dev *ena_dev = adapter->ena_dev;
        int rc;
 
-       if (!ena_com_interrupt_moderation_supported(ena_dev)) {
-               /* the devie doesn't support interrupt moderation */
+       if (!ena_com_interrupt_moderation_supported(ena_dev))
                return -EOPNOTSUPP;
-       }
 
        rc = ena_com_update_nonadaptive_moderation_interval_tx(ena_dev,
                                                               coalesce->tx_coalesce_usecs);
        if (rc)
                return rc;
 
-       ena_update_tx_rings_intr_moderation(adapter);
+       ena_update_tx_rings_nonadaptive_intr_moderation(adapter);
 
        rc = ena_com_update_nonadaptive_moderation_interval_rx(ena_dev,
                                                               coalesce->rx_coalesce_usecs);
        if (rc)
                return rc;
 
-       ena_update_rx_rings_intr_moderation(adapter);
+       ena_update_rx_rings_nonadaptive_intr_moderation(adapter);
 
        if (coalesce->use_adaptive_rx_coalesce &&
            !ena_com_get_adaptive_moderation_enabled(ena_dev))
index 85b87ed02dd56fd2e0c05b375f2c4b8457779e7b..46865d5bd7e70d04848cd9afc085fc8ee444781d 100644 (file)
@@ -1435,6 +1435,8 @@ static struct sk_buff *ena_rx_skb(struct ena_ring *rx_ring,
 
                skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, rx_info->page,
                                rx_info->page_offset, len, ENA_PAGE_SIZE);
+               /* The offset is non zero only for the first buffer */
+               rx_info->page_offset = 0;
 
                netif_dbg(rx_ring->adapter, rx_status, rx_ring->netdev,
                          "rx skb updated. len %d. data_len %d\n",
@@ -1590,6 +1592,7 @@ static int ena_clean_rx_irq(struct ena_ring *rx_ring, struct napi_struct *napi,
 {
        u16 next_to_clean = rx_ring->next_to_clean;
        struct ena_com_rx_ctx ena_rx_ctx;
+       struct ena_rx_buffer *rx_info;
        struct ena_adapter *adapter;
        u32 res_budget, work_done;
        int rx_copybreak_pkt = 0;
@@ -1614,6 +1617,7 @@ static int ena_clean_rx_irq(struct ena_ring *rx_ring, struct napi_struct *napi,
                ena_rx_ctx.ena_bufs = rx_ring->ena_bufs;
                ena_rx_ctx.max_bufs = rx_ring->sgl_size;
                ena_rx_ctx.descs = 0;
+               ena_rx_ctx.pkt_offset = 0;
                rc = ena_com_rx_pkt(rx_ring->ena_com_io_cq,
                                    rx_ring->ena_com_io_sq,
                                    &ena_rx_ctx);
@@ -1623,6 +1627,9 @@ static int ena_clean_rx_irq(struct ena_ring *rx_ring, struct napi_struct *napi,
                if (unlikely(ena_rx_ctx.descs == 0))
                        break;
 
+               rx_info = &rx_ring->rx_buffer_info[rx_ring->ena_bufs[0].req_id];
+               rx_info->page_offset = ena_rx_ctx.pkt_offset;
+
                netif_dbg(rx_ring->adapter, rx_status, rx_ring->netdev,
                          "rx_poll: q %d got packet from ena. descs #: %d l3 proto %d l4 proto %d hash: %x\n",
                          rx_ring->qid, ena_rx_ctx.descs, ena_rx_ctx.l3_proto,
@@ -1684,7 +1691,7 @@ static int ena_clean_rx_irq(struct ena_ring *rx_ring, struct napi_struct *napi,
 
        rx_ring->next_to_clean = next_to_clean;
 
-       refill_required = ena_com_free_desc(rx_ring->ena_com_io_sq);
+       refill_required = ena_com_free_q_entries(rx_ring->ena_com_io_sq);
        refill_threshold =
                min_t(int, rx_ring->ring_size / ENA_RX_REFILL_THRESH_DIVIDER,
                      ENA_RX_REFILL_THRESH_PACKET);
@@ -2235,7 +2242,7 @@ static int ena_rss_configure(struct ena_adapter *adapter)
                rc = ena_rss_init_default(adapter);
                if (rc && (rc != -EOPNOTSUPP)) {
                        netif_err(adapter, ifup, adapter->netdev,
-                                       "Failed to init RSS rc: %d\n", rc);
+                                 "Failed to init RSS rc: %d\n", rc);
                        return rc;
                }
        }
@@ -2308,7 +2315,7 @@ static int ena_create_io_tx_queue(struct ena_adapter *adapter, int qid)
        if (rc) {
                netif_err(adapter, ifup, adapter->netdev,
                          "Failed to create I/O TX queue num %d rc: %d\n",
-                          qid, rc);
+                         qid, rc);
                return rc;
        }
 
@@ -2457,7 +2464,7 @@ static int create_queues_with_size_backoff(struct ena_adapter *adapter)
         * ones due to past queue allocation failures.
         */
        set_io_rings_size(adapter, adapter->requested_tx_ring_size,
-                       adapter->requested_rx_ring_size);
+                         adapter->requested_rx_ring_size);
 
        while (1) {
                if (ena_xdp_present(adapter)) {
@@ -2498,7 +2505,7 @@ err_setup_tx:
                if (rc != -ENOMEM) {
                        netif_err(adapter, ifup, adapter->netdev,
                                  "Queue creation failed with error code %d\n",
-                                  rc);
+                                 rc);
                        return rc;
                }
 
@@ -2521,7 +2528,7 @@ err_setup_tx:
                        new_rx_ring_size = cur_rx_ring_size / 2;
 
                if (new_tx_ring_size < ENA_MIN_RING_SIZE ||
-                               new_rx_ring_size < ENA_MIN_RING_SIZE) {
+                   new_rx_ring_size < ENA_MIN_RING_SIZE) {
                        netif_err(adapter, ifup, adapter->netdev,
                                  "Queue creation failed with the smallest possible queue size of %d for both queues. Not retrying with smaller queues\n",
                                  ENA_MIN_RING_SIZE);
@@ -3080,8 +3087,7 @@ static u16 ena_select_queue(struct net_device *dev, struct sk_buff *skb,
        return qid;
 }
 
-static void ena_config_host_info(struct ena_com_dev *ena_dev,
-                                struct pci_dev *pdev)
+static void ena_config_host_info(struct ena_com_dev *ena_dev, struct pci_dev *pdev)
 {
        struct ena_admin_host_info *host_info;
        int rc;
@@ -3111,6 +3117,7 @@ static void ena_config_host_info(struct ena_com_dev *ena_dev,
        host_info->num_cpus = num_online_cpus();
 
        host_info->driver_supported_features =
+               ENA_ADMIN_HOST_INFO_RX_OFFSET_MASK |
                ENA_ADMIN_HOST_INFO_INTERRUPT_MODERATION_MASK;
 
        rc = ena_com_set_host_attributes(ena_dev);
@@ -3686,8 +3693,7 @@ static void check_for_empty_rx_ring(struct ena_adapter *adapter)
        for (i = 0; i < adapter->num_io_queues; i++) {
                rx_ring = &adapter->rx_ring[i];
 
-               refill_required =
-                       ena_com_free_desc(rx_ring->ena_com_io_sq);
+               refill_required = ena_com_free_q_entries(rx_ring->ena_com_io_sq);
                if (unlikely(refill_required == (rx_ring->ring_size - 1))) {
                        rx_ring->empty_rx_queue++;
 
@@ -3825,11 +3831,11 @@ static void ena_timer_service(struct timer_list *t)
        mod_timer(&adapter->timer_service, round_jiffies(jiffies + HZ));
 }
 
-static int ena_calc_max_io_queue_num(struct pci_dev *pdev,
+static u32 ena_calc_max_io_queue_num(struct pci_dev *pdev,
                                     struct ena_com_dev *ena_dev,
                                     struct ena_com_dev_get_features_ctx *get_feat_ctx)
 {
-       int io_tx_sq_num, io_tx_cq_num, io_rx_num, max_num_io_queues;
+       u32 io_tx_sq_num, io_tx_cq_num, io_rx_num, max_num_io_queues;
 
        if (ena_dev->supported_features & BIT(ENA_ADMIN_MAX_QUEUES_EXT)) {
                struct ena_admin_queue_ext_feature_fields *max_queue_ext =
@@ -4115,8 +4121,8 @@ static int ena_calc_io_queue_size(struct ena_calc_queue_size_ctx *ctx)
  */
 static int ena_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 {
-       struct ena_com_dev_get_features_ctx get_feat_ctx;
        struct ena_calc_queue_size_ctx calc_queue_ctx = { 0 };
+       struct ena_com_dev_get_features_ctx get_feat_ctx;
        struct ena_llq_configurations llq_config;
        struct ena_com_dev *ena_dev = NULL;
        struct ena_adapter *adapter;
@@ -4160,6 +4166,8 @@ static int ena_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
                goto err_free_region;
        }
 
+       ena_dev->ena_min_poll_delay_us = ENA_ADMIN_POLL_DELAY_US;
+
        ena_dev->dmadev = &pdev->dev;
 
        rc = ena_device_init(ena_dev, pdev, &get_feat_ctx, &wd_state);
@@ -4183,7 +4191,7 @@ static int ena_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
        calc_queue_ctx.get_feat_ctx = &get_feat_ctx;
        calc_queue_ctx.pdev = pdev;
 
-       /* Initial Tx and RX interrupt delay. Assumes 1 usec granularity.
+       /* Initial TX and RX interrupt delay. Assumes 1 usec granularity.
         * Updated during device initialization with the real granularity
         */
        ena_dev->intr_moder_tx_interval = ENA_INTR_INITIAL_TX_INTERVAL_USECS;
@@ -4227,12 +4235,11 @@ static int ena_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 
        adapter->num_io_queues = max_num_io_queues;
        adapter->max_num_io_queues = max_num_io_queues;
+       adapter->last_monitored_tx_qid = 0;
 
        adapter->xdp_first_ring = 0;
        adapter->xdp_num_queues = 0;
 
-       adapter->last_monitored_tx_qid = 0;
-
        adapter->rx_copybreak = ENA_DEFAULT_RX_COPYBREAK;
        adapter->wd_state = wd_state;
 
index 680099afcccf97e20a62aef7e62fca83d1266646..ba030d2609402c522d018436f9171410e6522509 100644 (file)
 #define DRV_MODULE_GEN_SUBMINOR 0
 
 #define DRV_MODULE_NAME                "ena"
-#ifndef DRV_MODULE_GENERATION
-#define DRV_MODULE_GENERATION \
-       __stringify(DRV_MODULE_GEN_MAJOR) "."   \
-       __stringify(DRV_MODULE_GEN_MINOR) "."   \
-       __stringify(DRV_MODULE_GEN_SUBMINOR) "K"
-#endif
 
 #define DEVICE_NAME    "Elastic Network Adapter (ENA)"
 
 #define ENA_RX_RSS_TABLE_LOG_SIZE  7
 #define ENA_RX_RSS_TABLE_SIZE  (1 << ENA_RX_RSS_TABLE_LOG_SIZE)
 
-#define ENA_HASH_KEY_SIZE      40
-
 /* The number of tx packet completions that will be handled each NAPI poll
  * cycle is ring_size / ENA_TX_POLL_BUDGET_DIVIDER.
  */
 #define ENA_IO_IRQ_FIRST_IDX           1
 #define ENA_IO_IRQ_IDX(q)              (ENA_IO_IRQ_FIRST_IDX + (q))
 
+#define ENA_ADMIN_POLL_DELAY_US 100
+
 /* ENA device should send keep alive msg every 1 sec.
  * We wait for 6 sec just to be on the safe side.
  */
index 04fcafcc059c471f28e8c07d71646b6eb3658721..b514bb1b855d09837355f4111d59c15c574f5583 100644 (file)
@@ -154,4 +154,4 @@ enum ena_regs_reset_reason_types {
 #define ENA_REGS_RSS_IND_ENTRY_UPDATE_CQ_IDX_SHIFT          16
 #define ENA_REGS_RSS_IND_ENTRY_UPDATE_CQ_IDX_MASK           0xffff0000
 
-#endif /*_ENA_REGS_H_ */
+#endif /* _ENA_REGS_H_ */
index 86fc77d85fdae9e3090488dc2a65f4d173e56ca4..743d3b13b39d7d4fe9072b3cfd9996ee894e2644 100644 (file)
@@ -88,13 +88,13 @@ static const char aq_ethtool_stat_names[][ETH_GSTRING_LEN] = {
        "InDroppedDma",
 };
 
-static const char aq_ethtool_queue_stat_names[][ETH_GSTRING_LEN] = {
-       "Queue[%d] InPackets",
-       "Queue[%d] OutPackets",
-       "Queue[%d] Restarts",
-       "Queue[%d] InJumboPackets",
-       "Queue[%d] InLroPackets",
-       "Queue[%d] InErrors",
+static const char * const aq_ethtool_queue_stat_names[] = {
+       "%sQueue[%d] InPackets",
+       "%sQueue[%d] OutPackets",
+       "%sQueue[%d] Restarts",
+       "%sQueue[%d] InJumboPackets",
+       "%sQueue[%d] InLroPackets",
+       "%sQueue[%d] InErrors",
 };
 
 #if IS_ENABLED(CONFIG_MACSEC)
@@ -166,7 +166,8 @@ static u32 aq_ethtool_n_stats(struct net_device *ndev)
        struct aq_nic_s *nic = netdev_priv(ndev);
        struct aq_nic_cfg_s *cfg = aq_nic_get_cfg(nic);
        u32 n_stats = ARRAY_SIZE(aq_ethtool_stat_names) +
-                     ARRAY_SIZE(aq_ethtool_queue_stat_names) * cfg->vecs;
+                     ARRAY_SIZE(aq_ethtool_queue_stat_names) * cfg->vecs *
+                       cfg->tcs;
 
 #if IS_ENABLED(CONFIG_MACSEC)
        if (nic->macsec_cfg) {
@@ -223,7 +224,7 @@ static void aq_ethtool_get_drvinfo(struct net_device *ndev,
 static void aq_ethtool_get_strings(struct net_device *ndev,
                                   u32 stringset, u8 *data)
 {
-       struct aq_nic_s *aq_nic = netdev_priv(ndev);
+       struct aq_nic_s *nic = netdev_priv(ndev);
        struct aq_nic_cfg_s *cfg;
        u8 *p = data;
        int i, si;
@@ -231,24 +232,35 @@ static void aq_ethtool_get_strings(struct net_device *ndev,
        int sa;
 #endif
 
-       cfg = aq_nic_get_cfg(aq_nic);
+       cfg = aq_nic_get_cfg(nic);
 
        switch (stringset) {
-       case ETH_SS_STATS:
+       case ETH_SS_STATS: {
+               const int stat_cnt = ARRAY_SIZE(aq_ethtool_queue_stat_names);
+               char tc_string[8];
+               int tc;
+
+               memset(tc_string, 0, sizeof(tc_string));
                memcpy(p, aq_ethtool_stat_names,
                       sizeof(aq_ethtool_stat_names));
                p = p + sizeof(aq_ethtool_stat_names);
-               for (i = 0; i < cfg->vecs; i++) {
-                       for (si = 0;
-                               si < ARRAY_SIZE(aq_ethtool_queue_stat_names);
-                               si++) {
-                               snprintf(p, ETH_GSTRING_LEN,
-                                        aq_ethtool_queue_stat_names[si], i);
-                               p += ETH_GSTRING_LEN;
+
+               for (tc = 0; tc < cfg->tcs; tc++) {
+                       if (cfg->is_qos)
+                               snprintf(tc_string, 8, "TC%d ", tc);
+
+                       for (i = 0; i < cfg->vecs; i++) {
+                               for (si = 0; si < stat_cnt; si++) {
+                                       snprintf(p, ETH_GSTRING_LEN,
+                                            aq_ethtool_queue_stat_names[si],
+                                            tc_string,
+                                            AQ_NIC_CFG_TCVEC2RING(cfg, tc, i));
+                                       p += ETH_GSTRING_LEN;
+                               }
                        }
                }
 #if IS_ENABLED(CONFIG_MACSEC)
-               if (!aq_nic->macsec_cfg)
+               if (!nic->macsec_cfg)
                        break;
 
                memcpy(p, aq_macsec_stat_names, sizeof(aq_macsec_stat_names));
@@ -256,7 +268,7 @@ static void aq_ethtool_get_strings(struct net_device *ndev,
                for (i = 0; i < AQ_MACSEC_MAX_SC; i++) {
                        struct aq_macsec_txsc *aq_txsc;
 
-                       if (!(test_bit(i, &aq_nic->macsec_cfg->txsc_idx_busy)))
+                       if (!(test_bit(i, &nic->macsec_cfg->txsc_idx_busy)))
                                continue;
 
                        for (si = 0;
@@ -266,7 +278,7 @@ static void aq_ethtool_get_strings(struct net_device *ndev,
                                         aq_macsec_txsc_stat_names[si], i);
                                p += ETH_GSTRING_LEN;
                        }
-                       aq_txsc = &aq_nic->macsec_cfg->aq_txsc[i];
+                       aq_txsc = &nic->macsec_cfg->aq_txsc[i];
                        for (sa = 0; sa < MACSEC_NUM_AN; sa++) {
                                if (!(test_bit(sa, &aq_txsc->tx_sa_idx_busy)))
                                        continue;
@@ -283,10 +295,10 @@ static void aq_ethtool_get_strings(struct net_device *ndev,
                for (i = 0; i < AQ_MACSEC_MAX_SC; i++) {
                        struct aq_macsec_rxsc *aq_rxsc;
 
-                       if (!(test_bit(i, &aq_nic->macsec_cfg->rxsc_idx_busy)))
+                       if (!(test_bit(i, &nic->macsec_cfg->rxsc_idx_busy)))
                                continue;
 
-                       aq_rxsc = &aq_nic->macsec_cfg->aq_rxsc[i];
+                       aq_rxsc = &nic->macsec_cfg->aq_rxsc[i];
                        for (sa = 0; sa < MACSEC_NUM_AN; sa++) {
                                if (!(test_bit(sa, &aq_rxsc->rx_sa_idx_busy)))
                                        continue;
@@ -302,6 +314,7 @@ static void aq_ethtool_get_strings(struct net_device *ndev,
                }
 #endif
                break;
+       }
        case ETH_SS_PRIV_FLAGS:
                memcpy(p, aq_ethtool_priv_flag_names,
                       sizeof(aq_ethtool_priv_flag_names));
@@ -780,8 +793,6 @@ static int aq_set_ringparam(struct net_device *ndev,
                dev_close(ndev);
        }
 
-       aq_nic_free_vectors(aq_nic);
-
        cfg->rxds = max(ring->rx_pending, hw_caps->rxds_min);
        cfg->rxds = min(cfg->rxds, hw_caps->rxds_max);
        cfg->rxds = ALIGN(cfg->rxds, AQ_HW_RXD_MULTIPLE);
@@ -790,15 +801,10 @@ static int aq_set_ringparam(struct net_device *ndev,
        cfg->txds = min(cfg->txds, hw_caps->txds_max);
        cfg->txds = ALIGN(cfg->txds, AQ_HW_TXD_MULTIPLE);
 
-       for (aq_nic->aq_vecs = 0; aq_nic->aq_vecs < cfg->vecs;
-            aq_nic->aq_vecs++) {
-               aq_nic->aq_vec[aq_nic->aq_vecs] =
-                   aq_vec_alloc(aq_nic, aq_nic->aq_vecs, cfg);
-               if (unlikely(!aq_nic->aq_vec[aq_nic->aq_vecs])) {
-                       err = -ENOMEM;
-                       goto err_exit;
-               }
-       }
+       err = aq_nic_realloc_vectors(aq_nic);
+       if (err)
+               goto err_exit;
+
        if (ndev_running)
                err = dev_open(ndev, NULL);
 
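
The aq_ethtool.c hunks above switch the queue stat templates to "%sQueue[%d] ..." so that each string can carry an optional "TC<N> " prefix and a per-TC ring index computed by AQ_NIC_CFG_TCVEC2RING. A minimal standalone sketch of that string composition, assuming 4-TC mode (8 rings per TC) and a hard-coded ETH_GSTRING_LEN of 32:

/* Sketch of the per-TC stat string composition in aq_ethtool_get_strings().
 * Values are hard-coded purely for illustration.
 */
#include <stdio.h>

#define ETH_GSTRING_LEN 32
#define RING_PER_TC 8	/* 4-TC mode maps 8 rings per TC */

static const char * const queue_stat_names[] = {
	"%sQueue[%d] InPackets",
	"%sQueue[%d] OutPackets",
};

int main(void)
{
	char buf[ETH_GSTRING_LEN];
	int tcs = 2, vecs = 2, tc, vec, si;

	for (tc = 0; tc < tcs; tc++) {
		char tc_string[8] = "";

		/* In the driver the prefix only appears when QoS is enabled. */
		snprintf(tc_string, sizeof(tc_string), "TC%d ", tc);

		for (vec = 0; vec < vecs; vec++)
			for (si = 0; si < 2; si++) {
				snprintf(buf, ETH_GSTRING_LEN,
					 queue_stat_names[si],
					 tc_string,
					 tc * RING_PER_TC + vec);
				puts(buf);	/* e.g. "TC1 Queue[9] InPackets" */
			}
	}
	return 0;
}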
index 03ff92bc4a7fb11c97e4e128609ada081b0dfb44..1bc4d33a0ce5410e78167df88363254d03c5e78d 100644 (file)
@@ -153,6 +153,8 @@ aq_check_approve_fvlan(struct aq_nic_s *aq_nic,
                       struct aq_hw_rx_fltrs_s *rx_fltrs,
                       struct ethtool_rx_flow_spec *fsp)
 {
+       struct aq_nic_cfg_s *cfg = &aq_nic->aq_nic_cfg;
+
        if (fsp->location < AQ_RX_FIRST_LOC_FVLANID ||
            fsp->location > AQ_RX_LAST_LOC_FVLANID) {
                netdev_err(aq_nic->ndev,
@@ -170,10 +172,10 @@ aq_check_approve_fvlan(struct aq_nic_s *aq_nic,
                return -EINVAL;
        }
 
-       if (fsp->ring_cookie > aq_nic->aq_nic_cfg.num_rss_queues) {
+       if (fsp->ring_cookie > cfg->num_rss_queues * cfg->tcs) {
                netdev_err(aq_nic->ndev,
                           "ethtool: queue number must be in range [0, %d]",
-                          aq_nic->aq_nic_cfg.num_rss_queues - 1);
+                          cfg->num_rss_queues * cfg->tcs - 1);
                return -EINVAL;
        }
        return 0;
@@ -262,6 +264,7 @@ static bool __must_check
 aq_rule_is_not_correct(struct aq_nic_s *aq_nic,
                       struct ethtool_rx_flow_spec *fsp)
 {
+       struct aq_nic_cfg_s *cfg = &aq_nic->aq_nic_cfg;
        bool rule_is_not_correct = false;
 
        if (!aq_nic) {
@@ -274,11 +277,11 @@ aq_rule_is_not_correct(struct aq_nic_s *aq_nic,
        } else if (aq_check_filter(aq_nic, fsp)) {
                rule_is_not_correct = true;
        } else if (fsp->ring_cookie != RX_CLS_FLOW_DISC) {
-               if (fsp->ring_cookie >= aq_nic->aq_nic_cfg.num_rss_queues) {
+               if (fsp->ring_cookie >= cfg->num_rss_queues * cfg->tcs) {
                        netdev_err(aq_nic->ndev,
                                   "ethtool: The specified action is invalid.\n"
                                   "Maximum allowable value action is %u.\n",
-                                  aq_nic->aq_nic_cfg.num_rss_queues - 1);
+                                  cfg->num_rss_queues * cfg->tcs - 1);
                        rule_is_not_correct = true;
                }
        }
index 03fea9469f01373aa6155373f7da97de2d6a5bb0..ed5b465bc6640d00b29f1cc388874bbaf523d29f 100644 (file)
 #define AQ_HW_MAC_COUNTER_HZ   312500000ll
 #define AQ_HW_PHY_COUNTER_HZ   160000000ll
 
+enum aq_tc_mode {
+       AQ_TC_MODE_INVALID = -1,
+       AQ_TC_MODE_8TCS,
+       AQ_TC_MODE_4TCS,
+};
+
 #define AQ_RX_FIRST_LOC_FVLANID     0U
 #define AQ_RX_LAST_LOC_FVLANID    15U
 #define AQ_RX_FIRST_LOC_FETHERT    16U
@@ -29,6 +35,9 @@
                        (AQ_RX_LAST_LOC_FVLANID - AQ_RX_FIRST_LOC_FVLANID + 1U)
 #define AQ_RX_QUEUE_NOT_ASSIGNED   0xFFU
 
+/* Used for rate to Mbps conversion */
+#define AQ_MBPS_DIVISOR         125000 /* 1000000 / 8 */
+
 /* NIC H/W capabilities */
 struct aq_hw_caps_s {
        u64 hw_features;
@@ -46,7 +55,7 @@ struct aq_hw_caps_s {
        u32 mac_regs_count;
        u32 hw_alive_check_addr;
        u8 msix_irqs;
-       u8 tcs;
+       u8 tcs_max;
        u8 rxd_alignment;
        u8 rxd_size;
        u8 txd_alignment;
@@ -118,8 +127,11 @@ struct aq_stats_s {
 #define AQ_HW_TXD_MULTIPLE 8U
 #define AQ_HW_RXD_MULTIPLE 8U
 
+#define AQ_HW_QUEUES_MAX                32U
 #define AQ_HW_MULTICAST_ADDRESS_MAX     32U
 
+#define AQ_HW_PTP_TC                    2U
+
 #define AQ_HW_LED_BLINK    0x2U
 #define AQ_HW_LED_DEFAULT  0x0U
 
@@ -268,6 +280,8 @@ struct aq_hw_ops {
        int (*hw_rss_hash_set)(struct aq_hw_s *self,
                               struct aq_rss_parameters *rss_params);
 
+       int (*hw_tc_rate_limit_set)(struct aq_hw_s *self);
+
        int (*hw_get_regs)(struct aq_hw_s *self,
                           const struct aq_hw_caps_s *aq_hw_caps,
                           u32 *regs_buff);
@@ -279,10 +293,6 @@ struct aq_hw_ops {
        int (*hw_set_offload)(struct aq_hw_s *self,
                              struct aq_nic_cfg_s *aq_nic_cfg);
 
-       int (*hw_tx_tc_mode_get)(struct aq_hw_s *self, u32 *tc_mode);
-
-       int (*hw_rx_tc_mode_get)(struct aq_hw_s *self, u32 *tc_mode);
-
        int (*hw_ring_hwts_rx_fill)(struct aq_hw_s *self,
                                    struct aq_ring_s *aq_ring);
 
index 7dbf49adcea6c0436d1fcba5f68a49de01ea3c27..342c5179f846920bb0eea684bbbce9c119c889b8 100644 (file)
@@ -79,3 +79,29 @@ int aq_hw_err_from_flags(struct aq_hw_s *hw)
 err_exit:
        return err;
 }
+
+int aq_hw_num_tcs(struct aq_hw_s *hw)
+{
+       switch (hw->aq_nic_cfg->tc_mode) {
+       case AQ_TC_MODE_8TCS:
+               return 8;
+       case AQ_TC_MODE_4TCS:
+               return 4;
+       default:
+               break;
+       }
+
+       return 1;
+}
+
+int aq_hw_q_per_tc(struct aq_hw_s *hw)
+{
+       switch (hw->aq_nic_cfg->tc_mode) {
+       case AQ_TC_MODE_8TCS:
+               return 4;
+       case AQ_TC_MODE_4TCS:
+               return 8;
+       default:
+               return 4;
+       }
+}
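
The two helpers added above encode the hardware's two traffic class layouts: 8 TCs with 4 queues each, or 4 TCs with 8 queues each, both totalling the AQ_HW_QUEUES_MAX of 32 introduced in aq_hw.h. A tiny sketch that mirrors the helpers and checks that invariant:

/* Mirror of aq_hw_num_tcs()/aq_hw_q_per_tc(), showing that
 * num_tcs * queues_per_tc == 32 (AQ_HW_QUEUES_MAX) in either TC mode.
 */
#include <assert.h>
#include <stdio.h>

enum aq_tc_mode { AQ_TC_MODE_INVALID = -1, AQ_TC_MODE_8TCS, AQ_TC_MODE_4TCS };

static int num_tcs(enum aq_tc_mode m)
{
	return m == AQ_TC_MODE_8TCS ? 8 : m == AQ_TC_MODE_4TCS ? 4 : 1;
}

static int q_per_tc(enum aq_tc_mode m)
{
	return m == AQ_TC_MODE_4TCS ? 8 : 4;
}

int main(void)
{
	enum aq_tc_mode modes[] = { AQ_TC_MODE_8TCS, AQ_TC_MODE_4TCS };

	for (int i = 0; i < 2; i++) {
		int total = num_tcs(modes[i]) * q_per_tc(modes[i]);

		printf("mode %d: %d TCs x %d queues = %d\n", (int)modes[i],
		       num_tcs(modes[i]), q_per_tc(modes[i]), total);
		assert(total == 32);	/* AQ_HW_QUEUES_MAX */
	}
	return 0;
}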
index 9ef82d487e018acc48953303d9df4f07fbea8c09..32aa5f2fb84060d9d3985b57c9f2602778daa6ac 100644 (file)
@@ -34,5 +34,7 @@ u32 aq_hw_read_reg(struct aq_hw_s *hw, u32 reg);
 void aq_hw_write_reg(struct aq_hw_s *hw, u32 reg, u32 value);
 u64 aq_hw_read_reg64(struct aq_hw_s *hw, u32 reg);
 int aq_hw_err_from_flags(struct aq_hw_s *hw);
+int aq_hw_num_tcs(struct aq_hw_s *hw);
+int aq_hw_q_per_tc(struct aq_hw_s *hw);
 
 #endif /* AQ_HW_UTILS_H */
index 91870ceaf3fe0a5b8eb442883929a2ee8517c3c6..4a6dfac857ca962575ad0e8810c083313016e73a 100644 (file)
@@ -478,7 +478,7 @@ static int aq_mdo_add_secy(struct macsec_context *ctx)
 
        set_bit(txsc_idx, &cfg->txsc_idx_busy);
 
-       return 0;
+       return ret;
 }
 
 static int aq_mdo_upd_secy(struct macsec_context *ctx)
index 9fcab646cbd5af0b40756c4810c8c0dba9170613..8a1da044e9086254f9c005947b3768d77abee505 100644 (file)
 #include "aq_ethtool.h"
 #include "aq_ptp.h"
 #include "aq_filters.h"
+#include "aq_hw_utils.h"
 
 #include <linux/netdevice.h>
 #include <linux/module.h>
 #include <linux/ip.h>
 #include <linux/udp.h>
+#include <net/pkt_cls.h>
 
 MODULE_LICENSE("GPL v2");
 MODULE_AUTHOR(AQ_CFG_DRV_AUTHOR);
@@ -38,7 +40,7 @@ struct net_device *aq_ndev_alloc(void)
        struct net_device *ndev = NULL;
        struct aq_nic_s *aq_nic = NULL;
 
-       ndev = alloc_etherdev_mq(sizeof(struct aq_nic_s), AQ_CFG_VECS_MAX);
+       ndev = alloc_etherdev_mq(sizeof(struct aq_nic_s), AQ_HW_QUEUES_MAX);
        if (!ndev)
                return NULL;
 
@@ -330,6 +332,73 @@ static int aq_ndo_vlan_rx_kill_vid(struct net_device *ndev, __be16 proto,
        return 0;
 }
 
+static int aq_validate_mqprio_opt(struct aq_nic_s *self,
+                                 struct tc_mqprio_qopt_offload *mqprio,
+                                 const unsigned int num_tc)
+{
+       const bool has_min_rate = !!(mqprio->flags & TC_MQPRIO_F_MIN_RATE);
+       struct aq_nic_cfg_s *aq_nic_cfg = aq_nic_get_cfg(self);
+       const unsigned int tcs_max = min_t(u8, aq_nic_cfg->aq_hw_caps->tcs_max,
+                                          AQ_CFG_TCS_MAX);
+
+       if (num_tc > tcs_max) {
+               netdev_err(self->ndev, "Too many TCs requested\n");
+               return -EOPNOTSUPP;
+       }
+
+       if (num_tc != 0 && !is_power_of_2(num_tc)) {
+               netdev_err(self->ndev, "TC count should be power of 2\n");
+               return -EOPNOTSUPP;
+       }
+
+       if (has_min_rate && !ATL_HW_IS_CHIP_FEATURE(self->aq_hw, ANTIGUA)) {
+               netdev_err(self->ndev, "Min tx rate is not supported\n");
+               return -EOPNOTSUPP;
+       }
+
+       return 0;
+}
+
+static int aq_ndo_setup_tc(struct net_device *dev, enum tc_setup_type type,
+                          void *type_data)
+{
+       struct tc_mqprio_qopt_offload *mqprio = type_data;
+       struct aq_nic_s *aq_nic = netdev_priv(dev);
+       bool has_min_rate;
+       bool has_max_rate;
+       int err;
+       int i;
+
+       if (type != TC_SETUP_QDISC_MQPRIO)
+               return -EOPNOTSUPP;
+
+       has_min_rate = !!(mqprio->flags & TC_MQPRIO_F_MIN_RATE);
+       has_max_rate = !!(mqprio->flags & TC_MQPRIO_F_MAX_RATE);
+
+       err = aq_validate_mqprio_opt(aq_nic, mqprio, mqprio->qopt.num_tc);
+       if (err)
+               return err;
+
+       for (i = 0; i < mqprio->qopt.num_tc; i++) {
+               if (has_max_rate) {
+                       u64 max_rate = mqprio->max_rate[i];
+
+                       do_div(max_rate, AQ_MBPS_DIVISOR);
+                       aq_nic_setup_tc_max_rate(aq_nic, i, (u32)max_rate);
+               }
+
+               if (has_min_rate) {
+                       u64 min_rate = mqprio->min_rate[i];
+
+                       do_div(min_rate, AQ_MBPS_DIVISOR);
+                       aq_nic_setup_tc_min_rate(aq_nic, i, (u32)min_rate);
+               }
+       }
+
+       return aq_nic_setup_tc_mqprio(aq_nic, mqprio->qopt.num_tc,
+                                     mqprio->qopt.prio_tc_map);
+}
+
 static const struct net_device_ops aq_ndev_ops = {
        .ndo_open = aq_ndev_open,
        .ndo_stop = aq_ndev_close,
@@ -341,6 +410,7 @@ static const struct net_device_ops aq_ndev_ops = {
        .ndo_do_ioctl = aq_ndev_ioctl,
        .ndo_vlan_rx_add_vid = aq_ndo_vlan_rx_add_vid,
        .ndo_vlan_rx_kill_vid = aq_ndo_vlan_rx_kill_vid,
+       .ndo_setup_tc = aq_ndo_setup_tc,
 };
 
 static int __init aq_ndev_init_module(void)
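
In the new aq_ndo_setup_tc() above, the mqprio min/max rates are divided by AQ_MBPS_DIVISOR (125000) before being handed to aq_nic_setup_tc_*_rate(). Assuming the offload rates arrive in bytes per second (as the tc tool supplies them), this yields Mbit/s. A worked example of that conversion:

/* Worked example of the AQ_MBPS_DIVISOR conversion, assuming the mqprio
 * offload rates are expressed in bytes per second.
 */
#include <inttypes.h>
#include <stdio.h>

#define AQ_MBPS_DIVISOR 125000	/* 1000000 / 8 */

int main(void)
{
	/* e.g. "max_rate 2.5Gbit" arrives as 312500000 bytes/s */
	uint64_t max_rate_bytes = 312500000ULL;
	uint32_t mbps = (uint32_t)(max_rate_bytes / AQ_MBPS_DIVISOR);

	printf("%" PRIu64 " bytes/s -> %u Mbit/s\n", max_rate_bytes, mbps);
	return 0;	/* prints 2500 Mbit/s */
}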
index 1c6d12deb47a74a1f1797b94df87340e127ab361..4435c6374f7e052cb972c58b7cf7db8fab44bb9f 100644 (file)
@@ -26,6 +26,7 @@
 #include <linux/ip.h>
 #include <linux/tcp.h>
 #include <net/ip.h>
+#include <net/pkt_cls.h>
 
 static unsigned int aq_itr = AQ_CFG_INTERRUPT_MODERATION_AUTO;
 module_param_named(aq_itr, aq_itr, uint, 0644);
@@ -64,10 +65,38 @@ static void aq_nic_rss_init(struct aq_nic_s *self, unsigned int num_rss_queues)
                rss_params->indirection_table[i] = i & (num_rss_queues - 1);
 }
 
+/* Recalculate the number of vectors */
+static void aq_nic_cfg_update_num_vecs(struct aq_nic_s *self)
+{
+       struct aq_nic_cfg_s *cfg = &self->aq_nic_cfg;
+
+       cfg->vecs = min(cfg->aq_hw_caps->vecs, AQ_CFG_VECS_DEF);
+       cfg->vecs = min(cfg->vecs, num_online_cpus());
+       if (self->irqvecs > AQ_HW_SERVICE_IRQS)
+               cfg->vecs = min(cfg->vecs, self->irqvecs - AQ_HW_SERVICE_IRQS);
+       /* cfg->vecs should be power of 2 for RSS */
+       cfg->vecs = rounddown_pow_of_two(cfg->vecs);
+
+       if (ATL_HW_IS_CHIP_FEATURE(self->aq_hw, ANTIGUA)) {
+               if (cfg->tcs > 2)
+                       cfg->vecs = min(cfg->vecs, 4U);
+       }
+
+       if (cfg->vecs <= 4)
+               cfg->tc_mode = AQ_TC_MODE_8TCS;
+       else
+               cfg->tc_mode = AQ_TC_MODE_4TCS;
+
+       /*rss rings */
+       cfg->num_rss_queues = min(cfg->vecs, AQ_CFG_NUM_RSS_QUEUES_DEF);
+       aq_nic_rss_init(self, cfg->num_rss_queues);
+}
+
 /* Checks hw_caps and 'corrects' aq_nic_cfg in runtime */
 void aq_nic_cfg_start(struct aq_nic_s *self)
 {
        struct aq_nic_cfg_s *cfg = &self->aq_nic_cfg;
+       int i;
 
        cfg->tcs = AQ_CFG_TCS_DEF;
 
@@ -79,7 +108,6 @@ void aq_nic_cfg_start(struct aq_nic_s *self)
 
        cfg->rxpageorder = AQ_CFG_RX_PAGEORDER;
        cfg->is_rss = AQ_CFG_IS_RSS_DEF;
-       cfg->num_rss_queues = AQ_CFG_NUM_RSS_QUEUES_DEF;
        cfg->aq_rss.base_cpu_number = AQ_CFG_RSS_BASE_CPU_NUM_DEF;
        cfg->fc.req = AQ_CFG_FC_MODE;
        cfg->wol = AQ_CFG_WOL_MODES;
@@ -89,29 +117,13 @@ void aq_nic_cfg_start(struct aq_nic_s *self)
        cfg->is_autoneg = AQ_CFG_IS_AUTONEG_DEF;
 
        cfg->is_lro = AQ_CFG_IS_LRO_DEF;
+       cfg->is_ptp = true;
 
        /*descriptors */
        cfg->rxds = min(cfg->aq_hw_caps->rxds_max, AQ_CFG_RXDS_DEF);
        cfg->txds = min(cfg->aq_hw_caps->txds_max, AQ_CFG_TXDS_DEF);
 
-       /*rss rings */
-       cfg->vecs = min(cfg->aq_hw_caps->vecs, AQ_CFG_VECS_DEF);
-       cfg->vecs = min(cfg->vecs, num_online_cpus());
-       if (self->irqvecs > AQ_HW_SERVICE_IRQS)
-               cfg->vecs = min(cfg->vecs, self->irqvecs - AQ_HW_SERVICE_IRQS);
-       /* cfg->vecs should be power of 2 for RSS */
-       if (cfg->vecs >= 8U)
-               cfg->vecs = 8U;
-       else if (cfg->vecs >= 4U)
-               cfg->vecs = 4U;
-       else if (cfg->vecs >= 2U)
-               cfg->vecs = 2U;
-       else
-               cfg->vecs = 1U;
-
-       cfg->num_rss_queues = min(cfg->vecs, AQ_CFG_NUM_RSS_QUEUES_DEF);
-
-       aq_nic_rss_init(self, cfg->num_rss_queues);
+       aq_nic_cfg_update_num_vecs(self);
 
        cfg->irq_type = aq_pci_func_get_irq_type(self);
 
@@ -136,6 +148,9 @@ void aq_nic_cfg_start(struct aq_nic_s *self)
        cfg->is_vlan_rx_strip = !!(cfg->features & NETIF_F_HW_VLAN_CTAG_RX);
        cfg->is_vlan_tx_insert = !!(cfg->features & NETIF_F_HW_VLAN_CTAG_TX);
        cfg->is_vlan_force_promisc = true;
+
+       for (i = 0; i < sizeof(cfg->prio_tc_map); i++)
+               cfg->prio_tc_map[i] = cfg->tcs * i / 8;
 }
 
 static int aq_nic_update_link_status(struct aq_nic_s *self)
@@ -181,6 +196,9 @@ static int aq_nic_update_link_status(struct aq_nic_s *self)
 #if IS_ENABLED(CONFIG_MACSEC)
                aq_macsec_enable(self);
 #endif
+               if (self->aq_hw_ops->hw_tc_rate_limit_set)
+                       self->aq_hw_ops->hw_tc_rate_limit_set(self->aq_hw);
+
                netif_tx_wake_all_queues(self->ndev);
        }
        if (netif_carrier_ok(self->ndev) && !self->link_status.mbps) {
@@ -399,21 +417,29 @@ int aq_nic_init(struct aq_nic_s *self)
                err = aq_phy_init(self->aq_hw);
        }
 
-       for (i = 0U, aq_vec = self->aq_vec[0];
-               self->aq_vecs > i; ++i, aq_vec = self->aq_vec[i])
+       for (i = 0U; i < self->aq_vecs; i++) {
+               aq_vec = self->aq_vec[i];
+               err = aq_vec_ring_alloc(aq_vec, self, i,
+                                       aq_nic_get_cfg(self));
+               if (err)
+                       goto err_exit;
+
                aq_vec_init(aq_vec, self->aq_hw_ops, self->aq_hw);
+       }
 
-       err = aq_ptp_init(self, self->irqvecs - 1);
-       if (err < 0)
-               goto err_exit;
+       if (aq_nic_get_cfg(self)->is_ptp) {
+               err = aq_ptp_init(self, self->irqvecs - 1);
+               if (err < 0)
+                       goto err_exit;
 
-       err = aq_ptp_ring_alloc(self);
-       if (err < 0)
-               goto err_exit;
+               err = aq_ptp_ring_alloc(self);
+               if (err < 0)
+                       goto err_exit;
 
-       err = aq_ptp_ring_init(self);
-       if (err < 0)
-               goto err_exit;
+               err = aq_ptp_ring_init(self);
+               if (err < 0)
+                       goto err_exit;
+       }
 
        netif_carrier_off(self->ndev);
 
@@ -424,9 +450,12 @@ err_exit:
 int aq_nic_start(struct aq_nic_s *self)
 {
        struct aq_vec_s *aq_vec = NULL;
+       struct aq_nic_cfg_s *cfg;
        unsigned int i = 0U;
        int err = 0;
 
+       cfg = aq_nic_get_cfg(self);
+
        err = self->aq_hw_ops->hw_multicast_list_set(self->aq_hw,
                                                     self->mc_list.ar,
                                                     self->mc_list.count);
@@ -464,7 +493,7 @@ int aq_nic_start(struct aq_nic_s *self)
        timer_setup(&self->service_timer, aq_nic_service_timer_cb, 0);
        aq_nic_service_timer_cb(&self->service_timer);
 
-       if (self->aq_nic_cfg.is_polling) {
+       if (cfg->is_polling) {
                timer_setup(&self->polling_timer, aq_nic_polling_timer_cb, 0);
                mod_timer(&self->polling_timer, jiffies +
                          AQ_CFG_POLLING_TIMER_INTERVAL);
@@ -482,16 +511,16 @@ int aq_nic_start(struct aq_nic_s *self)
                if (err < 0)
                        goto err_exit;
 
-               if (self->aq_nic_cfg.link_irq_vec) {
+               if (cfg->link_irq_vec) {
                        int irqvec = pci_irq_vector(self->pdev,
-                                                  self->aq_nic_cfg.link_irq_vec);
+                                                   cfg->link_irq_vec);
                        err = request_threaded_irq(irqvec, NULL,
                                                   aq_linkstate_threaded_isr,
                                                   IRQF_SHARED | IRQF_ONESHOT,
                                                   self->ndev->name, self);
                        if (err < 0)
                                goto err_exit;
-                       self->msix_entry_mask |= (1 << self->aq_nic_cfg.link_irq_vec);
+                       self->msix_entry_mask |= (1 << cfg->link_irq_vec);
                }
 
                err = self->aq_hw_ops->hw_irq_enable(self->aq_hw,
@@ -500,14 +529,21 @@ int aq_nic_start(struct aq_nic_s *self)
                        goto err_exit;
        }
 
-       err = netif_set_real_num_tx_queues(self->ndev, self->aq_vecs);
+       err = netif_set_real_num_tx_queues(self->ndev,
+                                          self->aq_vecs * cfg->tcs);
        if (err < 0)
                goto err_exit;
 
-       err = netif_set_real_num_rx_queues(self->ndev, self->aq_vecs);
+       err = netif_set_real_num_rx_queues(self->ndev,
+                                          self->aq_vecs * cfg->tcs);
        if (err < 0)
                goto err_exit;
 
+       for (i = 0; i < cfg->tcs; i++) {
+               u16 offset = self->aq_vecs * i;
+
+               netdev_set_tc_queue(self->ndev, i, self->aq_vecs, offset);
+       }
        netif_tx_start_all_queues(self->ndev);
 
 err_exit:
@@ -518,6 +554,8 @@ unsigned int aq_nic_map_skb(struct aq_nic_s *self, struct sk_buff *skb,
                            struct aq_ring_s *ring)
 {
        unsigned int nr_frags = skb_shinfo(skb)->nr_frags;
+       struct aq_nic_cfg_s *cfg = aq_nic_get_cfg(self);
+       struct device *dev = aq_nic_get_dev(self);
        struct aq_ring_buff_s *first = NULL;
        u8 ipver = ip_hdr(skb)->version;
        struct aq_ring_buff_s *dx_buff;
@@ -559,7 +597,7 @@ unsigned int aq_nic_map_skb(struct aq_nic_s *self, struct sk_buff *skb,
                need_context_tag = true;
        }
 
-       if (self->aq_nic_cfg.is_vlan_tx_insert && skb_vlan_tag_present(skb)) {
+       if (cfg->is_vlan_tx_insert && skb_vlan_tag_present(skb)) {
                dx_buff->vlan_tx_tag = skb_vlan_tag_get(skb);
                dx_buff->len_pkt = skb->len;
                dx_buff->is_vlan = 1U;
@@ -574,12 +612,12 @@ unsigned int aq_nic_map_skb(struct aq_nic_s *self, struct sk_buff *skb,
        }
 
        dx_buff->len = skb_headlen(skb);
-       dx_buff->pa = dma_map_single(aq_nic_get_dev(self),
+       dx_buff->pa = dma_map_single(dev,
                                     skb->data,
                                     dx_buff->len,
                                     DMA_TO_DEVICE);
 
-       if (unlikely(dma_mapping_error(aq_nic_get_dev(self), dx_buff->pa))) {
+       if (unlikely(dma_mapping_error(dev, dx_buff->pa))) {
                ret = 0;
                goto exit;
        }
@@ -611,13 +649,13 @@ unsigned int aq_nic_map_skb(struct aq_nic_s *self, struct sk_buff *skb,
                        else
                                buff_size = frag_len;
 
-                       frag_pa = skb_frag_dma_map(aq_nic_get_dev(self),
+                       frag_pa = skb_frag_dma_map(dev,
                                                   frag,
                                                   buff_offset,
                                                   buff_size,
                                                   DMA_TO_DEVICE);
 
-                       if (unlikely(dma_mapping_error(aq_nic_get_dev(self),
+                       if (unlikely(dma_mapping_error(dev,
                                                       frag_pa)))
                                goto mapping_error;
 
@@ -651,12 +689,12 @@ mapping_error:
                if (!(dx_buff->is_gso_tcp || dx_buff->is_gso_udp) &&
                    !dx_buff->is_vlan && dx_buff->pa) {
                        if (unlikely(dx_buff->is_sop)) {
-                               dma_unmap_single(aq_nic_get_dev(self),
+                               dma_unmap_single(dev,
                                                 dx_buff->pa,
                                                 dx_buff->len,
                                                 DMA_TO_DEVICE);
                        } else {
-                               dma_unmap_page(aq_nic_get_dev(self),
+                               dma_unmap_page(dev,
                                               dx_buff->pa,
                                               dx_buff->len,
                                               DMA_TO_DEVICE);
@@ -670,15 +708,16 @@ exit:
 
 int aq_nic_xmit(struct aq_nic_s *self, struct sk_buff *skb)
 {
-       unsigned int vec = skb->queue_mapping % self->aq_nic_cfg.vecs;
+       struct aq_nic_cfg_s *cfg = aq_nic_get_cfg(self);
+       unsigned int vec = skb->queue_mapping % cfg->vecs;
+       unsigned int tc = skb->queue_mapping / cfg->vecs;
        struct aq_ring_s *ring = NULL;
        unsigned int frags = 0U;
        int err = NETDEV_TX_OK;
-       unsigned int tc = 0U;
 
        frags = skb_shinfo(skb)->nr_frags + 1;
 
-       ring = self->aq_ring_tx[AQ_NIC_TCVEC2RING(self, tc, vec)];
+       ring = self->aq_ring_tx[AQ_NIC_CFG_TCVEC2RING(cfg, tc, vec)];
 
        if (frags > AQ_CFG_SKB_FRAGS_MAX) {
                dev_kfree_skb_any(skb);
@@ -687,13 +726,14 @@ int aq_nic_xmit(struct aq_nic_s *self, struct sk_buff *skb)
 
        aq_ring_update_queue_state(ring);
 
-       if (self->aq_nic_cfg.priv_flags & BIT(AQ_HW_LOOPBACK_DMA_NET)) {
+       if (cfg->priv_flags & BIT(AQ_HW_LOOPBACK_DMA_NET)) {
                err = NETDEV_TX_BUSY;
                goto err_exit;
        }
 
        /* Above status update may stop the queue. Check this. */
-       if (__netif_subqueue_stopped(self->ndev, ring->idx)) {
+       if (__netif_subqueue_stopped(self->ndev,
+                                    AQ_NIC_RING2QMAP(self, ring->idx))) {
                err = NETDEV_TX_BUSY;
                goto err_exit;
        }
@@ -823,6 +863,7 @@ u64 *aq_nic_get_stats(struct aq_nic_s *self, u64 *data)
        struct aq_stats_s *stats;
        unsigned int count = 0U;
        unsigned int i = 0U;
+       unsigned int tc;
 
        if (self->aq_fw_ops->update_stats) {
                mutex_lock(&self->fwreq_mutex);
@@ -861,10 +902,13 @@ u64 *aq_nic_get_stats(struct aq_nic_s *self, u64 *data)
 
        data += i;
 
-       for (i = 0U, aq_vec = self->aq_vec[0];
-               aq_vec && self->aq_vecs > i; ++i, aq_vec = self->aq_vec[i]) {
-               data += count;
-               aq_vec_get_sw_stats(aq_vec, data, &count);
+       for (tc = 0U; tc < self->aq_nic_cfg.tcs; tc++) {
+               for (i = 0U, aq_vec = self->aq_vec[0];
+                    aq_vec && self->aq_vecs > i;
+                    ++i, aq_vec = self->aq_vec[i]) {
+                       data += count;
+                       aq_vec_get_sw_stats(aq_vec, tc, data, &count);
+               }
        }
 
        data += count;
@@ -1145,9 +1189,11 @@ void aq_nic_deinit(struct aq_nic_s *self, bool link_down)
        if (!self)
                goto err_exit;
 
-       for (i = 0U, aq_vec = self->aq_vec[0];
-               self->aq_vecs > i; ++i, aq_vec = self->aq_vec[i])
+       for (i = 0U; i < self->aq_vecs; i++) {
+               aq_vec = self->aq_vec[i];
                aq_vec_deinit(aq_vec);
+               aq_vec_ring_free(aq_vec);
+       }
 
        aq_ptp_unregister(self);
        aq_ptp_ring_deinit(self);
@@ -1180,6 +1226,22 @@ void aq_nic_free_vectors(struct aq_nic_s *self)
 err_exit:;
 }
 
+int aq_nic_realloc_vectors(struct aq_nic_s *self)
+{
+       struct aq_nic_cfg_s *cfg = aq_nic_get_cfg(self);
+
+       aq_nic_free_vectors(self);
+
+       for (self->aq_vecs = 0; self->aq_vecs < cfg->vecs; self->aq_vecs++) {
+               self->aq_vec[self->aq_vecs] = aq_vec_alloc(self, self->aq_vecs,
+                                                          cfg);
+               if (unlikely(!self->aq_vec[self->aq_vecs]))
+                       return -ENOMEM;
+       }
+
+       return 0;
+}
+
 void aq_nic_shutdown(struct aq_nic_s *self)
 {
        int err = 0;
@@ -1245,3 +1307,98 @@ void aq_nic_release_filter(struct aq_nic_s *self, enum aq_rx_filter_type type,
                break;
        }
 }
+
+int aq_nic_setup_tc_mqprio(struct aq_nic_s *self, u32 tcs, u8 *prio_tc_map)
+{
+       struct aq_nic_cfg_s *cfg = &self->aq_nic_cfg;
+       const unsigned int prev_vecs = cfg->vecs;
+       bool ndev_running;
+       int err = 0;
+       int i;
+
+       /* if already the same configuration or
+        * disable request (tcs is 0) and we are already disabled
+        */
+       if (tcs == cfg->tcs || (tcs == 0 && !cfg->is_qos))
+               return 0;
+
+       ndev_running = netif_running(self->ndev);
+       if (ndev_running)
+               dev_close(self->ndev);
+
+       cfg->tcs = tcs;
+       if (cfg->tcs == 0)
+               cfg->tcs = 1;
+       if (prio_tc_map)
+               memcpy(cfg->prio_tc_map, prio_tc_map, sizeof(cfg->prio_tc_map));
+       else
+               for (i = 0; i < sizeof(cfg->prio_tc_map); i++)
+                       cfg->prio_tc_map[i] = cfg->tcs * i / 8;
+
+       cfg->is_qos = (tcs != 0 ? true : false);
+       cfg->is_ptp = (cfg->tcs <= AQ_HW_PTP_TC);
+       if (!cfg->is_ptp)
+               netdev_warn(self->ndev, "%s\n",
+                           "PTP is auto disabled due to requested TC count.");
+
+       netdev_set_num_tc(self->ndev, cfg->tcs);
+
+       /* Changing the number of TCs might change the number of vectors */
+       aq_nic_cfg_update_num_vecs(self);
+       if (prev_vecs != cfg->vecs) {
+               err = aq_nic_realloc_vectors(self);
+               if (err)
+                       goto err_exit;
+       }
+
+       if (ndev_running)
+               err = dev_open(self->ndev, NULL);
+
+err_exit:
+       return err;
+}
+
+int aq_nic_setup_tc_max_rate(struct aq_nic_s *self, const unsigned int tc,
+                            const u32 max_rate)
+{
+       struct aq_nic_cfg_s *cfg = &self->aq_nic_cfg;
+
+       if (tc >= AQ_CFG_TCS_MAX)
+               return -EINVAL;
+
+       if (max_rate && max_rate < 10) {
+               netdev_warn(self->ndev,
+                       "Setting %s to the minimum usable value of %dMbps.\n",
+                       "max rate", 10);
+               cfg->tc_max_rate[tc] = 10;
+       } else {
+               cfg->tc_max_rate[tc] = max_rate;
+       }
+
+       return 0;
+}
+
+int aq_nic_setup_tc_min_rate(struct aq_nic_s *self, const unsigned int tc,
+                            const u32 min_rate)
+{
+       struct aq_nic_cfg_s *cfg = &self->aq_nic_cfg;
+
+       if (tc >= AQ_CFG_TCS_MAX)
+               return -EINVAL;
+
+       if (min_rate)
+               set_bit(tc, &cfg->tc_min_rate_msk);
+       else
+               clear_bit(tc, &cfg->tc_min_rate_msk);
+
+       if (min_rate && min_rate < 20) {
+               netdev_warn(self->ndev,
+                       "Setting %s to the minimum usable value of %dMbps.\n",
+                       "min rate", 20);
+               cfg->tc_min_rate[tc] = 20;
+       } else {
+               cfg->tc_min_rate[tc] = min_rate;
+       }
+
+       return 0;
+}
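
Two small calculations carry most of the new aq_nic.c logic: aq_nic_cfg_update_num_vecs() rounds the vector count down to a power of two and picks AQ_TC_MODE_8TCS when vecs <= 4 (AQ_TC_MODE_4TCS otherwise), and the default priority map spreads the eight 802.1p priorities evenly with prio_tc_map[i] = tcs * i / 8. A standalone sketch of both, with illustrative values:

/* Sketch of the TC-mode choice and the default 802.1p priority spread. */
#include <stdio.h>

int main(void)
{
	unsigned int vecs = 8;			/* after rounddown_pow_of_two() */
	const char *tc_mode = (vecs <= 4) ? "8TCS" : "4TCS";
	unsigned int tcs = 4;
	unsigned char prio_tc_map[8];

	for (int i = 0; i < 8; i++)
		prio_tc_map[i] = tcs * i / 8;	/* 0 0 1 1 2 2 3 3 for tcs=4 */

	printf("vecs=%u -> tc_mode=%s, prio->tc map:", vecs, tc_mode);
	for (int i = 0; i < 8; i++)
		printf(" %u", prio_tc_map[i]);
	printf("\n");
	return 0;
}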
index 0663b8d0220df9da6be4d3012f33dea909bb64fd..2ab003065e6245379f40b9c50a922ed8e27fd921 100644 (file)
@@ -59,8 +59,15 @@ struct aq_nic_cfg_s {
        bool is_polling;
        bool is_rss;
        bool is_lro;
+       bool is_qos;
+       bool is_ptp;
+       enum aq_tc_mode tc_mode;
        u32 priv_flags;
        u8  tcs;
+       u8 prio_tc_map[8];
+       u32 tc_max_rate[AQ_CFG_TCS_MAX];
+       unsigned long tc_min_rate_msk;
+       u32 tc_min_rate[AQ_CFG_TCS_MAX];
        struct aq_rss_parameters aq_rss;
        u32 eee_speeds;
 };
@@ -77,8 +84,16 @@ struct aq_nic_cfg_s {
 #define AQ_NIC_WOL_MODES        (WAKE_MAGIC |\
                                 WAKE_PHY)
 
-#define AQ_NIC_TCVEC2RING(_NIC_, _TC_, _VEC_) \
-       ((_TC_) * AQ_CFG_TCS_MAX + (_VEC_))
+#define AQ_NIC_CFG_RING_PER_TC(_NIC_CFG_) \
+       (((_NIC_CFG_)->tc_mode == AQ_TC_MODE_4TCS) ? 8 : 4)
+
+#define AQ_NIC_CFG_TCVEC2RING(_NIC_CFG_, _TC_, _VEC_) \
+       ((_TC_) * AQ_NIC_CFG_RING_PER_TC(_NIC_CFG_) + (_VEC_))
+
+#define AQ_NIC_RING2QMAP(_NIC_, _ID_) \
+       ((_ID_) / AQ_NIC_CFG_RING_PER_TC(&(_NIC_)->aq_nic_cfg) * \
+               (_NIC_)->aq_vecs + \
+       ((_ID_) % AQ_NIC_CFG_RING_PER_TC(&(_NIC_)->aq_nic_cfg)))
 
 struct aq_hw_rx_fl2 {
        struct aq_rx_filter_vlan aq_vlans[AQ_VLAN_MAX_FILTERS];
@@ -104,7 +119,7 @@ struct aq_nic_s {
        atomic_t flags;
        u32 msg_enable;
        struct aq_vec_s *aq_vec[AQ_CFG_VECS_MAX];
-       struct aq_ring_s *aq_ring_tx[AQ_CFG_VECS_MAX * AQ_CFG_TCS_MAX];
+       struct aq_ring_s *aq_ring_tx[AQ_HW_QUEUES_MAX];
        struct aq_hw_s *aq_hw;
        struct net_device *ndev;
        unsigned int aq_vecs;
@@ -164,6 +179,7 @@ void aq_nic_deinit(struct aq_nic_s *self, bool link_down);
 void aq_nic_set_power(struct aq_nic_s *self);
 void aq_nic_free_hot_resources(struct aq_nic_s *self);
 void aq_nic_free_vectors(struct aq_nic_s *self);
+int aq_nic_realloc_vectors(struct aq_nic_s *self);
 int aq_nic_set_mtu(struct aq_nic_s *self, int new_mtu);
 int aq_nic_set_mac(struct aq_nic_s *self, struct net_device *ndev);
 int aq_nic_set_packet_filter(struct aq_nic_s *self, unsigned int flags);
@@ -181,4 +197,9 @@ void aq_nic_shutdown(struct aq_nic_s *self);
 u8 aq_nic_reserve_filter(struct aq_nic_s *self, enum aq_rx_filter_type type);
 void aq_nic_release_filter(struct aq_nic_s *self, enum aq_rx_filter_type type,
                           u32 location);
+int aq_nic_setup_tc_mqprio(struct aq_nic_s *self, u32 tcs, u8 *prio_tc_map);
+int aq_nic_setup_tc_max_rate(struct aq_nic_s *self, const unsigned int tc,
+                            const u32 max_rate);
+int aq_nic_setup_tc_min_rate(struct aq_nic_s *self, const unsigned int tc,
+                            const u32 min_rate);
 #endif /* AQ_NIC_H */
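
The new AQ_NIC_CFG_TCVEC2RING and AQ_NIC_RING2QMAP macros translate between a (tc, vector) pair, the hardware ring index, and the linear netdev queue numbering used by netif_set_real_num_tx_queues(). A standalone sketch of the round trip in 4-TC mode (8 rings per TC, 4 active vectors), using plain structs in place of the driver types:

/* Standalone copy of the two mapping macros, showing how (tc, vec) maps to
 * a hardware ring and back to a netdev queue index.
 */
#include <stdio.h>

struct cfg { int ring_per_tc; };
struct nic { struct cfg cfg; int aq_vecs; };

#define CFG_TCVEC2RING(c, tc, vec)  ((tc) * (c)->ring_per_tc + (vec))
#define RING2QMAP(n, id) \
	((id) / (n)->cfg.ring_per_tc * (n)->aq_vecs + \
	 (id) % (n)->cfg.ring_per_tc)

int main(void)
{
	/* 4-TC mode: 8 rings per TC, 4 vectors actually in use */
	struct nic nic = { .cfg = { .ring_per_tc = 8 }, .aq_vecs = 4 };

	for (int tc = 0; tc < 4; tc++)
		for (int vec = 0; vec < nic.aq_vecs; vec++) {
			int ring = CFG_TCVEC2RING(&nic.cfg, tc, vec);
			int q = RING2QMAP(&nic, ring);

			printf("tc=%d vec=%d -> ring=%2d -> queue=%2d\n",
			       tc, vec, ring, q);
		}
	return 0;
}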
index d10fff8a8c7176a9c38d61e3e2287d53edf8f1f9..41c0f560f95b4dba1bd60ab06f1970575a57a5d3 100644 (file)
@@ -431,6 +431,9 @@ static int atl_resume_common(struct device *dev, bool deep)
        netif_tx_start_all_queues(nic->ndev);
 
 err_exit:
+       if (ret < 0)
+               aq_nic_deinit(nic, true);
+
        rtnl_unlock();
 
        return ret;
index 58e8c641e8b3e40fc61925a77028c8ee0312a385..599ced261b2a4d136be7f5f462bbf735a8d77fd9 100644 (file)
@@ -945,26 +945,29 @@ void aq_ptp_ring_deinit(struct aq_nic_s *aq_nic)
 #define PTP_4TC_RING_IDX            16
 #define PTP_HWST_RING_IDX           31
 
+/* Index must be 8 (8 TCs) or 16 (4 TCs).
+ * It depends on Traffic Class mode.
+ */
+static unsigned int ptp_ring_idx(const enum aq_tc_mode tc_mode)
+{
+       if (tc_mode == AQ_TC_MODE_8TCS)
+               return PTP_8TC_RING_IDX;
+
+       return PTP_4TC_RING_IDX;
+}
+
 int aq_ptp_ring_alloc(struct aq_nic_s *aq_nic)
 {
        struct aq_ptp_s *aq_ptp = aq_nic->aq_ptp;
        unsigned int tx_ring_idx, rx_ring_idx;
        struct aq_ring_s *hwts;
-       u32 tx_tc_mode, rx_tc_mode;
        struct aq_ring_s *ring;
        int err;
 
        if (!aq_ptp)
                return 0;
 
-       /* Index must to be 8 (8 TCs) or 16 (4 TCs).
-        * It depends from Traffic Class mode.
-        */
-       aq_nic->aq_hw_ops->hw_tx_tc_mode_get(aq_nic->aq_hw, &tx_tc_mode);
-       if (tx_tc_mode == 0)
-               tx_ring_idx = PTP_8TC_RING_IDX;
-       else
-               tx_ring_idx = PTP_4TC_RING_IDX;
+       tx_ring_idx = ptp_ring_idx(aq_nic->aq_nic_cfg.tc_mode);
 
        ring = aq_ring_tx_alloc(&aq_ptp->ptp_tx, aq_nic,
                                tx_ring_idx, &aq_nic->aq_nic_cfg);
@@ -973,11 +976,7 @@ int aq_ptp_ring_alloc(struct aq_nic_s *aq_nic)
                goto err_exit;
        }
 
-       aq_nic->aq_hw_ops->hw_rx_tc_mode_get(aq_nic->aq_hw, &rx_tc_mode);
-       if (rx_tc_mode == 0)
-               rx_ring_idx = PTP_8TC_RING_IDX;
-       else
-               rx_ring_idx = PTP_4TC_RING_IDX;
+       rx_ring_idx = ptp_ring_idx(aq_nic->aq_nic_cfg.tc_mode);
 
        ring = aq_ring_rx_alloc(&aq_ptp->ptp_rx, aq_nic,
                                rx_ring_idx, &aq_nic->aq_nic_cfg);
index bae95a61856081afc83e36c43fd1b1bc59634d7a..68fdb39940888d83abf79d6391dd26382dc9bb79 100644 (file)
@@ -232,8 +232,11 @@ void aq_ring_queue_wake(struct aq_ring_s *ring)
 {
        struct net_device *ndev = aq_nic_get_ndev(ring->aq_nic);
 
-       if (__netif_subqueue_stopped(ndev, ring->idx)) {
-               netif_wake_subqueue(ndev, ring->idx);
+       if (__netif_subqueue_stopped(ndev,
+                                    AQ_NIC_RING2QMAP(ring->aq_nic,
+                                                     ring->idx))) {
+               netif_wake_subqueue(ndev,
+                                   AQ_NIC_RING2QMAP(ring->aq_nic, ring->idx));
                ring->stats.tx.queue_restarts++;
        }
 }
@@ -242,8 +245,11 @@ void aq_ring_queue_stop(struct aq_ring_s *ring)
 {
        struct net_device *ndev = aq_nic_get_ndev(ring->aq_nic);
 
-       if (!__netif_subqueue_stopped(ndev, ring->idx))
-               netif_stop_subqueue(ndev, ring->idx);
+       if (!__netif_subqueue_stopped(ndev,
+                                     AQ_NIC_RING2QMAP(ring->aq_nic,
+                                                      ring->idx)))
+               netif_stop_subqueue(ndev,
+                                   AQ_NIC_RING2QMAP(ring->aq_nic, ring->idx));
 }
 
 bool aq_ring_tx_clean(struct aq_ring_s *self)
@@ -466,7 +472,10 @@ int aq_ring_rx_clean(struct aq_ring_s *self,
                             buff->is_hash_l4 ? PKT_HASH_TYPE_L4 :
                             PKT_HASH_TYPE_NONE);
                /* Send all PTP traffic to 0 queue */
-               skb_record_rx_queue(skb, is_ptp_ring ? 0 : self->idx);
+               skb_record_rx_queue(skb,
+                                   is_ptp_ring ? 0
+                                               : AQ_NIC_RING2QMAP(self->aq_nic,
+                                                                  self->idx));
 
                ++self->stats.rx.packets;
                self->stats.rx.bytes += skb->len;
index f40a427970dcee2234dc5bfbdfe912ebf81ab023..d1d43c8ce400bae2dd5e847505e4142dbdc680e8 100644 (file)
@@ -103,16 +103,11 @@ err_exit:
 struct aq_vec_s *aq_vec_alloc(struct aq_nic_s *aq_nic, unsigned int idx,
                              struct aq_nic_cfg_s *aq_nic_cfg)
 {
-       struct aq_ring_s *ring = NULL;
        struct aq_vec_s *self = NULL;
-       unsigned int i = 0U;
-       int err = 0;
 
        self = kzalloc(sizeof(*self), GFP_KERNEL);
-       if (!self) {
-               err = -ENOMEM;
+       if (!self)
                goto err_exit;
-       }
 
        self->aq_nic = aq_nic;
        self->aq_ring_param.vec_idx = idx;
@@ -128,10 +123,20 @@ struct aq_vec_s *aq_vec_alloc(struct aq_nic_s *aq_nic, unsigned int idx,
        netif_napi_add(aq_nic_get_ndev(aq_nic), &self->napi,
                       aq_vec_poll, AQ_CFG_NAPI_WEIGHT);
 
+err_exit:
+       return self;
+}
+
+int aq_vec_ring_alloc(struct aq_vec_s *self, struct aq_nic_s *aq_nic,
+                     unsigned int idx, struct aq_nic_cfg_s *aq_nic_cfg)
+{
+       struct aq_ring_s *ring = NULL;
+       unsigned int i = 0U;
+       int err = 0;
+
        for (i = 0; i < aq_nic_cfg->tcs; ++i) {
-               unsigned int idx_ring = AQ_NIC_TCVEC2RING(self->nic,
-                                               self->tx_rings,
-                                               self->aq_ring_param.vec_idx);
+               const unsigned int idx_ring = AQ_NIC_CFG_TCVEC2RING(aq_nic_cfg,
+                                                                   i, idx);
 
                ring = aq_ring_tx_alloc(&self->ring[i][AQ_VEC_TX_ID], aq_nic,
                                        idx_ring, aq_nic_cfg);
@@ -156,11 +161,11 @@ struct aq_vec_s *aq_vec_alloc(struct aq_nic_s *aq_nic, unsigned int idx,
 
 err_exit:
        if (err < 0) {
-               aq_vec_free(self);
+               aq_vec_ring_free(self);
                self = NULL;
        }
 
-       return self;
+       return err;
 }
 
 int aq_vec_init(struct aq_vec_s *self, const struct aq_hw_ops *aq_hw_ops,
@@ -269,6 +274,18 @@ err_exit:;
 }
 
 void aq_vec_free(struct aq_vec_s *self)
+{
+       if (!self)
+               goto err_exit;
+
+       netif_napi_del(&self->napi);
+
+       kfree(self);
+
+err_exit:;
+}
+
+void aq_vec_ring_free(struct aq_vec_s *self)
 {
        struct aq_ring_s *ring = NULL;
        unsigned int i = 0U;
@@ -279,13 +296,12 @@ void aq_vec_free(struct aq_vec_s *self)
        for (i = 0U, ring = self->ring[0];
                self->tx_rings > i; ++i, ring = self->ring[i]) {
                aq_ring_free(&ring[AQ_VEC_TX_ID]);
-               aq_ring_free(&ring[AQ_VEC_RX_ID]);
+               if (i < self->rx_rings)
+                       aq_ring_free(&ring[AQ_VEC_RX_ID]);
        }
 
-       netif_napi_del(&self->napi);
-
-       kfree(self);
-
+       self->tx_rings = 0;
+       self->rx_rings = 0;
 err_exit:;
 }
 
@@ -333,16 +349,14 @@ cpumask_t *aq_vec_get_affinity_mask(struct aq_vec_s *self)
        return &self->aq_ring_param.affinity_mask;
 }
 
-void aq_vec_add_stats(struct aq_vec_s *self,
-                     struct aq_ring_stats_rx_s *stats_rx,
-                     struct aq_ring_stats_tx_s *stats_tx)
+static void aq_vec_add_stats(struct aq_vec_s *self,
+                            const unsigned int tc,
+                            struct aq_ring_stats_rx_s *stats_rx,
+                            struct aq_ring_stats_tx_s *stats_tx)
 {
-       struct aq_ring_s *ring = NULL;
-       unsigned int r = 0U;
+       struct aq_ring_s *ring = self->ring[tc];
 
-       for (r = 0U, ring = self->ring[0];
-               self->tx_rings > r; ++r, ring = self->ring[r]) {
-               struct aq_ring_stats_tx_s *tx = &ring[AQ_VEC_TX_ID].stats.tx;
+       if (tc < self->rx_rings) {
                struct aq_ring_stats_rx_s *rx = &ring[AQ_VEC_RX_ID].stats.rx;
 
                stats_rx->packets += rx->packets;
@@ -353,6 +367,10 @@ void aq_vec_add_stats(struct aq_vec_s *self,
                stats_rx->pg_losts += rx->pg_losts;
                stats_rx->pg_flips += rx->pg_flips;
                stats_rx->pg_reuses += rx->pg_reuses;
+       }
+
+       if (tc < self->tx_rings) {
+               struct aq_ring_stats_tx_s *tx = &ring[AQ_VEC_TX_ID].stats.tx;
 
                stats_tx->packets += tx->packets;
                stats_tx->bytes += tx->bytes;
@@ -361,7 +379,8 @@ void aq_vec_add_stats(struct aq_vec_s *self,
        }
 }
 
-int aq_vec_get_sw_stats(struct aq_vec_s *self, u64 *data, unsigned int *p_count)
+int aq_vec_get_sw_stats(struct aq_vec_s *self, const unsigned int tc, u64 *data,
+                       unsigned int *p_count)
 {
        struct aq_ring_stats_rx_s stats_rx;
        struct aq_ring_stats_tx_s stats_tx;
@@ -369,7 +388,8 @@ int aq_vec_get_sw_stats(struct aq_vec_s *self, u64 *data, unsigned int *p_count)
 
        memset(&stats_rx, 0U, sizeof(struct aq_ring_stats_rx_s));
        memset(&stats_tx, 0U, sizeof(struct aq_ring_stats_tx_s));
-       aq_vec_add_stats(self, &stats_rx, &stats_tx);
+
+       aq_vec_add_stats(self, tc, &stats_rx, &stats_tx);
 
        /* This data should mimic aq_ethtool_queue_stat_names structure
         */
index 0fe8e0904c7fb359c2901b219df14b2f6e69eba2..541af85e6510a697b59a9b02403e4bca9a5bb1a1 100644 (file)
@@ -25,17 +25,17 @@ irqreturn_t aq_vec_isr(int irq, void *private);
 irqreturn_t aq_vec_isr_legacy(int irq, void *private);
 struct aq_vec_s *aq_vec_alloc(struct aq_nic_s *aq_nic, unsigned int idx,
                              struct aq_nic_cfg_s *aq_nic_cfg);
+int aq_vec_ring_alloc(struct aq_vec_s *self, struct aq_nic_s *aq_nic,
+                     unsigned int idx, struct aq_nic_cfg_s *aq_nic_cfg);
 int aq_vec_init(struct aq_vec_s *self, const struct aq_hw_ops *aq_hw_ops,
                struct aq_hw_s *aq_hw);
 void aq_vec_deinit(struct aq_vec_s *self);
 void aq_vec_free(struct aq_vec_s *self);
+void aq_vec_ring_free(struct aq_vec_s *self);
 int aq_vec_start(struct aq_vec_s *self);
 void aq_vec_stop(struct aq_vec_s *self);
 cpumask_t *aq_vec_get_affinity_mask(struct aq_vec_s *self);
-int aq_vec_get_sw_stats(struct aq_vec_s *self, u64 *data,
+int aq_vec_get_sw_stats(struct aq_vec_s *self, const unsigned int tc, u64 *data,
                        unsigned int *p_count);
-void aq_vec_add_stats(struct aq_vec_s *self,
-                     struct aq_ring_stats_rx_s *stats_rx,
-                     struct aq_ring_stats_tx_s *stats_tx);
 
 #endif /* AQ_VEC_H */
index 1b0670a8ae33a8d4b8e84cabd7f40b84b3006d86..a312864969afe0fba5182413089881fb0125a4d2 100644 (file)
@@ -21,7 +21,7 @@
        .msix_irqs = 4U,                  \
        .irq_mask = ~0U,                  \
        .vecs = HW_ATL_A0_RSS_MAX,        \
-       .tcs = HW_ATL_A0_TC_MAX,          \
+       .tcs_max = HW_ATL_A0_TC_MAX,      \
        .rxd_alignment = 1U,              \
        .rxd_size = HW_ATL_A0_RXD_SIZE,   \
        .rxds_max = HW_ATL_A0_MAX_RXD,    \
@@ -136,10 +136,10 @@ static int hw_atl_a0_hw_qos_set(struct aq_hw_s *self)
        hw_atl_tps_tx_pkt_shed_desc_tc_arb_mode_set(self, 0U);
        hw_atl_tps_tx_pkt_shed_data_arb_mode_set(self, 0U);
 
-       hw_atl_tps_tx_pkt_shed_tc_data_max_credit_set(self, 0xFFF, 0U);
-       hw_atl_tps_tx_pkt_shed_tc_data_weight_set(self, 0x64, 0U);
-       hw_atl_tps_tx_pkt_shed_desc_tc_max_credit_set(self, 0x50, 0U);
-       hw_atl_tps_tx_pkt_shed_desc_tc_weight_set(self, 0x1E, 0U);
+       hw_atl_tps_tx_pkt_shed_tc_data_max_credit_set(self, 0U, 0xFFF);
+       hw_atl_tps_tx_pkt_shed_tc_data_weight_set(self, 0U, 0x64);
+       hw_atl_tps_tx_pkt_shed_desc_tc_max_credit_set(self, 0U, 0x50);
+       hw_atl_tps_tx_pkt_shed_desc_tc_weight_set(self, 0U, 0x1E);
 
        /* Tx buf size */
        buff_size = HW_ATL_A0_TXBUF_MAX;
index fa3cd7e9954bac3fd7f22a50fdaf20915bc5f2ab..14d79f70cad779ecf924f052ef5b4d967d1eb518 100644 (file)
@@ -23,7 +23,7 @@
        .msix_irqs = 8U,                  \
        .irq_mask = ~0U,                  \
        .vecs = HW_ATL_B0_RSS_MAX,        \
-       .tcs = HW_ATL_B0_TC_MAX,          \
+       .tcs_max = HW_ATL_B0_TC_MAX,      \
        .rxd_alignment = 1U,              \
        .rxd_size = HW_ATL_B0_RXD_SIZE,   \
        .rxds_max = HW_ATL_B0_MAX_RXD,    \
@@ -46,7 +46,8 @@
                        NETIF_F_HW_VLAN_CTAG_RX |     \
                        NETIF_F_HW_VLAN_CTAG_TX |     \
                        NETIF_F_GSO_UDP_L4      |     \
-                       NETIF_F_GSO_PARTIAL,          \
+                       NETIF_F_GSO_PARTIAL |         \
+                       NETIF_F_HW_TC,                \
        .hw_priv_flags = IFF_UNICAST_FLT, \
        .flow_control = true,             \
        .mtu = HW_ATL_B0_MTU_JUMBO,       \
@@ -114,12 +115,34 @@ static int hw_atl_b0_set_fc(struct aq_hw_s *self, u32 fc, u32 tc)
        return 0;
 }
 
+static int hw_atl_b0_tc_ptp_set(struct aq_hw_s *self)
+{
+       /* Init TC2 for PTP_TX */
+       hw_atl_tpb_tx_pkt_buff_size_per_tc_set(self, HW_ATL_B0_PTP_TXBUF_SIZE,
+                                              AQ_HW_PTP_TC);
+
+       /* Init TC2 for PTP_RX */
+       hw_atl_rpb_rx_pkt_buff_size_per_tc_set(self, HW_ATL_B0_PTP_RXBUF_SIZE,
+                                              AQ_HW_PTP_TC);
+       /* No flow control for PTP */
+       hw_atl_rpb_rx_xoff_en_per_tc_set(self, 0U, AQ_HW_PTP_TC);
+
+       return aq_hw_err_from_flags(self);
+}
+
 static int hw_atl_b0_hw_qos_set(struct aq_hw_s *self)
 {
-       unsigned int i_priority = 0U;
-       u32 buff_size = 0U;
+       struct aq_nic_cfg_s *cfg = self->aq_nic_cfg;
+       u32 tx_buff_size = HW_ATL_B0_TXBUF_MAX;
+       u32 rx_buff_size = HW_ATL_B0_RXBUF_MAX;
+       unsigned int prio = 0U;
        u32 tc = 0U;
 
+       if (cfg->is_ptp) {
+               tx_buff_size -= HW_ATL_B0_PTP_TXBUF_SIZE;
+               rx_buff_size -= HW_ATL_B0_PTP_RXBUF_SIZE;
+       }
+
        /* TPS Descriptor rate init */
        hw_atl_tps_tx_pkt_shed_desc_rate_curr_time_res_set(self, 0x0U);
        hw_atl_tps_tx_pkt_shed_desc_rate_lim_set(self, 0xA);
@@ -127,63 +150,39 @@ static int hw_atl_b0_hw_qos_set(struct aq_hw_s *self)
        /* TPS VM init */
        hw_atl_tps_tx_pkt_shed_desc_vm_arb_mode_set(self, 0U);
 
-       /* TPS TC credits init */
-       hw_atl_tps_tx_pkt_shed_desc_tc_arb_mode_set(self, 0U);
-       hw_atl_tps_tx_pkt_shed_data_arb_mode_set(self, 0U);
-
-       tc = 0;
-
-       /* TX Packet Scheduler Data TC0 */
-       hw_atl_tps_tx_pkt_shed_tc_data_max_credit_set(self, 0xFFF, tc);
-       hw_atl_tps_tx_pkt_shed_tc_data_weight_set(self, 0x64, tc);
-       hw_atl_tps_tx_pkt_shed_desc_tc_max_credit_set(self, 0x50, tc);
-       hw_atl_tps_tx_pkt_shed_desc_tc_weight_set(self, 0x1E, tc);
-
-       /* Tx buf size TC0 */
-       buff_size = HW_ATL_B0_TXBUF_MAX - HW_ATL_B0_PTP_TXBUF_SIZE;
-
-       hw_atl_tpb_tx_pkt_buff_size_per_tc_set(self, buff_size, tc);
-       hw_atl_tpb_tx_buff_hi_threshold_per_tc_set(self,
-                                                  (buff_size *
-                                                  (1024 / 32U) * 66U) /
-                                                  100U, tc);
-       hw_atl_tpb_tx_buff_lo_threshold_per_tc_set(self,
-                                                  (buff_size *
-                                                  (1024 / 32U) * 50U) /
-                                                  100U, tc);
-       /* Init TC2 for PTP_TX */
-       tc = 2;
+       tx_buff_size /= cfg->tcs;
+       rx_buff_size /= cfg->tcs;
+       for (tc = 0; tc < cfg->tcs; tc++) {
+               u32 threshold = 0U;
 
-       hw_atl_tpb_tx_pkt_buff_size_per_tc_set(self, HW_ATL_B0_PTP_TXBUF_SIZE,
-                                              tc);
+               /* Tx buf size TC0 */
+               hw_atl_tpb_tx_pkt_buff_size_per_tc_set(self, tx_buff_size, tc);
 
-       /* QoS Rx buf size per TC */
-       tc = 0;
-       buff_size = HW_ATL_B0_RXBUF_MAX - HW_ATL_B0_PTP_RXBUF_SIZE;
+               threshold = (tx_buff_size * (1024 / 32U) * 66U) / 100U;
+               hw_atl_tpb_tx_buff_hi_threshold_per_tc_set(self, threshold, tc);
 
-       hw_atl_rpb_rx_pkt_buff_size_per_tc_set(self, buff_size, tc);
-       hw_atl_rpb_rx_buff_hi_threshold_per_tc_set(self,
-                                                  (buff_size *
-                                                  (1024U / 32U) * 66U) /
-                                                  100U, tc);
-       hw_atl_rpb_rx_buff_lo_threshold_per_tc_set(self,
-                                                  (buff_size *
-                                                  (1024U / 32U) * 50U) /
-                                                  100U, tc);
+               threshold = (tx_buff_size * (1024 / 32U) * 50U) / 100U;
+               hw_atl_tpb_tx_buff_lo_threshold_per_tc_set(self, threshold, tc);
 
-       hw_atl_b0_set_fc(self, self->aq_nic_cfg->fc.req, tc);
+               /* QoS Rx buf size per TC */
+               hw_atl_rpb_rx_pkt_buff_size_per_tc_set(self, rx_buff_size, tc);
 
-       /* Init TC2 for PTP_RX */
-       tc = 2;
+               threshold = (rx_buff_size * (1024U / 32U) * 66U) / 100U;
+               hw_atl_rpb_rx_buff_hi_threshold_per_tc_set(self, threshold, tc);
 
-       hw_atl_rpb_rx_pkt_buff_size_per_tc_set(self, HW_ATL_B0_PTP_RXBUF_SIZE,
-                                              tc);
-       /* No flow control for PTP */
-       hw_atl_rpb_rx_xoff_en_per_tc_set(self, 0U, tc);
+               threshold = (rx_buff_size * (1024U / 32U) * 50U) / 100U;
+               hw_atl_rpb_rx_buff_lo_threshold_per_tc_set(self, threshold, tc);
+
+               hw_atl_b0_set_fc(self, self->aq_nic_cfg->fc.req, tc);
+       }
+
+       if (cfg->is_ptp)
+               hw_atl_b0_tc_ptp_set(self);
 
        /* QoS 802.1p priority -> TC mapping */
-       for (i_priority = 8U; i_priority--;)
-               hw_atl_rpf_rpb_user_priority_tc_map_set(self, i_priority, 0U);
+       for (prio = 0; prio < 8; ++prio)
+               hw_atl_rpf_rpb_user_priority_tc_map_set(self, prio,
+                                                       cfg->prio_tc_map[prio]);
 
        return aq_hw_err_from_flags(self);
 }
@@ -311,10 +310,124 @@ int hw_atl_b0_hw_offload_set(struct aq_hw_s *self,
        return aq_hw_err_from_flags(self);
 }
 
+static int hw_atl_b0_hw_init_tx_tc_rate_limit(struct aq_hw_s *self)
+{
+       static const u32 max_weight = BIT(HW_ATL_TPS_DATA_TCTWEIGHT_WIDTH) - 1;
+       /* Scale factor is based on the number of bits in fractional portion */
+       static const u32 scale = BIT(HW_ATL_TPS_DESC_RATE_Y_WIDTH);
+       static const u32 frac_msk = HW_ATL_TPS_DESC_RATE_Y_MSK >>
+                                   HW_ATL_TPS_DESC_RATE_Y_SHIFT;
+       const u32 link_speed = self->aq_link_status.mbps;
+       struct aq_nic_cfg_s *nic_cfg = self->aq_nic_cfg;
+       unsigned long num_min_rated_tcs = 0;
+       u32 tc_weight[AQ_CFG_TCS_MAX];
+       u32 fixed_max_credit;
+       u8 min_rate_msk = 0;
+       u32 sum_weight = 0;
+       int tc;
+
+       /* By default max_credit is based upon MTU (in unit of 64b) */
+       fixed_max_credit = nic_cfg->aq_hw_caps->mtu / 64;
+
+       if (link_speed) {
+               min_rate_msk = nic_cfg->tc_min_rate_msk &
+                              (BIT(nic_cfg->tcs) - 1);
+               num_min_rated_tcs = hweight8(min_rate_msk);
+       }
+
+       /* First, calculate weights where min_rate is specified */
+       if (num_min_rated_tcs) {
+               for (tc = 0; tc != nic_cfg->tcs; tc++) {
+                       if (!nic_cfg->tc_min_rate[tc]) {
+                               tc_weight[tc] = 0;
+                               continue;
+                       }
+
+                       tc_weight[tc] = (-1L + link_speed +
+                                        nic_cfg->tc_min_rate[tc] *
+                                        max_weight) /
+                                       link_speed;
+                       tc_weight[tc] = min(tc_weight[tc], max_weight);
+                       sum_weight += tc_weight[tc];
+               }
+       }
+
+       /* WSP, if min_rate is set for at least one TC.
+        * RR otherwise.
+        */
+       hw_atl_tps_tx_pkt_shed_data_arb_mode_set(self, min_rate_msk ? 1U : 0U);
+       /* Data TC Arbiter takes precedence over Descriptor TC Arbiter,
+        * so leave the Descriptor TC Arbiter as RR.
+        */
+       hw_atl_tps_tx_pkt_shed_desc_tc_arb_mode_set(self, 0U);
+
+       hw_atl_tps_tx_desc_rate_mode_set(self, nic_cfg->is_qos ? 1U : 0U);
+
+       for (tc = 0; tc != nic_cfg->tcs; tc++) {
+               const u32 en = (nic_cfg->tc_max_rate[tc] != 0) ? 1U : 0U;
+               const u32 desc = AQ_NIC_CFG_TCVEC2RING(nic_cfg, tc, 0);
+               u32 weight, max_credit;
+
+               hw_atl_tps_tx_pkt_shed_desc_tc_max_credit_set(self, tc,
+                                                             fixed_max_credit);
+               hw_atl_tps_tx_pkt_shed_desc_tc_weight_set(self, tc, 0x1E);
+
+               if (num_min_rated_tcs) {
+                       weight = tc_weight[tc];
+
+                       if (!weight && sum_weight < max_weight)
+                               weight = (max_weight - sum_weight) /
+                                        (nic_cfg->tcs - num_min_rated_tcs);
+                       else if (!weight)
+                               weight = 0x64;
+
+                       max_credit = max(8 * weight, fixed_max_credit);
+               } else {
+                       weight = 0x64;
+                       max_credit = 0xFFF;
+               }
+
+               hw_atl_tps_tx_pkt_shed_tc_data_weight_set(self, tc, weight);
+               hw_atl_tps_tx_pkt_shed_tc_data_max_credit_set(self, tc,
+                                                             max_credit);
+
+               hw_atl_tps_tx_desc_rate_en_set(self, desc, en);
+
+               if (en) {
+                       /* Nominal rate is always 10G */
+                       const u32 rate = 10000U * scale /
+                                        nic_cfg->tc_max_rate[tc];
+                       const u32 rate_int = rate >>
+                                            HW_ATL_TPS_DESC_RATE_Y_WIDTH;
+                       const u32 rate_frac = rate & frac_msk;
+
+                       hw_atl_tps_tx_desc_rate_x_set(self, desc, rate_int);
+                       hw_atl_tps_tx_desc_rate_y_set(self, desc, rate_frac);
+               } else {
+                       /* A value of 1 indicates the queue is not
+                        * rate controlled.
+                        */
+                       hw_atl_tps_tx_desc_rate_x_set(self, desc, 1U);
+                       hw_atl_tps_tx_desc_rate_y_set(self, desc, 0U);
+               }
+       }
+       for (tc = nic_cfg->tcs; tc != AQ_CFG_TCS_MAX; tc++) {
+               const u32 desc = AQ_NIC_CFG_TCVEC2RING(nic_cfg, tc, 0);
+
+               hw_atl_tps_tx_desc_rate_en_set(self, desc, 0U);
+               hw_atl_tps_tx_desc_rate_x_set(self, desc, 1U);
+               hw_atl_tps_tx_desc_rate_y_set(self, desc, 0U);
+       }
+
+       return aq_hw_err_from_flags(self);
+}
+
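
Two calculations drive the new rate-limit helper: when any TC has a minimum rate, the arbiter weight is the ceiling of min_rate * max_weight / link_speed, clamped to the register width; when a TC has a maximum rate, the limiter divides the nominal 10G rate by the cap and programs it as a fixed-point value with 14 fractional bits (rate_x/rate_y). A small self-contained sketch of both computations follows; the 9-bit B0 weight width and the example rates are assumptions for illustration (the ATL2 field is widened from 9 to 15 bits later in this diff, and the 14 fractional bits come from the desc{r}_rate_y definitions below).

#include <stdio.h>

int main(void)
{
	const unsigned int max_weight = (1U << 9) - 1;   /* assumed B0 weight width */
	const unsigned int scale = 1U << 14;             /* desc rate fractional bits */
	const unsigned int frac_msk = scale - 1;
	unsigned int link_speed = 10000;   /* Mbps, illustrative          */
	unsigned int tc_min_rate = 2500;   /* Mbps guaranteed for this TC */
	unsigned int tc_max_rate = 4000;   /* Mbps cap for this TC        */

	/* Min-rate path: ceiling of min_rate * max_weight / link_speed,
	 * written in the driver as (-1 + link_speed + min_rate * max_weight)
	 * / link_speed, then clamped to max_weight.
	 */
	unsigned int weight = (link_speed - 1 + tc_min_rate * max_weight) /
			      link_speed;
	if (weight > max_weight)
		weight = max_weight;

	/* Max-rate path: nominal 10G divided by the cap, split into integer
	 * (rate_x) and fractional (rate_y) parts.
	 */
	unsigned int rate = 10000U * scale / tc_max_rate;
	unsigned int rate_int = rate >> 14;
	unsigned int rate_frac = rate & frac_msk;

	printf("weight=%u/%u rate_x=%u rate_y=%u\n",
	       weight, max_weight, rate_int, rate_frac);
	return 0;
}
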
 static int hw_atl_b0_hw_init_tx_path(struct aq_hw_s *self)
 {
+       struct aq_nic_cfg_s *nic_cfg = self->aq_nic_cfg;
+
        /* Tx TC/Queue number config */
-       hw_atl_tpb_tps_tx_tc_mode_set(self, 1U);
+       hw_atl_tpb_tps_tx_tc_mode_set(self, nic_cfg->tc_mode);
 
        hw_atl_thm_lso_tcp_flag_of_first_pkt_set(self, 0x0FF6U);
        hw_atl_thm_lso_tcp_flag_of_middle_pkt_set(self, 0x0FF6U);
@@ -334,20 +447,32 @@ static int hw_atl_b0_hw_init_tx_path(struct aq_hw_s *self)
        return aq_hw_err_from_flags(self);
 }
 
+void hw_atl_b0_hw_init_rx_rss_ctrl1(struct aq_hw_s *self)
+{
+       struct aq_nic_cfg_s *cfg = self->aq_nic_cfg;
+       u32 rss_ctrl1 = HW_ATL_RSS_DISABLED;
+
+       if (cfg->is_rss)
+               rss_ctrl1 = (cfg->tc_mode == AQ_TC_MODE_8TCS) ?
+                           HW_ATL_RSS_ENABLED_8TCS_2INDEX_BITS :
+                           HW_ATL_RSS_ENABLED_4TCS_3INDEX_BITS;
+
+       hw_atl_reg_rx_flr_rss_control1set(self, rss_ctrl1);
+}
+
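
The helper picks one of the rss_ctrl1 constants added later in this diff depending on the TC mode; the constant names suggest 2 RSS index bits per TC in 8-TC mode (4 queues each) and 3 index bits in 4-TC mode (8 queues each). A minimal sketch of the selection, with local stand-ins for the driver's enum and constants:

#include <stdio.h>

enum tc_mode { TC_MODE_8TCS, TC_MODE_4TCS };   /* stand-in for AQ_TC_MODE_* */

#define RSS_DISABLED                 0x00000000U
#define RSS_ENABLED_8TCS_2INDEX_BITS 0xA2222222U
#define RSS_ENABLED_4TCS_3INDEX_BITS 0x80003333U

static unsigned int pick_rss_ctrl1(int is_rss, enum tc_mode mode)
{
	if (!is_rss)
		return RSS_DISABLED;
	/* 8 TCs -> 2 RSS index bits (4 queues per TC),
	 * 4 TCs -> 3 RSS index bits (8 queues per TC).
	 */
	return mode == TC_MODE_8TCS ? RSS_ENABLED_8TCS_2INDEX_BITS :
				      RSS_ENABLED_4TCS_3INDEX_BITS;
}

int main(void)
{
	printf("0x%08X\n", pick_rss_ctrl1(1, TC_MODE_4TCS));
	return 0;
}
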
 static int hw_atl_b0_hw_init_rx_path(struct aq_hw_s *self)
 {
        struct aq_nic_cfg_s *cfg = self->aq_nic_cfg;
        int i;
 
        /* Rx TC/RSS number config */
-       hw_atl_rpb_rpf_rx_traf_class_mode_set(self, 1U);
+       hw_atl_rpb_rpf_rx_traf_class_mode_set(self, cfg->tc_mode);
 
        /* Rx flow control */
        hw_atl_rpb_rx_flow_ctl_mode_set(self, 1U);
 
        /* RSS Ring selection */
-       hw_atl_reg_rx_flr_rss_control1set(self, cfg->is_rss ?
-                                       0xB3333333U : 0x00000000U);
+       hw_atl_b0_hw_init_rx_rss_ctrl1(self);
 
        /* Multicast filters */
        for (i = HW_ATL_B0_MAC_MAX; i--;) {
@@ -1078,18 +1203,6 @@ int hw_atl_b0_hw_ring_rx_stop(struct aq_hw_s *self, struct aq_ring_s *ring)
        return aq_hw_err_from_flags(self);
 }
 
-static int hw_atl_b0_tx_tc_mode_get(struct aq_hw_s *self, u32 *tc_mode)
-{
-       *tc_mode = hw_atl_tpb_tps_tx_tc_mode_get(self);
-       return aq_hw_err_from_flags(self);
-}
-
-static int hw_atl_b0_rx_tc_mode_get(struct aq_hw_s *self, u32 *tc_mode)
-{
-       *tc_mode = hw_atl_rpb_rpf_rx_traf_class_mode_get(self);
-       return aq_hw_err_from_flags(self);
-}
-
 #define get_ptp_ts_val_u64(self, indx) \
        ((u64)(hw_atl_pcs_ptp_clock_get(self, indx) & 0xffff))
 
@@ -1503,13 +1616,11 @@ const struct aq_hw_ops hw_atl_ops_b0 = {
        .hw_interrupt_moderation_set = hw_atl_b0_hw_interrupt_moderation_set,
        .hw_rss_set                  = hw_atl_b0_hw_rss_set,
        .hw_rss_hash_set             = hw_atl_b0_hw_rss_hash_set,
+       .hw_tc_rate_limit_set        = hw_atl_b0_hw_init_tx_tc_rate_limit,
        .hw_get_regs                 = hw_atl_utils_hw_get_regs,
        .hw_get_hw_stats             = hw_atl_utils_get_hw_stats,
        .hw_get_fw_version           = hw_atl_utils_get_fw_version,
 
-       .hw_tx_tc_mode_get       = hw_atl_b0_tx_tc_mode_get,
-       .hw_rx_tc_mode_get       = hw_atl_b0_rx_tc_mode_get,
-
        .hw_ring_hwts_rx_fill        = hw_atl_b0_hw_ring_hwts_rx_fill,
        .hw_ring_hwts_rx_receive     = hw_atl_b0_hw_ring_hwts_rx_receive,
 
index b855459272caa1f28ed95b3f96aa4b74a23f160c..30f468f2084d5ffb48a1a3eba53f6f4a8c81e8b6 100644 (file)
@@ -58,6 +58,8 @@ int hw_atl_b0_hw_ring_tx_head_update(struct aq_hw_s *self,
 int hw_atl_b0_hw_ring_tx_stop(struct aq_hw_s *self, struct aq_ring_s *ring);
 int hw_atl_b0_hw_ring_rx_stop(struct aq_hw_s *self, struct aq_ring_s *ring);
 
+void hw_atl_b0_hw_init_rx_rss_ctrl1(struct aq_hw_s *self);
+
 int hw_atl_b0_hw_mac_addr_set(struct aq_hw_s *self, u8 *mac_addr);
 
 int hw_atl_b0_hw_start(struct aq_hw_s *self);
index 7ab23a1751d37dd7cfb3027de826dc05171263fe..cf460d61a45e117b82f5d4835e923c3a8db69d81 100644 (file)
@@ -75,7 +75,7 @@
 #define HW_ATL_B0_RSS_HASHKEY_BITS 320U
 
 #define HW_ATL_B0_TCRSS_4_8  1
-#define HW_ATL_B0_TC_MAX 1U
+#define HW_ATL_B0_TC_MAX 8U
 #define HW_ATL_B0_RSS_MAX 8U
 
 #define HW_ATL_B0_LRO_RXD_MAX 16U
 #define HW_ATL_B0_MAX_RXD 8184U
 #define HW_ATL_B0_MAX_TXD 8184U
 
+#define HW_ATL_RSS_DISABLED 0x00000000U
+#define HW_ATL_RSS_ENABLED_8TCS_2INDEX_BITS 0xA2222222U
+#define HW_ATL_RSS_ENABLED_4TCS_3INDEX_BITS 0x80003333U
+
 /* HW layer capabilities */
 
 #endif /* HW_ATL_B0_INTERNAL_H */
index 9e2d01a6aac8293e205ae7d0f3cb2a6b66028c6f..3c8e8047ea1ed0ca6cca95fbfa29b1c9953f9db0 100644 (file)
@@ -754,7 +754,7 @@ void hw_atl_rpfl2_accept_all_mc_packets_set(struct aq_hw_s *aq_hw,
 }
 
 void hw_atl_rpf_rpb_user_priority_tc_map_set(struct aq_hw_s *aq_hw,
-                                            u32 user_priority_tc_map, u32 tc)
+                                            u32 user_priority, u32 tc)
 {
 /* register address for bitfield rx_tc_up{t}[2:0] */
        static u32 rpf_rpb_rx_tc_upt_adr[8] = {
@@ -773,10 +773,9 @@ void hw_atl_rpf_rpb_user_priority_tc_map_set(struct aq_hw_s *aq_hw,
                        0U, 4U, 8U, 12U, 16U, 20U, 24U, 28U
                };
 
-       aq_hw_write_reg_bit(aq_hw, rpf_rpb_rx_tc_upt_adr[tc],
-                           rpf_rpb_rx_tc_upt_msk[tc],
-                           rpf_rpb_rx_tc_upt_shft[tc],
-                           user_priority_tc_map);
+       aq_hw_write_reg_bit(aq_hw, rpf_rpb_rx_tc_upt_adr[user_priority],
+                           rpf_rpb_rx_tc_upt_msk[user_priority],
+                           rpf_rpb_rx_tc_upt_shft[user_priority], tc);
 }
 
 void hw_atl_rpf_rss_key_addr_set(struct aq_hw_s *aq_hw, u32 rss_key_addr)
@@ -1464,8 +1463,8 @@ void hw_atl_tps_tx_pkt_shed_desc_tc_arb_mode_set(struct aq_hw_s *aq_hw,
 }
 
 void hw_atl_tps_tx_pkt_shed_desc_tc_max_credit_set(struct aq_hw_s *aq_hw,
-                                                  u32 max_credit,
-                                                  u32 tc)
+                                                  const u32 tc,
+                                                  const u32 max_credit)
 {
        aq_hw_write_reg_bit(aq_hw, HW_ATL_TPS_DESC_TCTCREDIT_MAX_ADR(tc),
                            HW_ATL_TPS_DESC_TCTCREDIT_MAX_MSK,
@@ -1474,13 +1473,13 @@ void hw_atl_tps_tx_pkt_shed_desc_tc_max_credit_set(struct aq_hw_s *aq_hw,
 }
 
 void hw_atl_tps_tx_pkt_shed_desc_tc_weight_set(struct aq_hw_s *aq_hw,
-                                              u32 tx_pkt_shed_desc_tc_weight,
-                                              u32 tc)
+                                              const u32 tc,
+                                              const u32 weight)
 {
        aq_hw_write_reg_bit(aq_hw, HW_ATL_TPS_DESC_TCTWEIGHT_ADR(tc),
                            HW_ATL_TPS_DESC_TCTWEIGHT_MSK,
                            HW_ATL_TPS_DESC_TCTWEIGHT_SHIFT,
-                           tx_pkt_shed_desc_tc_weight);
+                           weight);
 }
 
 void hw_atl_tps_tx_pkt_shed_desc_vm_arb_mode_set(struct aq_hw_s *aq_hw,
@@ -1493,8 +1492,8 @@ void hw_atl_tps_tx_pkt_shed_desc_vm_arb_mode_set(struct aq_hw_s *aq_hw,
 }
 
 void hw_atl_tps_tx_pkt_shed_tc_data_max_credit_set(struct aq_hw_s *aq_hw,
-                                                  u32 max_credit,
-                                                  u32 tc)
+                                                  const u32 tc,
+                                                  const u32 max_credit)
 {
        aq_hw_write_reg_bit(aq_hw, HW_ATL_TPS_DATA_TCTCREDIT_MAX_ADR(tc),
                            HW_ATL_TPS_DATA_TCTCREDIT_MAX_MSK,
@@ -1503,13 +1502,49 @@ void hw_atl_tps_tx_pkt_shed_tc_data_max_credit_set(struct aq_hw_s *aq_hw,
 }
 
 void hw_atl_tps_tx_pkt_shed_tc_data_weight_set(struct aq_hw_s *aq_hw,
-                                              u32 tx_pkt_shed_tc_data_weight,
-                                              u32 tc)
+                                              const u32 tc,
+                                              const u32 weight)
 {
        aq_hw_write_reg_bit(aq_hw, HW_ATL_TPS_DATA_TCTWEIGHT_ADR(tc),
                            HW_ATL_TPS_DATA_TCTWEIGHT_MSK,
                            HW_ATL_TPS_DATA_TCTWEIGHT_SHIFT,
-                           tx_pkt_shed_tc_data_weight);
+                           weight);
+}
+
+void hw_atl_tps_tx_desc_rate_mode_set(struct aq_hw_s *aq_hw,
+                                     const u32 rate_mode)
+{
+       aq_hw_write_reg_bit(aq_hw, HW_ATL_TPS_TX_DESC_RATE_MODE_ADR,
+                           HW_ATL_TPS_TX_DESC_RATE_MODE_MSK,
+                           HW_ATL_TPS_TX_DESC_RATE_MODE_SHIFT,
+                           rate_mode);
+}
+
+void hw_atl_tps_tx_desc_rate_en_set(struct aq_hw_s *aq_hw, const u32 desc,
+                                   const u32 enable)
+{
+       aq_hw_write_reg_bit(aq_hw, HW_ATL_TPS_DESC_RATE_EN_ADR(desc),
+                           HW_ATL_TPS_DESC_RATE_EN_MSK,
+                           HW_ATL_TPS_DESC_RATE_EN_SHIFT,
+                           enable);
+}
+
+void hw_atl_tps_tx_desc_rate_x_set(struct aq_hw_s *aq_hw, const u32 desc,
+                                  const u32 rate_int)
+{
+       aq_hw_write_reg_bit(aq_hw, HW_ATL_TPS_DESC_RATE_X_ADR(desc),
+                           HW_ATL_TPS_DESC_RATE_X_MSK,
+                           HW_ATL_TPS_DESC_RATE_X_SHIFT,
+                           rate_int);
+}
+
+void hw_atl_tps_tx_desc_rate_y_set(struct aq_hw_s *aq_hw, const u32 desc,
+                                  const u32 rate_frac)
+{
+       aq_hw_write_reg_bit(aq_hw, HW_ATL_TPS_DESC_RATE_Y_ADR(desc),
+                           HW_ATL_TPS_DESC_RATE_Y_MSK,
+                           HW_ATL_TPS_DESC_RATE_Y_SHIFT,
+                           rate_frac);
 }
 
 /* tx */
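
All of the setters above follow the same convention: each bitfield is described by an address (ADR), a mask (MSK) and a shift (SHIFT), and a write updates only that field. The sketch below is a user-space approximation of that read-modify-write pattern, using the desc{r}_rate_y field values from this diff; the array stands in for MMIO, so this is illustrative rather than driver code.

#include <stdio.h>

static unsigned int regs[0x8000 / 4];   /* stand-in for the MMIO space */

static unsigned int read_reg(unsigned int addr)
{
	return regs[addr / 4];
}

static void write_reg(unsigned int addr, unsigned int val)
{
	regs[addr / 4] = val;
}

static void write_reg_bit(unsigned int addr, unsigned int msk,
			  unsigned int shift, unsigned int val)
{
	unsigned int v = read_reg(addr);

	v &= ~msk;                  /* clear the field  */
	v |= (val << shift) & msk;  /* insert new value */
	write_reg(addr, v);
}

/* Field parameters for desc{r}_rate_y, taken from later in this diff. */
#define RATE_Y_ADR(desc) (0x00007408U + (desc) * 0x10U)
#define RATE_Y_MSK       0x00003FFFU
#define RATE_Y_SHIFT     0U

int main(void)
{
	write_reg_bit(RATE_Y_ADR(0), RATE_Y_MSK, RATE_Y_SHIFT, 8192U);
	printf("reg[0x%04X] = 0x%08X\n", RATE_Y_ADR(0), read_reg(RATE_Y_ADR(0)));
	return 0;
}
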
index b88cb84805d5b81e064ab3a74a7beacaf6705224..61a6f70c51cd9b8c15ccd5d410bbd5dc0214fa7e 100644 (file)
@@ -688,13 +688,13 @@ void hw_atl_tps_tx_pkt_shed_desc_tc_arb_mode_set(struct aq_hw_s *aq_hw,
 
 /* set tx packet scheduler descriptor tc max credit */
 void hw_atl_tps_tx_pkt_shed_desc_tc_max_credit_set(struct aq_hw_s *aq_hw,
-                                                  u32 max_credit,
-                                           u32 tc);
+                                                  const u32 tc,
+                                                  const u32 max_credit);
 
 /* set tx packet scheduler descriptor tc weight */
 void hw_atl_tps_tx_pkt_shed_desc_tc_weight_set(struct aq_hw_s *aq_hw,
-                                              u32 tx_pkt_shed_desc_tc_weight,
-                                       u32 tc);
+                                              const u32 tc,
+                                              const u32 weight);
 
 /* set tx packet scheduler descriptor vm arbitration mode */
 void hw_atl_tps_tx_pkt_shed_desc_vm_arb_mode_set(struct aq_hw_s *aq_hw,
@@ -702,13 +702,29 @@ void hw_atl_tps_tx_pkt_shed_desc_vm_arb_mode_set(struct aq_hw_s *aq_hw,
 
 /* set tx packet scheduler tc data max credit */
 void hw_atl_tps_tx_pkt_shed_tc_data_max_credit_set(struct aq_hw_s *aq_hw,
-                                                  u32 max_credit,
-                                           u32 tc);
+                                                  const u32 tc,
+                                                  const u32 max_credit);
 
 /* set tx packet scheduler tc data weight */
 void hw_atl_tps_tx_pkt_shed_tc_data_weight_set(struct aq_hw_s *aq_hw,
-                                              u32 tx_pkt_shed_tc_data_weight,
-                                       u32 tc);
+                                              const u32 tc,
+                                              const u32 weight);
+
+/* set tx descriptor rate mode */
+void hw_atl_tps_tx_desc_rate_mode_set(struct aq_hw_s *aq_hw,
+                                     const u32 rate_mode);
+
+/* set tx packet scheduler descriptor rate enable */
+void hw_atl_tps_tx_desc_rate_en_set(struct aq_hw_s *aq_hw, const u32 desc,
+                                   const u32 enable);
+
+/* set tx packet scheduler descriptor rate integral value */
+void hw_atl_tps_tx_desc_rate_x_set(struct aq_hw_s *aq_hw, const u32 desc,
+                                  const u32 rate_int);
+
+/* set tx packet scheduler descriptor rate fractional value */
+void hw_atl_tps_tx_desc_rate_y_set(struct aq_hw_s *aq_hw, const u32 desc,
+                                  const u32 rate_frac);
 
 /* tx */
 
index 18de2f7b895938a439add1e01a42cea90e9904e5..06220792daf152fdfb0bb97b89f6f81704193c2f 100644 (file)
 /* default value of bitfield lso_tcp_flag_mid[b:0] */
 #define HW_ATL_THM_LSO_TCP_FLAG_MID_DEFAULT 0x0
 
+/* tx tx_tc_mode bitfield definitions
+ * preprocessor definitions for the bitfield "tx_tc_mode".
+ * port="pif_tpb_tx_tc_mode_i,pif_tps_tx_tc_mode_i"
+ */
+
+/* register address for bitfield tx_tc_mode */
+#define HW_ATL_TPB_TX_TC_MODE_ADDR 0x00007900
+/* bitmask for bitfield tx_tc_mode */
+#define HW_ATL_TPB_TX_TC_MODE_MSK 0x00000100
+/* inverted bitmask for bitfield tx_tc_mode */
+#define HW_ATL_TPB_TX_TC_MODE_MSKN 0xFFFFFEFF
+/* lower bit position of bitfield tx_tc_mode */
+#define HW_ATL_TPB_TX_TC_MODE_SHIFT 8
+/* width of bitfield tx_tc_mode */
+#define HW_ATL_TPB_TX_TC_MODE_WIDTH 1
+/* default value of bitfield tx_tc_mode */
+#define HW_ATL_TPB_TX_TC_MODE_DEFAULT 0x0
+
+/* tx tx_desc_rate_mode bitfield definitions
+ * preprocessor definitions for the bitfield "tx_desc_rate_mode".
+ * port="pif_tps_desc_rate_mode_i"
+ */
+
+/* register address for bitfield tx_desc_rate_mode */
+#define HW_ATL_TPS_TX_DESC_RATE_MODE_ADR 0x00007900
+/* bitmask for bitfield tx_desc_rate_mode */
+#define HW_ATL_TPS_TX_DESC_RATE_MODE_MSK 0x00000080
+/* inverted bitmask for bitfield tx_desc_rate_mode */
+#define HW_ATL_TPS_TX_DESC_RATE_MODE_MSKN 0xFFFFFF7F
+/* lower bit position of bitfield tx_desc_rate_mode */
+#define HW_ATL_TPS_TX_DESC_RATE_MODE_SHIFT 7
+/* width of bitfield tx_desc_rate_mode */
+#define HW_ATL_TPS_TX_DESC_RATE_MODE_WIDTH 1
+/* default value of bitfield tx_desc_rate_mode */
+#define HW_ATL_TPS_TX_DESC_RATE_MODE_DEFAULT 0x0
+
 /* tx tx_buf_en bitfield definitions
  * preprocessor definitions for the bitfield "tx_buf_en".
  * port="pif_tpb_tx_buf_en_i"
 /* default value of bitfield tx_buf_en */
 #define HW_ATL_TPB_TX_BUF_EN_DEFAULT 0x0
 
-/* register address for bitfield tx_tc_mode */
-#define HW_ATL_TPB_TX_TC_MODE_ADDR 0x00007900
-/* bitmask for bitfield tx_tc_mode */
-#define HW_ATL_TPB_TX_TC_MODE_MSK 0x00000100
-/* inverted bitmask for bitfield tx_tc_mode */
-#define HW_ATL_TPB_TX_TC_MODE_MSKN 0xFFFFFEFF
-/* lower bit position of bitfield tx_tc_mode */
-#define HW_ATL_TPB_TX_TC_MODE_SHIFT 8
-/* width of bitfield tx_tc_mode */
-#define HW_ATL_TPB_TX_TC_MODE_WIDTH 1
-/* default value of bitfield tx_tc_mode */
-#define HW_ATL_TPB_TX_TC_MODE_DEFAULT 0x0
-
 /* tx tx{b}_hi_thresh[c:0] bitfield definitions
  * preprocessor definitions for the bitfield "tx{b}_hi_thresh[c:0]".
  * parameter: buffer {b} | stride size 0x10 | range [0, 7]
 /* default value of bitfield data_tc_arb_mode */
 #define HW_ATL_TPS_DATA_TC_ARB_MODE_DEFAULT 0x0
 
+/* tx desc{r}_rate_en bitfield definitions
+ * preprocessor definitions for the bitfield "desc{r}_rate_en".
+ * port="pif_tps_desc_rate_en_i[0]"
+ */
+
+/* register address for bitfield desc{r}_rate_en */
+#define HW_ATL_TPS_DESC_RATE_EN_ADR(desc) (0x00007408 + (desc) * 0x10)
+/* bitmask for bitfield desc{r}_rate_en */
+#define HW_ATL_TPS_DESC_RATE_EN_MSK 0x80000000
+/* inverted bitmask for bitfield desc{r}_rate_en */
+#define HW_ATL_TPS_DESC_RATE_EN_MSKN 0x7FFFFFFF
+/* lower bit position of bitfield desc{r}_rate_en */
+#define HW_ATL_TPS_DESC_RATE_EN_SHIFT 31
+/* width of bitfield desc{r}_rate_en */
+#define HW_ATL_TPS_DESC_RATE_EN_WIDTH 1
+/* default value of bitfield desc{r}_rate_en */
+#define HW_ATL_TPS_DESC_RATE_EN_DEFAULT 0x0
+
+/* tx desc{r}_rate_x bitfield definitions
+ * preprocessor definitions for the bitfield "desc{r}_rate_x".
+ * port="pif_tps_desc0_rate_x"
+ */
+/* register address for bitfield desc{r}_rate_x */
+#define HW_ATL_TPS_DESC_RATE_X_ADR(desc) (0x00007408 + (desc) * 0x10)
+/* bitmask for bitfield desc{r}_rate_x */
+#define HW_ATL_TPS_DESC_RATE_X_MSK 0x03FF0000
+/* inverted bitmask for bitfield desc{r}_rate_x */
+#define HW_ATL_TPS_DESC_RATE_X_MSKN 0xFC00FFFF
+/* lower bit position of bitfield desc{r}_rate_x */
+#define HW_ATL_TPS_DESC_RATE_X_SHIFT 16
+/* width of bitfield desc{r}_rate_x */
+#define HW_ATL_TPS_DESC_RATE_X_WIDTH 10
+/* default value of bitfield desc{r}_rate_x */
+#define HW_ATL_TPS_DESC_RATE_X_DEFAULT 0x0
+
+/* tx desc{r}_rate_y bitfield definitions
+ * preprocessor definitions for the bitfield "desc{r}_rate_y".
+ * port="pif_tps_desc0_rate_y"
+ */
+/* register address for bitfield desc{r}_rate_y */
+#define HW_ATL_TPS_DESC_RATE_Y_ADR(desc) (0x00007408 + (desc) * 0x10)
+/* bitmask for bitfield desc{r}_rate_y */
+#define HW_ATL_TPS_DESC_RATE_Y_MSK 0x00003FFF
+/* inverted bitmask for bitfield desc{r}_rate_y */
+#define HW_ATL_TPS_DESC_RATE_Y_MSKN 0xFFFFC000
+/* lower bit position of bitfield desc{r}_rate_y */
+#define HW_ATL_TPS_DESC_RATE_Y_SHIFT 0
+/* width of bitfield desc{r}_rate_y */
+#define HW_ATL_TPS_DESC_RATE_Y_WIDTH 14
+/* default value of bitfield desc{r}_rate_y */
+#define HW_ATL_TPS_DESC_RATE_Y_DEFAULT 0x0
+
 /* tx desc_rate_ta_rst bitfield definitions
  * preprocessor definitions for the bitfield "desc_rate_ta_rst".
  * port="pif_tps_desc_rate_ta_rst_i"
index 6f2b33ae3d063b0eb7b32bcbd75de7926833a49b..8df9d4ef36f01555f0d4775ee4d4e90597a56eee 100644 (file)
@@ -10,6 +10,7 @@
 #include "hw_atl/hw_atl_b0.h"
 #include "hw_atl/hw_atl_utils.h"
 #include "hw_atl/hw_atl_llh.h"
+#include "hw_atl/hw_atl_llh_internal.h"
 #include "hw_atl2_utils.h"
 #include "hw_atl2_llh.h"
 #include "hw_atl2_internal.h"
@@ -23,7 +24,7 @@ static int hw_atl2_act_rslvr_table_set(struct aq_hw_s *self, u8 location,
        .msix_irqs = 8U,                  \
        .irq_mask = ~0U,                  \
        .vecs = HW_ATL2_RSS_MAX,          \
-       .tcs = HW_ATL2_TC_MAX,    \
+       .tcs_max = HW_ATL2_TC_MAX,        \
        .rxd_alignment = 1U,              \
        .rxd_size = HW_ATL2_RXD_SIZE,   \
        .rxds_max = HW_ATL2_MAX_RXD,    \
@@ -47,7 +48,8 @@ static int hw_atl2_act_rslvr_table_set(struct aq_hw_s *self, u8 location,
                        NETIF_F_HW_VLAN_CTAG_RX |     \
                        NETIF_F_HW_VLAN_CTAG_TX |     \
                        NETIF_F_GSO_UDP_L4      |     \
-                       NETIF_F_GSO_PARTIAL,          \
+                       NETIF_F_GSO_PARTIAL     |     \
+                       NETIF_F_HW_TC,                \
        .hw_priv_flags = IFF_UNICAST_FLT, \
        .flow_control = true,             \
        .mtu = HW_ATL2_MTU_JUMBO,         \
@@ -91,16 +93,49 @@ static int hw_atl2_hw_reset(struct aq_hw_s *self)
 
 static int hw_atl2_hw_queue_to_tc_map_set(struct aq_hw_s *self)
 {
-       if (!hw_atl_rpb_rpf_rx_traf_class_mode_get(self)) {
-               aq_hw_write_reg(self, HW_ATL2_RX_Q_TC_MAP_ADR(0), 0x11110000);
-               aq_hw_write_reg(self, HW_ATL2_RX_Q_TC_MAP_ADR(8), 0x33332222);
-               aq_hw_write_reg(self, HW_ATL2_RX_Q_TC_MAP_ADR(16), 0x55554444);
-               aq_hw_write_reg(self, HW_ATL2_RX_Q_TC_MAP_ADR(24), 0x77776666);
-       } else {
-               aq_hw_write_reg(self, HW_ATL2_RX_Q_TC_MAP_ADR(0), 0x00000000);
-               aq_hw_write_reg(self, HW_ATL2_RX_Q_TC_MAP_ADR(8), 0x11111111);
-               aq_hw_write_reg(self, HW_ATL2_RX_Q_TC_MAP_ADR(16), 0x22222222);
-               aq_hw_write_reg(self, HW_ATL2_RX_Q_TC_MAP_ADR(24), 0x33333333);
+       struct aq_nic_cfg_s *cfg = self->aq_nic_cfg;
+       unsigned int tcs, q_per_tc;
+       unsigned int tc, q;
+       u32 rx_map = 0;
+       u32 tx_map = 0;
+
+       hw_atl2_tpb_tx_tc_q_rand_map_en_set(self, 1U);
+
+       switch (cfg->tc_mode) {
+       case AQ_TC_MODE_8TCS:
+               tcs = 8;
+               q_per_tc = 4;
+               break;
+       case AQ_TC_MODE_4TCS:
+               tcs = 4;
+               q_per_tc = 8;
+               break;
+       default:
+               return -EINVAL;
+       }
+
+       for (tc = 0; tc != tcs; tc++) {
+               unsigned int tc_q_offset = tc * q_per_tc;
+
+               for (q = tc_q_offset; q != tc_q_offset + q_per_tc; q++) {
+                       rx_map |= tc << HW_ATL2_RX_Q_TC_MAP_SHIFT(q);
+                       if (HW_ATL2_RX_Q_TC_MAP_ADR(q) !=
+                           HW_ATL2_RX_Q_TC_MAP_ADR(q + 1)) {
+                               aq_hw_write_reg(self,
+                                               HW_ATL2_RX_Q_TC_MAP_ADR(q),
+                                               rx_map);
+                               rx_map = 0;
+                       }
+
+                       tx_map |= tc << HW_ATL2_TX_Q_TC_MAP_SHIFT(q);
+                       if (HW_ATL2_TX_Q_TC_MAP_ADR(q) !=
+                           HW_ATL2_TX_Q_TC_MAP_ADR(q + 1)) {
+                               aq_hw_write_reg(self,
+                                               HW_ATL2_TX_Q_TC_MAP_ADR(q),
+                                               tx_map);
+                               tx_map = 0;
+                       }
+               }
        }
 
        return aq_hw_err_from_flags(self);
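
The rewritten mapping loop accumulates one map slot per queue into a 32-bit word and flushes it as soon as the next queue would land in a different register. The sketch below reproduces that packing for the Tx side, using the 8-bit-per-queue stride of the HW_ATL2_TX_Q_TC_MAP_* macros added later in this diff; the 4-TC layout and the printed "writes" are illustrative.

#include <stdio.h>

/* Tx queue->TC map packing: 8 bits per queue, four queues per 32-bit
 * register, as described by the macros added later in this diff.
 */
#define TX_Q_TC_MAP_ADR(q)   (0x0000799CU + ((q) / 4U) * 4U)
#define TX_Q_TC_MAP_SHIFT(q) (((q) * 8U) % 32U)

int main(void)
{
	unsigned int tcs = 4, q_per_tc = 8;   /* AQ_TC_MODE_4TCS layout */
	unsigned int tx_map = 0;
	unsigned int tc, q;

	for (tc = 0; tc < tcs; tc++) {
		unsigned int base = tc * q_per_tc;

		for (q = base; q < base + q_per_tc; q++) {
			tx_map |= tc << TX_Q_TC_MAP_SHIFT(q);
			/* Flush once the next queue maps to a new register. */
			if (TX_Q_TC_MAP_ADR(q) != TX_Q_TC_MAP_ADR(q + 1)) {
				printf("reg 0x%04X <- 0x%08X\n",
				       TX_Q_TC_MAP_ADR(q), tx_map);
				tx_map = 0;
			}
		}
	}
	return 0;
}
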
@@ -112,7 +147,6 @@ static int hw_atl2_hw_qos_set(struct aq_hw_s *self)
        u32 tx_buff_size = HW_ATL2_TXBUF_MAX;
        u32 rx_buff_size = HW_ATL2_RXBUF_MAX;
        unsigned int prio = 0U;
-       u32 threshold = 0U;
        u32 tc = 0U;
 
        /* TPS Descriptor rate init */
@@ -122,42 +156,36 @@ static int hw_atl2_hw_qos_set(struct aq_hw_s *self)
        /* TPS VM init */
        hw_atl_tps_tx_pkt_shed_desc_vm_arb_mode_set(self, 0U);
 
-       /* TPS TC credits init */
-       hw_atl_tps_tx_pkt_shed_desc_tc_arb_mode_set(self, 0U);
-       hw_atl_tps_tx_pkt_shed_data_arb_mode_set(self, 0U);
+       tx_buff_size /= cfg->tcs;
+       rx_buff_size /= cfg->tcs;
+       for (tc = 0; tc < cfg->tcs; tc++) {
+               u32 threshold = 0U;
 
-       tc = 0;
+               /* Tx buf size TC0 */
+               hw_atl_tpb_tx_pkt_buff_size_per_tc_set(self, tx_buff_size, tc);
 
-       /* TX Packet Scheduler Data TC0 */
-       hw_atl2_tps_tx_pkt_shed_tc_data_max_credit_set(self, 0xFFF0, tc);
-       hw_atl2_tps_tx_pkt_shed_tc_data_weight_set(self, 0x640, tc);
-       hw_atl_tps_tx_pkt_shed_desc_tc_max_credit_set(self, 0x50, tc);
-       hw_atl_tps_tx_pkt_shed_desc_tc_weight_set(self, 0x1E, tc);
+               threshold = (tx_buff_size * (1024 / 32U) * 66U) / 100U;
+               hw_atl_tpb_tx_buff_hi_threshold_per_tc_set(self, threshold, tc);
 
-       /* Tx buf size TC0 */
-       hw_atl_tpb_tx_pkt_buff_size_per_tc_set(self, tx_buff_size, tc);
+               threshold = (tx_buff_size * (1024 / 32U) * 50U) / 100U;
+               hw_atl_tpb_tx_buff_lo_threshold_per_tc_set(self, threshold, tc);
 
-       threshold = (tx_buff_size * (1024 / 32U) * 66U) / 100U;
-       hw_atl_tpb_tx_buff_hi_threshold_per_tc_set(self, threshold, tc);
+               /* QoS Rx buf size per TC */
+               hw_atl_rpb_rx_pkt_buff_size_per_tc_set(self, rx_buff_size, tc);
 
-       threshold = (tx_buff_size * (1024 / 32U) * 50U) / 100U;
-       hw_atl_tpb_tx_buff_lo_threshold_per_tc_set(self, threshold, tc);
+               threshold = (rx_buff_size * (1024U / 32U) * 66U) / 100U;
+               hw_atl_rpb_rx_buff_hi_threshold_per_tc_set(self, threshold, tc);
 
-       /* QoS Rx buf size per TC */
-       hw_atl_rpb_rx_pkt_buff_size_per_tc_set(self, rx_buff_size, tc);
-
-       threshold = (rx_buff_size * (1024U / 32U) * 66U) / 100U;
-       hw_atl_rpb_rx_buff_hi_threshold_per_tc_set(self, threshold, tc);
-
-       threshold = (rx_buff_size * (1024U / 32U) * 50U) / 100U;
-       hw_atl_rpb_rx_buff_lo_threshold_per_tc_set(self, threshold, tc);
+               threshold = (rx_buff_size * (1024U / 32U) * 50U) / 100U;
+               hw_atl_rpb_rx_buff_lo_threshold_per_tc_set(self, threshold, tc);
+       }
 
        /* QoS 802.1p priority -> TC mapping */
        for (prio = 0; prio < 8; ++prio)
                hw_atl_rpf_rpb_user_priority_tc_map_set(self, prio,
-                                                       cfg->tcs * prio / 8);
+                                                       cfg->prio_tc_map[prio]);
 
-       /* ATL2 Apply legacy ring to TC mapping */
+       /* ATL2 Apply ring to TC mapping */
        hw_atl2_hw_queue_to_tc_map_set(self);
 
        return aq_hw_err_from_flags(self);
@@ -166,19 +194,149 @@ static int hw_atl2_hw_qos_set(struct aq_hw_s *self)
 static int hw_atl2_hw_rss_set(struct aq_hw_s *self,
                              struct aq_rss_parameters *rss_params)
 {
-       u8 *indirection_table = rss_params->indirection_table;
+       u8 *indirection_table = rss_params->indirection_table;
+       const u32 num_tcs = aq_hw_num_tcs(self);
+       u32 rpf_redir2_enable;
+       int tc;
        int i;
 
-       for (i = HW_ATL2_RSS_REDIRECTION_MAX; i--;)
-               hw_atl2_new_rpf_rss_redir_set(self, 0, i, indirection_table[i]);
+       rpf_redir2_enable = num_tcs > 4 ? 1 : 0;
+
+       hw_atl2_rpf_redirection_table2_select_set(self, rpf_redir2_enable);
+
+       for (i = HW_ATL2_RSS_REDIRECTION_MAX; i--;) {
+               for (tc = 0; tc != num_tcs; tc++) {
+                       hw_atl2_new_rpf_rss_redir_set(self, tc, i,
+                                                     tc *
+                                                     aq_hw_q_per_tc(self) +
+                                                     indirection_table[i]);
+               }
+       }
+
+       return aq_hw_err_from_flags(self);
+}
+
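
With per-TC redirection tables, each user-supplied indirection entry is replicated for every TC and offset by that TC's first queue, i.e. entry i of TC t resolves to queue t * q_per_tc + indirection_table[i]; redirection table 2 is additionally enabled when more than four TCs are in use. A tiny sketch of the remapping (table contents and queue layout are illustrative):

#include <stdio.h>

int main(void)
{
	unsigned char indirection_table[8] = { 0, 1, 2, 3, 0, 1, 2, 3 };
	unsigned int num_tcs = 4, q_per_tc = 8;   /* illustrative layout */
	unsigned int tc, i;

	for (tc = 0; tc < num_tcs; tc++)
		for (i = 0; i < 8; i++)
			printf("tc%u redir[%u] -> queue %u\n", tc, i,
			       tc * q_per_tc + indirection_table[i]);
	return 0;
}
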
+static int hw_atl2_hw_init_tx_tc_rate_limit(struct aq_hw_s *self)
+{
+       static const u32 max_weight = BIT(HW_ATL2_TPS_DATA_TCTWEIGHT_WIDTH) - 1;
+       /* Scale factor is based on the number of bits in fractional portion */
+       static const u32 scale = BIT(HW_ATL_TPS_DESC_RATE_Y_WIDTH);
+       static const u32 frac_msk = HW_ATL_TPS_DESC_RATE_Y_MSK >>
+                                   HW_ATL_TPS_DESC_RATE_Y_SHIFT;
+       const u32 link_speed = self->aq_link_status.mbps;
+       struct aq_nic_cfg_s *nic_cfg = self->aq_nic_cfg;
+       unsigned long num_min_rated_tcs = 0;
+       u32 tc_weight[AQ_CFG_TCS_MAX];
+       u32 fixed_max_credit_4b;
+       u32 fixed_max_credit;
+       u8 min_rate_msk = 0;
+       u32 sum_weight = 0;
+       int tc;
+
+       /* By default max_credit is based upon MTU (in unit of 64b) */
+       fixed_max_credit = nic_cfg->aq_hw_caps->mtu / 64;
+       /* in unit of 4b */
+       fixed_max_credit_4b = nic_cfg->aq_hw_caps->mtu / 4;
+
+       if (link_speed) {
+               min_rate_msk = nic_cfg->tc_min_rate_msk &
+                              (BIT(nic_cfg->tcs) - 1);
+               num_min_rated_tcs = hweight8(min_rate_msk);
+       }
+
+       /* First, calculate weights where min_rate is specified */
+       if (num_min_rated_tcs) {
+               for (tc = 0; tc != nic_cfg->tcs; tc++) {
+                       if (!nic_cfg->tc_min_rate[tc]) {
+                               tc_weight[tc] = 0;
+                               continue;
+                       }
+
+                       tc_weight[tc] = (-1L + link_speed +
+                                        nic_cfg->tc_min_rate[tc] *
+                                        max_weight) /
+                                       link_speed;
+                       tc_weight[tc] = min(tc_weight[tc], max_weight);
+                       sum_weight += tc_weight[tc];
+               }
+       }
+
+       /* WSP, if min_rate is set for at least one TC.
+        * RR otherwise.
+        */
+       hw_atl2_tps_tx_pkt_shed_data_arb_mode_set(self, min_rate_msk ? 1U : 0U);
+       /* Data TC Arbiter takes precedence over Descriptor TC Arbiter,
+        * so leave the Descriptor TC Arbiter as RR.
+        */
+       hw_atl_tps_tx_pkt_shed_desc_tc_arb_mode_set(self, 0U);
+
+       hw_atl_tps_tx_desc_rate_mode_set(self, nic_cfg->is_qos ? 1U : 0U);
+
+       for (tc = 0; tc != nic_cfg->tcs; tc++) {
+               const u32 en = (nic_cfg->tc_max_rate[tc] != 0) ? 1U : 0U;
+               const u32 desc = AQ_NIC_CFG_TCVEC2RING(nic_cfg, tc, 0);
+               u32 weight, max_credit;
+
+               hw_atl_tps_tx_pkt_shed_desc_tc_max_credit_set(self, tc,
+                                                             fixed_max_credit);
+               hw_atl_tps_tx_pkt_shed_desc_tc_weight_set(self, tc, 0x1E);
+
+               if (num_min_rated_tcs) {
+                       weight = tc_weight[tc];
+
+                       if (!weight && sum_weight < max_weight)
+                               weight = (max_weight - sum_weight) /
+                                        (nic_cfg->tcs - num_min_rated_tcs);
+                       else if (!weight)
+                               weight = 0x640;
+
+                       max_credit = max(2 * weight, fixed_max_credit_4b);
+               } else {
+                       weight = 0x640;
+                       max_credit = 0xFFF0;
+               }
+
+               hw_atl2_tps_tx_pkt_shed_tc_data_weight_set(self, tc, weight);
+               hw_atl2_tps_tx_pkt_shed_tc_data_max_credit_set(self, tc,
+                                                              max_credit);
+
+               hw_atl_tps_tx_desc_rate_en_set(self, desc, en);
+
+               if (en) {
+                       /* Nominal rate is always 10G */
+                       const u32 rate = 10000U * scale /
+                                        nic_cfg->tc_max_rate[tc];
+                       const u32 rate_int = rate >>
+                                            HW_ATL_TPS_DESC_RATE_Y_WIDTH;
+                       const u32 rate_frac = rate & frac_msk;
+
+                       hw_atl_tps_tx_desc_rate_x_set(self, desc, rate_int);
+                       hw_atl_tps_tx_desc_rate_y_set(self, desc, rate_frac);
+               } else {
+                       /* A value of 1 indicates the queue is not
+                        * rate controlled.
+                        */
+                       hw_atl_tps_tx_desc_rate_x_set(self, desc, 1U);
+                       hw_atl_tps_tx_desc_rate_y_set(self, desc, 0U);
+               }
+       }
+       for (tc = nic_cfg->tcs; tc != AQ_CFG_TCS_MAX; tc++) {
+               const u32 desc = AQ_NIC_CFG_TCVEC2RING(nic_cfg, tc, 0);
+
+               hw_atl_tps_tx_desc_rate_en_set(self, desc, 0U);
+               hw_atl_tps_tx_desc_rate_x_set(self, desc, 1U);
+               hw_atl_tps_tx_desc_rate_y_set(self, desc, 0U);
+       }
 
        return aq_hw_err_from_flags(self);
 }
 
 static int hw_atl2_hw_init_tx_path(struct aq_hw_s *self)
 {
+       struct aq_nic_cfg_s *nic_cfg = self->aq_nic_cfg;
+
        /* Tx TC/RSS number config */
-       hw_atl_tpb_tps_tx_tc_mode_set(self, 1U);
+       hw_atl_tpb_tps_tx_tc_mode_set(self, nic_cfg->tc_mode);
 
        hw_atl_thm_lso_tcp_flag_of_first_pkt_set(self, 0x0FF6U);
        hw_atl_thm_lso_tcp_flag_of_middle_pkt_set(self, 0x0FF6U);
@@ -201,13 +359,29 @@ static int hw_atl2_hw_init_tx_path(struct aq_hw_s *self)
 static void hw_atl2_hw_init_new_rx_filters(struct aq_hw_s *self)
 {
        struct hw_atl2_priv *priv = (struct hw_atl2_priv *)self->priv;
+       u8 *prio_tc_map = self->aq_nic_cfg->prio_tc_map;
+       u16 action;
        u8 index;
+       int i;
 
+       /* Action Resolver Table (ART) is used by RPF to decide which action
+        * to take with a packet based upon input tag and tag mask, where:
+        *  - input tag is a combination of 3-bit VLan Prio (PCP) and
+        *    29-bit concatenation of all tags from filter block;
+        *  - tag mask is a mask used for matching against input tag.
+        * The input_tag is compared with all the Requested_tags in the
+        * Record table to find a match. The Action field of the matching
+        * REC entry is used for further processing. If multiple entries
+        * match, the Action field of the lowest REC entry is selected.
+        */
        hw_atl2_rpf_act_rslvr_section_en_set(self, 0xFFFF);
        hw_atl2_rpfl2_uc_flr_tag_set(self, HW_ATL2_RPF_TAG_BASE_UC,
                                     HW_ATL2_MAC_UC);
        hw_atl2_rpfl2_bc_flr_tag_set(self, HW_ATL2_RPF_TAG_BASE_UC);
 
+       /* FW reserves the beginning of ART, thus all driver entries must
+        * start from the offset specified in FW caps.
+        */
        index = priv->art_base_index + HW_ATL2_RPF_L2_PROMISC_OFF_INDEX;
        hw_atl2_act_rslvr_table_set(self, index, 0,
                                    HW_ATL2_RPF_TAG_UC_MASK |
@@ -220,33 +394,17 @@ static void hw_atl2_hw_init_new_rx_filters(struct aq_hw_s *self)
                                        HW_ATL2_RPF_TAG_UNTAG_MASK,
                                    HW_ATL2_ACTION_DROP);
 
-       index = priv->art_base_index + HW_ATL2_RPF_VLAN_INDEX;
-       hw_atl2_act_rslvr_table_set(self, index, HW_ATL2_RPF_TAG_BASE_VLAN,
-                                   HW_ATL2_RPF_TAG_VLAN_MASK,
-                                   HW_ATL2_ACTION_ASSIGN_TC(0));
-
-       index = priv->art_base_index + HW_ATL2_RPF_MAC_INDEX;
-       hw_atl2_act_rslvr_table_set(self, index, HW_ATL2_RPF_TAG_BASE_UC,
-                                   HW_ATL2_RPF_TAG_UC_MASK,
-                                   HW_ATL2_ACTION_ASSIGN_TC(0));
-
-       index = priv->art_base_index + HW_ATL2_RPF_ALLMC_INDEX;
-       hw_atl2_act_rslvr_table_set(self, index, HW_ATL2_RPF_TAG_BASE_ALLMC,
-                                   HW_ATL2_RPF_TAG_ALLMC_MASK,
-                                   HW_ATL2_ACTION_ASSIGN_TC(0));
+       /* Configure ART to map given VLan Prio (PCP) to the TC index for
+        * RSS redirection table.
+        */
+       for (i = 0; i < 8; i++) {
+               action = HW_ATL2_ACTION_ASSIGN_TC(prio_tc_map[i]);
 
-       index = priv->art_base_index + HW_ATL2_RPF_UNTAG_INDEX;
-       hw_atl2_act_rslvr_table_set(self, index, HW_ATL2_RPF_TAG_UNTAG_MASK,
-                                   HW_ATL2_RPF_TAG_UNTAG_MASK,
-                                   HW_ATL2_ACTION_ASSIGN_TC(0));
-
-       index = priv->art_base_index + HW_ATL2_RPF_VLAN_PROMISC_ON_INDEX;
-       hw_atl2_act_rslvr_table_set(self, index, 0, HW_ATL2_RPF_TAG_VLAN_MASK,
-                                   HW_ATL2_ACTION_DISABLE);
-
-       index = priv->art_base_index + HW_ATL2_RPF_L2_PROMISC_ON_INDEX;
-       hw_atl2_act_rslvr_table_set(self, index, 0, HW_ATL2_RPF_TAG_UC_MASK,
-                                   HW_ATL2_ACTION_DISABLE);
+               index = priv->art_base_index + HW_ATL2_RPF_PCP_TO_TC_INDEX + i;
+               hw_atl2_act_rslvr_table_set(self, index,
+                                           i << HW_ATL2_RPF_TAG_PCP_OFFSET,
+                                           HW_ATL2_RPF_TAG_PCP_MASK, action);
+       }
 }
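
Each of the eight ART entries written here matches one PCP value in the input tag and assigns the TC configured in prio_tc_map for that priority. The rough illustration below shows how one entry's requested tag, mask and action could be composed; the PCP offset/mask and the action encoding are placeholders, since their real definitions (hw_atl2_internal.h) are not part of this excerpt.

#include <stdio.h>

/* Placeholder field layout: HW_ATL2_RPF_TAG_PCP_OFFSET/_MASK and
 * HW_ATL2_ACTION_ASSIGN_TC() are not shown in this excerpt, so the
 * values below are illustrative only.
 */
#define TAG_PCP_OFFSET 29U
#define TAG_PCP_MASK   (0x7U << TAG_PCP_OFFSET)
#define ACTION_ASSIGN_TC(tc) (0x80000000U | ((unsigned int)(tc) << 8))

int main(void)
{
	unsigned char prio_tc_map[8] = { 0, 0, 1, 1, 2, 2, 3, 3 }; /* example */
	unsigned int prio;

	for (prio = 0; prio < 8; prio++) {
		unsigned int tag    = prio << TAG_PCP_OFFSET; /* requested tag */
		unsigned int mask   = TAG_PCP_MASK;           /* match PCP     */
		unsigned int action = ACTION_ASSIGN_TC(prio_tc_map[prio]);

		printf("ART[%u]: tag=0x%08X mask=0x%08X action=0x%08X\n",
		       prio, tag, mask, action);
	}
	return 0;
}
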
 
 static void hw_atl2_hw_new_rx_filter_vlan_promisc(struct aq_hw_s *self,
@@ -309,7 +467,7 @@ static int hw_atl2_hw_init_rx_path(struct aq_hw_s *self)
        int i;
 
        /* Rx TC/RSS number config */
-       hw_atl_rpb_rpf_rx_traf_class_mode_set(self, 1U);
+       hw_atl_rpb_rpf_rx_traf_class_mode_set(self, cfg->tc_mode);
 
        /* Rx flow control */
        hw_atl_rpb_rx_flow_ctl_mode_set(self, 1U);
@@ -317,9 +475,7 @@ static int hw_atl2_hw_init_rx_path(struct aq_hw_s *self)
        hw_atl2_rpf_rss_hash_type_set(self, HW_ATL2_RPF_RSS_HASH_TYPE_ALL);
 
        /* RSS Ring selection */
-       hw_atl_reg_rx_flr_rss_control1set(self, cfg->is_rss ?
-                                               HW_ATL_RSS_ENABLED_3INDEX_BITS :
-                                               HW_ATL_RSS_DISABLED);
+       hw_atl_b0_hw_init_rx_rss_ctrl1(self);
 
        /* Multicast filters */
        for (i = HW_ATL2_MAC_MAX; i--;) {
@@ -678,6 +834,7 @@ const struct aq_hw_ops hw_atl2_ops = {
        .hw_interrupt_moderation_set = hw_atl2_hw_interrupt_moderation_set,
        .hw_rss_set                  = hw_atl2_hw_rss_set,
        .hw_rss_hash_set             = hw_atl_b0_hw_rss_hash_set,
+       .hw_tc_rate_limit_set        = hw_atl2_hw_init_tx_tc_rate_limit,
        .hw_get_hw_stats             = hw_atl2_utils_get_hw_stats,
        .hw_get_fw_version           = hw_atl2_utils_get_fw_version,
        .hw_set_offload              = hw_atl_b0_hw_offload_set,
index e66b3583bfe99df00cd0d2fc3ffe36bd0d4cb406..5a89bb8722f9269eec0d92aed05def908943572f 100644 (file)
@@ -31,7 +31,7 @@
 
 #define HW_ATL2_RSS_REDIRECTION_MAX 64U
 
-#define HW_ATL2_TC_MAX 1U
+#define HW_ATL2_TC_MAX 8U
 #define HW_ATL2_RSS_MAX 8U
 
 #define HW_ATL2_INTR_MODER_MAX  0x1FF
@@ -82,13 +82,6 @@ enum HW_ATL2_RPF_ART_INDEX {
        HW_ATL2_RPF_VLAN_USER_INDEX     = HW_ATL2_RPF_ET_PCP_USER_INDEX + 16,
        HW_ATL2_RPF_PCP_TO_TC_INDEX     = HW_ATL2_RPF_VLAN_USER_INDEX +
                                          HW_ATL_VLAN_MAX_FILTERS,
-       HW_ATL2_RPF_VLAN_INDEX          = HW_ATL2_RPF_PCP_TO_TC_INDEX +
-                                         AQ_CFG_TCS_MAX,
-       HW_ATL2_RPF_MAC_INDEX,
-       HW_ATL2_RPF_ALLMC_INDEX,
-       HW_ATL2_RPF_UNTAG_INDEX,
-       HW_ATL2_RPF_VLAN_PROMISC_ON_INDEX,
-       HW_ATL2_RPF_L2_PROMISC_ON_INDEX,
 };
 
 #define HW_ATL2_ACTION(ACTION, RSS, INDEX, VALID) \
@@ -124,9 +117,6 @@ enum HW_ATL2_RPF_RSS_HASH_TYPE {
                                        HW_ATL2_RPF_RSS_HASH_TYPE_IPV6_EX_UDP,
 };
 
-#define HW_ATL_RSS_DISABLED 0x00000000U
-#define HW_ATL_RSS_ENABLED_3INDEX_BITS 0xB3333333U
-
 #define HW_ATL_MCAST_FLT_ANY_TO_HOST 0x00010FFFU
 
 struct hw_atl2_priv {
index e779d70fde66fd7de6da62964a5f79710baa681d..cd954b11d24abf98599d857f73b2d428e5a8ca3b 100644 (file)
@@ -7,6 +7,14 @@
 #include "hw_atl2_llh_internal.h"
 #include "aq_hw_utils.h"
 
+void hw_atl2_rpf_redirection_table2_select_set(struct aq_hw_s *aq_hw,
+                                              u32 select)
+{
+       aq_hw_write_reg_bit(aq_hw, HW_ATL2_RPF_PIF_RPF_REDIR2_ENI_ADR,
+                           HW_ATL2_RPF_PIF_RPF_REDIR2_ENI_MSK,
+                           HW_ATL2_RPF_PIF_RPF_REDIR2_ENI_SHIFT, select);
+}
+
 void hw_atl2_rpf_rss_hash_type_set(struct aq_hw_s *aq_hw, u32 rss_hash_type)
 {
        aq_hw_write_reg_bit(aq_hw, HW_ATL2_RPF_PIF_RPF_RSS_HASH_TYPEI_ADR,
@@ -60,6 +68,15 @@ void hw_atl2_rpf_vlan_flr_tag_set(struct aq_hw_s *aq_hw, u32 tag, u32 filter)
 
 /* TX */
 
+void hw_atl2_tpb_tx_tc_q_rand_map_en_set(struct aq_hw_s *aq_hw,
+                                        const u32 tc_q_rand_map_en)
+{
+       aq_hw_write_reg_bit(aq_hw, HW_ATL2_TPB_TX_TC_Q_RAND_MAP_EN_ADR,
+                           HW_ATL2_TPB_TX_TC_Q_RAND_MAP_EN_MSK,
+                           HW_ATL2_TPB_TX_TC_Q_RAND_MAP_EN_SHIFT,
+                           tc_q_rand_map_en);
+}
+
 void hw_atl2_tpb_tx_buf_clk_gate_en_set(struct aq_hw_s *aq_hw, u32 clk_gate_en)
 {
        aq_hw_write_reg_bit(aq_hw, HW_ATL2_TPB_TX_BUF_CLK_GATE_EN_ADR,
@@ -76,9 +93,18 @@ void hw_atl2_reg_tx_intr_moder_ctrl_set(struct aq_hw_s *aq_hw,
                        tx_intr_moderation_ctl);
 }
 
+void hw_atl2_tps_tx_pkt_shed_data_arb_mode_set(struct aq_hw_s *aq_hw,
+                                              const u32 data_arb_mode)
+{
+       aq_hw_write_reg_bit(aq_hw, HW_ATL2_TPS_DATA_TC_ARB_MODE_ADR,
+                           HW_ATL2_TPS_DATA_TC_ARB_MODE_MSK,
+                           HW_ATL2_TPS_DATA_TC_ARB_MODE_SHIFT,
+                           data_arb_mode);
+}
+
 void hw_atl2_tps_tx_pkt_shed_tc_data_max_credit_set(struct aq_hw_s *aq_hw,
-                                                   u32 max_credit,
-                                                   u32 tc)
+                                                   const u32 tc,
+                                                   const u32 max_credit)
 {
        aq_hw_write_reg_bit(aq_hw, HW_ATL2_TPS_DATA_TCTCREDIT_MAX_ADR(tc),
                            HW_ATL2_TPS_DATA_TCTCREDIT_MAX_MSK,
@@ -87,13 +113,13 @@ void hw_atl2_tps_tx_pkt_shed_tc_data_max_credit_set(struct aq_hw_s *aq_hw,
 }
 
 void hw_atl2_tps_tx_pkt_shed_tc_data_weight_set(struct aq_hw_s *aq_hw,
-                                               u32 tx_pkt_shed_tc_data_weight,
-                                               u32 tc)
+                                               const u32 tc,
+                                               const u32 weight)
 {
        aq_hw_write_reg_bit(aq_hw, HW_ATL2_TPS_DATA_TCTWEIGHT_ADR(tc),
                            HW_ATL2_TPS_DATA_TCTWEIGHT_MSK,
                            HW_ATL2_TPS_DATA_TCTWEIGHT_SHIFT,
-                           tx_pkt_shed_tc_data_weight);
+                           weight);
 }
 
 u32 hw_atl2_get_hw_version(struct aq_hw_s *aq_hw)
index 8c6d78a64d422fd73410c56c9825c301ec3c682c..98c7a4621297db5efc6f2abb9efc8f87b3df3fc0 100644 (file)
@@ -15,6 +15,10 @@ void hw_atl2_reg_tx_intr_moder_ctrl_set(struct aq_hw_s *aq_hw,
                                        u32 tx_intr_moderation_ctl,
                                        u32 queue);
 
+/* Set Redirection Table 2 Select */
+void hw_atl2_rpf_redirection_table2_select_set(struct aq_hw_s *aq_hw,
+                                              u32 select);
+
 /** Set RSS HASH type */
 void hw_atl2_rpf_rss_hash_type_set(struct aq_hw_s *aq_hw, u32 rss_hash_type);
 
@@ -34,18 +38,25 @@ void hw_atl2_new_rpf_rss_redir_set(struct aq_hw_s *aq_hw, u32 tc, u32 index,
 /* Set VLAN filter tag */
 void hw_atl2_rpf_vlan_flr_tag_set(struct aq_hw_s *aq_hw, u32 tag, u32 filter);
 
+/* set tx random TC-queue mapping enable bit */
+void hw_atl2_tpb_tx_tc_q_rand_map_en_set(struct aq_hw_s *aq_hw,
+                                        const u32 tc_q_rand_map_en);
+
 /* set tx buffer clock gate enable */
 void hw_atl2_tpb_tx_buf_clk_gate_en_set(struct aq_hw_s *aq_hw, u32 clk_gate_en);
 
+void hw_atl2_tps_tx_pkt_shed_data_arb_mode_set(struct aq_hw_s *aq_hw,
+                                              const u32 data_arb_mode);
+
 /* set tx packet scheduler tc data max credit */
 void hw_atl2_tps_tx_pkt_shed_tc_data_max_credit_set(struct aq_hw_s *aq_hw,
-                                                   u32 max_credit,
-                                                   u32 tc);
+                                                   const u32 tc,
+                                                   const u32 max_credit);
 
 /* set tx packet scheduler tc data weight */
 void hw_atl2_tps_tx_pkt_shed_tc_data_weight_set(struct aq_hw_s *aq_hw,
-                                               u32 tx_pkt_shed_tc_data_weight,
-                                               u32 tc);
+                                               const u32 tc,
+                                               const u32 weight);
 
 u32 hw_atl2_get_hw_version(struct aq_hw_s *aq_hw);
 
index cde9e9d2836ddc831865730da524b4ef53d6f20b..e34c5cda061e02cdb334fc69c055722cc04ffa6e 100644 (file)
@@ -6,6 +6,16 @@
 #ifndef HW_ATL2_LLH_INTERNAL_H
 #define HW_ATL2_LLH_INTERNAL_H
 
+/* RX pif_rpf_redir_2_en_i Bitfield Definitions
+ * PORT="pif_rpf_redir_2_en_i"
+ */
+#define HW_ATL2_RPF_PIF_RPF_REDIR2_ENI_ADR 0x000054C8
+#define HW_ATL2_RPF_PIF_RPF_REDIR2_ENI_MSK 0x00001000
+#define HW_ATL2_RPF_PIF_RPF_REDIR2_ENI_MSKN 0xFFFFEFFF
+#define HW_ATL2_RPF_PIF_RPF_REDIR2_ENI_SHIFT 12
+#define HW_ATL2_RPF_PIF_RPF_REDIR2_ENI_WIDTH 1
+#define HW_ATL2_RPF_PIF_RPF_REDIR2_ENI_DEFAULT 0x0
+
 /* RX pif_rpf_rss_hash_type_i Bitfield Definitions
  */
 #define HW_ATL2_RPF_PIF_RPF_RSS_HASH_TYPEI_ADR 0x000054C8
 /* Default value of bitfield rx_q{Q}_tc_map[2:0] */
 #define HW_ATL2_RX_Q_TC_MAP_DEFAULT 0x0
 
+/* tx tx_tc_q_rand_map_en bitfield definitions
+ * preprocessor definitions for the bitfield "tx_tc_q_rand_map_en".
+ * port="pif_tpb_tx_tc_q_rand_map_en_i"
+ */
+
+/* register address for bitfield tx_tc_q_rand_map_en */
+#define HW_ATL2_TPB_TX_TC_Q_RAND_MAP_EN_ADR 0x00007900
+/* bitmask for bitfield tx_tc_q_rand_map_en */
+#define HW_ATL2_TPB_TX_TC_Q_RAND_MAP_EN_MSK 0x00000200
+/* inverted bitmask for bitfield tx_tc_q_rand_map_en */
+#define HW_ATL2_TPB_TX_TC_Q_RAND_MAP_EN_MSKN 0xFFFFFDFF
+/* lower bit position of bitfield tx_tc_q_rand_map_en */
+#define HW_ATL2_TPB_TX_TC_Q_RAND_MAP_EN_SHIFT 9
+/* width of bitfield tx_tc_q_rand_map_en */
+#define HW_ATL2_TPB_TX_TC_Q_RAND_MAP_EN_WIDTH 1
+/* default value of bitfield tx_tc_q_rand_map_en */
+#define HW_ATL2_TPB_TX_TC_Q_RAND_MAP_EN_DEFAULT 0x0
+
 /* tx tx_buffer_clk_gate_en bitfield definitions
  * preprocessor definitions for the bitfield "tx_buffer_clk_gate_en".
  * port="pif_tpb_tx_buffer_clk_gate_en_i"
 /* default value of bitfield tx_buffer_clk_gate_en */
 #define HW_ATL2_TPB_TX_BUF_CLK_GATE_EN_DEFAULT 0x0
 
-/* tx data_tc{t}_credit_max[b:0] bitfield definitions
- * preprocessor definitions for the bitfield "data_tc{t}_credit_max[b:0]".
+/* tx tx_q_tc_map{q} bitfield definitions
+ * preprocessor definitions for the bitfield "tx_q_tc_map{q}".
+ * parameter: queue {q} | bit-level stride | range [0, 31]
+ * port="pif_tpb_tx_q_tc_map0_i[2:0]"
+ */
+
+/* register address for bitfield tx_q_tc_map{q} */
+#define HW_ATL2_TX_Q_TC_MAP_ADR(queue) \
+       (((queue) < 32) ? 0x0000799C + ((queue) / 4) * 4 : 0)
+/* lower bit position of bitfield tx_q_tc_map{q} */
+#define HW_ATL2_TX_Q_TC_MAP_SHIFT(queue) \
+       (((queue) < 32) ? ((queue) * 8) % 32 : 0)
+/* width of bitfield tx_q_tc_map{q} */
+#define HW_ATL2_TX_Q_TC_MAP_WIDTH 3
+/* default value of bitfield tx_q_tc_map{q} */
+#define HW_ATL2_TX_Q_TC_MAP_DEFAULT 0x0
+
+/* tx data_tc_arb_mode bitfield definitions
+ * preprocessor definitions for the bitfield "data_tc_arb_mode".
+ * port="pif_tps_data_tc_arb_mode_i"
+ */
+
+/* register address for bitfield data_tc_arb_mode */
+#define HW_ATL2_TPS_DATA_TC_ARB_MODE_ADR 0x00007100
+/* bitmask for bitfield data_tc_arb_mode */
+#define HW_ATL2_TPS_DATA_TC_ARB_MODE_MSK 0x00000003
+/* inverted bitmask for bitfield data_tc_arb_mode */
+#define HW_ATL2_TPS_DATA_TC_ARB_MODE_MSKN 0xfffffffc
+/* lower bit position of bitfield data_tc_arb_mode */
+#define HW_ATL2_TPS_DATA_TC_ARB_MODE_SHIFT 0
+/* width of bitfield data_tc_arb_mode */
+#define HW_ATL2_TPS_DATA_TC_ARB_MODE_WIDTH 2
+/* default value of bitfield data_tc_arb_mode */
+#define HW_ATL2_TPS_DATA_TC_ARB_MODE_DEFAULT 0x0
+
+/* tx data_tc{t}_credit_max[f:0] bitfield definitions
+ * preprocessor definitions for the bitfield "data_tc{t}_credit_max[f:0]".
  * parameter: tc {t} | stride size 0x4 | range [0, 7]
- * port="pif_tps_data_tc0_credit_max_i[11:0]"
+ * port="pif_tps_data_tc0_credit_max_i[15:0]"
  */
 
-/* register address for bitfield data_tc{t}_credit_max[b:0] */
+/* register address for bitfield data_tc{t}_credit_max[f:0] */
 #define HW_ATL2_TPS_DATA_TCTCREDIT_MAX_ADR(tc) (0x00007110 + (tc) * 0x4)
-/* bitmask for bitfield data_tc{t}_credit_max[b:0] */
-#define HW_ATL2_TPS_DATA_TCTCREDIT_MAX_MSK 0x0fff0000
-/* inverted bitmask for bitfield data_tc{t}_credit_max[b:0] */
-#define HW_ATL2_TPS_DATA_TCTCREDIT_MAX_MSKN 0xf000ffff
-/* lower bit position of bitfield data_tc{t}_credit_max[b:0] */
+/* bitmask for bitfield data_tc{t}_credit_max[f:0] */
+#define HW_ATL2_TPS_DATA_TCTCREDIT_MAX_MSK 0xffff0000
+/* inverted bitmask for bitfield data_tc{t}_credit_max[f:0] */
+#define HW_ATL2_TPS_DATA_TCTCREDIT_MAX_MSKN 0x0000ffff
+/* lower bit position of bitfield data_tc{t}_credit_max[f:0] */
 #define HW_ATL2_TPS_DATA_TCTCREDIT_MAX_SHIFT 16
-/* width of bitfield data_tc{t}_credit_max[b:0] */
-#define HW_ATL2_TPS_DATA_TCTCREDIT_MAX_WIDTH 12
-/* default value of bitfield data_tc{t}_credit_max[b:0] */
+/* width of bitfield data_tc{t}_credit_max[f:0] */
+#define HW_ATL2_TPS_DATA_TCTCREDIT_MAX_WIDTH 16
+/* default value of bitfield data_tc{t}_credit_max[f:0] */
 #define HW_ATL2_TPS_DATA_TCTCREDIT_MAX_DEFAULT 0x0
 
-/* tx data_tc{t}_weight[8:0] bitfield definitions
- * preprocessor definitions for the bitfield "data_tc{t}_weight[8:0]".
+/* tx data_tc{t}_weight[e:0] bitfield definitions
+ * preprocessor definitions for the bitfield "data_tc{t}_weight[e:0]".
  * parameter: tc {t} | stride size 0x4 | range [0, 7]
- * port="pif_tps_data_tc0_weight_i[8:0]"
+ * port="pif_tps_data_tc0_weight_i[14:0]"
  */
 
-/* register address for bitfield data_tc{t}_weight[8:0] */
+/* register address for bitfield data_tc{t}_weight[e:0] */
 #define HW_ATL2_TPS_DATA_TCTWEIGHT_ADR(tc) (0x00007110 + (tc) * 0x4)
-/* bitmask for bitfield data_tc{t}_weight[8:0] */
-#define HW_ATL2_TPS_DATA_TCTWEIGHT_MSK 0x000001ff
-/* inverted bitmask for bitfield data_tc{t}_weight[8:0] */
-#define HW_ATL2_TPS_DATA_TCTWEIGHT_MSKN 0xfffffe00
-/* lower bit position of bitfield data_tc{t}_weight[8:0] */
+/* bitmask for bitfield data_tc{t}_weight[e:0] */
+#define HW_ATL2_TPS_DATA_TCTWEIGHT_MSK 0x00007fff
+/* inverted bitmask for bitfield data_tc{t}_weight[e:0] */
+#define HW_ATL2_TPS_DATA_TCTWEIGHT_MSKN 0xffff8000
+/* lower bit position of bitfield data_tc{t}_weight[e:0] */
 #define HW_ATL2_TPS_DATA_TCTWEIGHT_SHIFT 0
-/* width of bitfield data_tc{t}_weight[8:0] */
-#define HW_ATL2_TPS_DATA_TCTWEIGHT_WIDTH 9
-/* default value of bitfield data_tc{t}_weight[8:0] */
+/* width of bitfield data_tc{t}_weight[e:0] */
+#define HW_ATL2_TPS_DATA_TCTWEIGHT_WIDTH 15
+/* default value of bitfield data_tc{t}_weight[e:0] */
 #define HW_ATL2_TPS_DATA_TCTWEIGHT_DEFAULT 0x0
 
 /* tx interrupt moderation control register definitions
index fc1405a8ed7484a77fd8003b65610f94b244b61d..5a41801acb6a01b3a9e5763433a2adff9202c248 100644 (file)
@@ -60,6 +60,7 @@
 
 #define CH_WARN(adap, fmt, ...) dev_warn(adap->pdev_dev, fmt, ## __VA_ARGS__)
 extern struct list_head adapter_list;
+extern struct list_head uld_list;
 extern struct mutex uld_mutex;
 
 /* Suspend an Ethernet Tx queue with fewer available descriptors than this.
@@ -822,6 +823,13 @@ struct sge_uld_txq_info {
        u16 ntxq;               /* # of egress uld queues */
 };
 
+/* struct to maintain ULD list to reallocate ULD resources on hotplug */
+struct cxgb4_uld_list {
+       struct cxgb4_uld_info uld_info;
+       struct list_head list_node;
+       enum cxgb4_uld uld_type;
+};
+
 enum sge_eosw_state {
        CXGB4_EO_STATE_CLOSED = 0, /* Not ready to accept traffic */
        CXGB4_EO_STATE_FLOWC_OPEN_SEND, /* Send FLOWC open request */
index d05c2371d8c7e7f2e249f719e8ea4b5ab0b20328..7a0414f379be48cd472ec7bb13c3df77fc1fc233 100644 (file)
@@ -180,6 +180,7 @@ static struct dentry *cxgb4_debugfs_root;
 
 LIST_HEAD(adapter_list);
 DEFINE_MUTEX(uld_mutex);
+LIST_HEAD(uld_list);
 
 static int cfg_queues(struct adapter *adap);
 
@@ -6519,11 +6520,8 @@ fw_attach_fail:
        /* PCIe EEH recovery on powerpc platforms needs fundamental reset */
        pdev->needs_freset = 1;
 
-       if (is_uld(adapter)) {
-               mutex_lock(&uld_mutex);
-               list_add_tail(&adapter->list_node, &adapter_list);
-               mutex_unlock(&uld_mutex);
-       }
+       if (is_uld(adapter))
+               cxgb4_uld_enable(adapter);
 
        if (!is_t4(adapter->params.chip))
                cxgb4_ptp_init(adapter);
index e65b52375dd8a3dcf895e43b701696a0e401ef8c..6b1d3df4b9bae6d9718eb28ef313d7a11e70cc5e 100644 (file)
@@ -681,6 +681,74 @@ static void cxgb4_set_ktls_feature(struct adapter *adap, bool enable)
 }
 #endif
 
+static void cxgb4_uld_alloc_resources(struct adapter *adap,
+                                     enum cxgb4_uld type,
+                                     const struct cxgb4_uld_info *p)
+{
+       int ret = 0;
+
+       if ((type == CXGB4_ULD_CRYPTO && !is_pci_uld(adap)) ||
+           (type != CXGB4_ULD_CRYPTO && !is_offload(adap)))
+               return;
+       if (type == CXGB4_ULD_ISCSIT && is_t4(adap->params.chip))
+               return;
+       ret = cfg_queues_uld(adap, type, p);
+       if (ret)
+               goto out;
+       ret = setup_sge_queues_uld(adap, type, p->lro);
+       if (ret)
+               goto free_queues;
+       if (adap->flags & CXGB4_USING_MSIX) {
+               ret = request_msix_queue_irqs_uld(adap, type);
+               if (ret)
+                       goto free_rxq;
+       }
+       if (adap->flags & CXGB4_FULL_INIT_DONE)
+               enable_rx_uld(adap, type);
+#ifdef CONFIG_CHELSIO_TLS_DEVICE
+       /* send mbox to enable ktls related settings. */
+       if (type == CXGB4_ULD_CRYPTO &&
+           (adap->params.crypto & FW_CAPS_CONFIG_TX_TLS_HW))
+               cxgb4_set_ktls_feature(adap, 1);
+#endif
+       if (adap->uld[type].add)
+               goto free_irq;
+       ret = setup_sge_txq_uld(adap, type, p);
+       if (ret)
+               goto free_irq;
+       adap->uld[type] = *p;
+       ret = uld_attach(adap, type);
+       if (ret)
+               goto free_txq;
+       return;
+free_txq:
+       release_sge_txq_uld(adap, type);
+free_irq:
+       if (adap->flags & CXGB4_FULL_INIT_DONE)
+               quiesce_rx_uld(adap, type);
+       if (adap->flags & CXGB4_USING_MSIX)
+               free_msix_queue_irqs_uld(adap, type);
+free_rxq:
+       free_sge_queues_uld(adap, type);
+free_queues:
+       free_queues_uld(adap, type);
+out:
+       dev_warn(adap->pdev_dev,
+                "ULD registration failed for uld type %d\n", type);
+}
+
+void cxgb4_uld_enable(struct adapter *adap)
+{
+       struct cxgb4_uld_list *uld_entry;
+
+       mutex_lock(&uld_mutex);
+       list_add_tail(&adap->list_node, &adapter_list);
+       list_for_each_entry(uld_entry, &uld_list, list_node)
+               cxgb4_uld_alloc_resources(adap, uld_entry->uld_type,
+                                         &uld_entry->uld_info);
+       mutex_unlock(&uld_mutex);
+}
+
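
The effect of the refactor is that ULD resource setup can now be triggered from either side: a newly probed adapter walks the saved uld_list via cxgb4_uld_enable(), while a newly registered ULD still walks adapter_list. The toy model below captures only that cross-registration flow; the names and the "allocate" step are illustrative, not cxgb4 API.

#include <stdio.h>

#define MAX_ADAPTERS 4
#define MAX_ULDS     4

static int adapters[MAX_ADAPTERS], n_adapters;
static int ulds[MAX_ULDS], n_ulds;

static void alloc_resources(int adap, int uld)
{
	printf("adapter %d: set up queues/irqs for ULD %d\n", adap, uld);
}

/* Called from the PCI probe path (cxgb4_uld_enable in the diff). */
static void adapter_enable(int adap)
{
	adapters[n_adapters++] = adap;
	for (int i = 0; i < n_ulds; i++)
		alloc_resources(adap, ulds[i]);
}

/* Called when an upper-layer driver registers (cxgb4_register_uld). */
static void uld_register(int uld)
{
	for (int i = 0; i < n_adapters; i++)
		alloc_resources(adapters[i], uld);
	ulds[n_ulds++] = uld;
}

int main(void)
{
	uld_register(0);      /* ULD loads before any adapter probes          */
	adapter_enable(100);  /* probe: picks up ULD 0 via the saved ULD list */
	uld_register(1);      /* later ULD: walks the existing adapters       */
	return 0;
}
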
 /* cxgb4_register_uld - register an upper-layer driver
  * @type: the ULD type
  * @p: the ULD methods
@@ -691,63 +759,23 @@ static void cxgb4_set_ktls_feature(struct adapter *adap, bool enable)
 void cxgb4_register_uld(enum cxgb4_uld type,
                        const struct cxgb4_uld_info *p)
 {
+       struct cxgb4_uld_list *uld_entry;
        struct adapter *adap;
-       int ret = 0;
 
        if (type >= CXGB4_ULD_MAX)
                return;
 
+       uld_entry = kzalloc(sizeof(*uld_entry), GFP_KERNEL);
+       if (!uld_entry)
+               return;
+
+       memcpy(&uld_entry->uld_info, p, sizeof(struct cxgb4_uld_info));
        mutex_lock(&uld_mutex);
-       list_for_each_entry(adap, &adapter_list, list_node) {
-               if ((type == CXGB4_ULD_CRYPTO && !is_pci_uld(adap)) ||
-                   (type != CXGB4_ULD_CRYPTO && !is_offload(adap)))
-                       continue;
-               if (type == CXGB4_ULD_ISCSIT && is_t4(adap->params.chip))
-                       continue;
-               ret = cfg_queues_uld(adap, type, p);
-               if (ret)
-                       goto out;
-               ret = setup_sge_queues_uld(adap, type, p->lro);
-               if (ret)
-                       goto free_queues;
-               if (adap->flags & CXGB4_USING_MSIX) {
-                       ret = request_msix_queue_irqs_uld(adap, type);
-                       if (ret)
-                               goto free_rxq;
-               }
-               if (adap->flags & CXGB4_FULL_INIT_DONE)
-                       enable_rx_uld(adap, type);
-#ifdef CONFIG_CHELSIO_TLS_DEVICE
-               /* send mbox to enable ktls related settings. */
-               if (type == CXGB4_ULD_CRYPTO &&
-                   (adap->params.crypto & FW_CAPS_CONFIG_TX_TLS_HW))
-                       cxgb4_set_ktls_feature(adap, 1);
-#endif
-               if (adap->uld[type].add)
-                       goto free_irq;
-               ret = setup_sge_txq_uld(adap, type, p);
-               if (ret)
-                       goto free_irq;
-               adap->uld[type] = *p;
-               ret = uld_attach(adap, type);
-               if (ret)
-                       goto free_txq;
-               continue;
-free_txq:
-               release_sge_txq_uld(adap, type);
-free_irq:
-               if (adap->flags & CXGB4_FULL_INIT_DONE)
-                       quiesce_rx_uld(adap, type);
-               if (adap->flags & CXGB4_USING_MSIX)
-                       free_msix_queue_irqs_uld(adap, type);
-free_rxq:
-               free_sge_queues_uld(adap, type);
-free_queues:
-               free_queues_uld(adap, type);
-out:
-               dev_warn(adap->pdev_dev,
-                        "ULD registration failed for uld type %d\n", type);
-       }
+       list_for_each_entry(adap, &adapter_list, list_node)
+               cxgb4_uld_alloc_resources(adap, type, p);
+
+       uld_entry->uld_type = type;
+       list_add_tail(&uld_entry->list_node, &uld_list);
        mutex_unlock(&uld_mutex);
        return;
 }
@@ -761,6 +789,7 @@ EXPORT_SYMBOL(cxgb4_register_uld);
  */
 int cxgb4_unregister_uld(enum cxgb4_uld type)
 {
+       struct cxgb4_uld_list *uld_entry, *tmp;
        struct adapter *adap;
 
        if (type >= CXGB4_ULD_MAX)
@@ -783,6 +812,13 @@ int cxgb4_unregister_uld(enum cxgb4_uld type)
                        cxgb4_set_ktls_feature(adap, 0);
 #endif
        }
+
+       list_for_each_entry_safe(uld_entry, tmp, &uld_list, list_node) {
+               if (uld_entry->uld_type == type) {
+                       list_del(&uld_entry->list_node);
+                       kfree(uld_entry);
+               }
+       }
        mutex_unlock(&uld_mutex);
 
        return 0;
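
The two cxgb4 hunks above replace the open-coded "walk all adapters at ULD registration time" loop with a persistent uld_list: cxgb4_register_uld() now records the ULD, and cxgb4_uld_enable() lets an adapter probed later bind to every ULD already registered, all under uld_mutex. Below is a minimal userspace sketch of that rendezvous pattern; the list types, attach() and the pthread mutex are stand-ins for illustration, not the kernel list/mutex API.

/* Two global registries guarded by one lock: whichever side appears
 * second walks the other side's list, so load order stops mattering. */
#include <pthread.h>
#include <stdio.h>

struct uld_entry { const char *name; struct uld_entry *next; };
struct adapter   { const char *name; struct adapter *next; };

static struct uld_entry *uld_list;
static struct adapter *adapter_list;
static pthread_mutex_t uld_mutex = PTHREAD_MUTEX_INITIALIZER;

static void attach(struct adapter *adap, struct uld_entry *uld)
{
	printf("attach %s -> %s\n", uld->name, adap->name);
}

/* in the spirit of cxgb4_register_uld(): bind to existing adapters,
 * then remember the ULD for adapters probed later */
static void register_uld(struct uld_entry *uld)
{
	struct adapter *adap;

	pthread_mutex_lock(&uld_mutex);
	for (adap = adapter_list; adap; adap = adap->next)
		attach(adap, uld);
	uld->next = uld_list;
	uld_list = uld;
	pthread_mutex_unlock(&uld_mutex);
}

/* in the spirit of cxgb4_uld_enable(): a new adapter binds to every
 * ULD registered so far */
static void uld_enable(struct adapter *adap)
{
	struct uld_entry *uld;

	pthread_mutex_lock(&uld_mutex);
	adap->next = adapter_list;
	adapter_list = adap;
	for (uld = uld_list; uld; uld = uld->next)
		attach(adap, uld);
	pthread_mutex_unlock(&uld_mutex);
}

int main(void)
{
	static struct uld_entry iscsi = { .name = "iscsi" };
	static struct adapter adap = { .name = "adapter0" };

	register_uld(&iscsi);	/* ULD module loads first ... */
	uld_enable(&adap);	/* ... adapter probes later and still binds */
	return 0;
}
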
index 16796785eea32297a5a274b1e178dea4188944ce..085fa1424f9af179d123f6255848e935dfc9b83c 100644 (file)
@@ -327,6 +327,7 @@ enum cxgb4_control {
        CXGB4_CONTROL_DB_DROP,
 };
 
+struct adapter;
 struct pci_dev;
 struct l2t_data;
 struct net_device;
@@ -465,6 +466,7 @@ struct cxgb4_uld_info {
        int (*tx_handler)(struct sk_buff *skb, struct net_device *dev);
 };
 
+void cxgb4_uld_enable(struct adapter *adap);
 void cxgb4_register_uld(enum cxgb4_uld type, const struct cxgb4_uld_info *p);
 int cxgb4_unregister_uld(enum cxgb4_uld type);
 int cxgb4_ofld_send(struct net_device *dev, struct sk_buff *skb);
index 05bc6e216bca24e15bbd615dd020a22ae96b66e3..d9fa4600f7455d956c80a059ea76e6facb28ad82 100644 (file)
@@ -542,8 +542,13 @@ void e1000_reinit_locked(struct e1000_adapter *adapter)
        WARN_ON(in_interrupt());
        while (test_and_set_bit(__E1000_RESETTING, &adapter->flags))
                msleep(1);
-       e1000_down(adapter);
-       e1000_up(adapter);
+
+       /* only run the task if not already down */
+       if (!test_bit(__E1000_DOWN, &adapter->flags)) {
+               e1000_down(adapter);
+               e1000_up(adapter);
+       }
+
        clear_bit(__E1000_RESETTING, &adapter->flags);
 }
 
@@ -1433,10 +1438,15 @@ int e1000_close(struct net_device *netdev)
        struct e1000_hw *hw = &adapter->hw;
        int count = E1000_CHECK_RESET_COUNT;
 
-       while (test_bit(__E1000_RESETTING, &adapter->flags) && count--)
+       while (test_and_set_bit(__E1000_RESETTING, &adapter->flags) && count--)
                usleep_range(10000, 20000);
 
-       WARN_ON(test_bit(__E1000_RESETTING, &adapter->flags));
+       WARN_ON(count < 0);
+
+       /* signal that we're down so that the reset task will no longer run */
+       set_bit(__E1000_DOWN, &adapter->flags);
+       clear_bit(__E1000_RESETTING, &adapter->flags);
+
        e1000_down(adapter);
        e1000_power_down_phy(adapter);
        e1000_free_irq(adapter);
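
With this change e1000_close() claims the __E1000_RESETTING bit itself via test_and_set_bit(), marks __E1000_DOWN, and only then tears the interface down, while e1000_reinit_locked() skips the down/up cycle once the DOWN bit is set; a reset task that fires late can therefore no longer bring a closed adapter back up. The sketch below models that handshake with C11 atomics standing in for the kernel bit operations; the names and the busy waits are illustrative only.

/* RESETTING is an ownership flag taken with an atomic test-and-set;
 * DOWN tells a late reset request to do nothing. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_flag resetting = ATOMIC_FLAG_INIT;
static atomic_bool adapter_down;

static void reinit_locked(void)
{
	while (atomic_flag_test_and_set(&resetting))
		;	/* the driver sleeps here instead of spinning */

	if (!atomic_load(&adapter_down))
		puts("reset: down/up cycle");
	else
		puts("reset: adapter already down, skipping");

	atomic_flag_clear(&resetting);
}

static void close_adapter(void)
{
	int count = 25;

	/* take RESETTING ourselves rather than merely waiting for it */
	while (atomic_flag_test_and_set(&resetting) && count--)
		;	/* usleep_range() in the driver */

	atomic_store(&adapter_down, true);	/* no further resets */
	atomic_flag_clear(&resetting);
	puts("close: adapter is down");
}

int main(void)
{
	reinit_locked();	/* reset while the device is still up */
	close_adapter();
	reinit_locked();	/* a late reset request is now a no-op */
	return 0;
}
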
index 735bf25952fc7d91e23cc81cede03e99477e9f65..f999cca37a8ab7de5ab931dd7db114755400713a 100644 (file)
@@ -300,7 +300,11 @@ static s32 e1000_init_phy_workarounds_pchlan(struct e1000_hw *hw)
         * so forcibly disable it.
         */
        hw->dev_spec.ich8lan.ulp_state = e1000_ulp_state_unknown;
-       e1000_disable_ulp_lpt_lp(hw, true);
+       ret_val = e1000_disable_ulp_lpt_lp(hw, true);
+       if (ret_val) {
+               e_warn("Failed to disable ULP\n");
+               goto out;
+       }
 
        ret_val = hw->phy.ops.acquire(hw);
        if (ret_val) {
index e0b074820b473eb464a5a112cae004c68ca34f7d..32f23a15ff64579c0351b10533d671e9ff4605c6 100644 (file)
@@ -107,6 +107,45 @@ static const struct e1000_reg_info e1000_reg_info_tbl[] = {
        {0, NULL}
 };
 
+struct e1000e_me_supported {
+       u16 device_id;          /* supported device ID */
+};
+
+static const struct e1000e_me_supported me_supported[] = {
+       {E1000_DEV_ID_PCH_LPT_I217_LM},
+       {E1000_DEV_ID_PCH_LPTLP_I218_LM},
+       {E1000_DEV_ID_PCH_I218_LM2},
+       {E1000_DEV_ID_PCH_I218_LM3},
+       {E1000_DEV_ID_PCH_SPT_I219_LM},
+       {E1000_DEV_ID_PCH_SPT_I219_LM2},
+       {E1000_DEV_ID_PCH_LBG_I219_LM3},
+       {E1000_DEV_ID_PCH_SPT_I219_LM4},
+       {E1000_DEV_ID_PCH_SPT_I219_LM5},
+       {E1000_DEV_ID_PCH_CNP_I219_LM6},
+       {E1000_DEV_ID_PCH_CNP_I219_LM7},
+       {E1000_DEV_ID_PCH_ICP_I219_LM8},
+       {E1000_DEV_ID_PCH_ICP_I219_LM9},
+       {E1000_DEV_ID_PCH_CMP_I219_LM10},
+       {E1000_DEV_ID_PCH_CMP_I219_LM11},
+       {E1000_DEV_ID_PCH_CMP_I219_LM12},
+       {E1000_DEV_ID_PCH_TGP_I219_LM13},
+       {E1000_DEV_ID_PCH_TGP_I219_LM14},
+       {E1000_DEV_ID_PCH_TGP_I219_LM15},
+       {0}
+};
+
+static bool e1000e_check_me(u16 device_id)
+{
+       struct e1000e_me_supported *id;
+
+       for (id = (struct e1000e_me_supported *)me_supported;
+            id->device_id; id++)
+               if (device_id == id->device_id)
+                       return true;
+
+       return false;
+}
+
 /**
  * __ew32_prepare - prepare to write to MAC CSR register on certain parts
  * @hw: pointer to the HW structure
@@ -5294,6 +5333,10 @@ static void e1000_watchdog_task(struct work_struct *work)
                                        /* oops */
                                        break;
                                }
+                               if (hw->mac.type == e1000_pch_spt) {
+                                       netdev->features &= ~NETIF_F_TSO;
+                                       netdev->features &= ~NETIF_F_TSO6;
+                               }
                        }
 
                        /* enable transmits in the hardware, need to do this
@@ -6912,7 +6955,8 @@ static int e1000e_pm_suspend(struct device *dev)
                e1000e_pm_thaw(dev);
 
        /* Introduce S0ix implementation */
-       if (hw->mac.type >= e1000_pch_cnp)
+       if (hw->mac.type >= e1000_pch_cnp &&
+           !e1000e_check_me(hw->adapter->pdev->device))
                e1000e_s0ix_entry_flow(adapter);
 
        return rc;
@@ -6927,7 +6971,8 @@ static int e1000e_pm_resume(struct device *dev)
        int rc;
 
        /* Introduce S0ix implementation */
-       if (hw->mac.type >= e1000_pch_cnp)
+       if (hw->mac.type >= e1000_pch_cnp &&
+           !e1000e_check_me(hw->adapter->pdev->device))
                e1000e_s0ix_exit_flow(adapter);
 
        rc = __e1000_resume(pdev);
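
e1000e_check_me() gates the S0ix entry and exit flows behind a sentinel-terminated table of I217/I218/I219 LM device IDs, the parts on which the Management Engine may own the link, so those systems keep the previous suspend behaviour. A minimal sketch of the lookup pattern, assuming placeholder IDs rather than the real E1000_DEV_ID_* values:

/* Sentinel-terminated device-ID allowlist walk, in the style of
 * e1000e_check_me(); the IDs below are placeholders. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct id_entry { uint16_t device_id; };

static const struct id_entry me_capable[] = {
	{ 0x153a },	/* placeholder LM device */
	{ 0x156f },	/* placeholder LM device */
	{ 0 }		/* sentinel terminates the walk */
};

static bool check_me(uint16_t device_id)
{
	const struct id_entry *id;

	for (id = me_capable; id->device_id; id++)
		if (device_id == id->device_id)
			return true;

	return false;
}

int main(void)
{
	uint16_t dev = 0x156f;

	/* suspend path: only run the S0ix flow when ME is not involved */
	if (check_me(dev))
		puts("skip S0ix flow on ME-capable part");
	else
		puts("enter S0ix flow");
	return 0;
}
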
index 2a037ec244b945aa04f7f263f77177c069b6ce6a..ea7395b391e508d6a73f035f30efec9e2c65b531 100644 (file)
@@ -11,7 +11,7 @@
 #include "i40e_diag.h"
 #include "i40e_xsk.h"
 #include <net/udp_tunnel.h>
-#include <net/xdp_sock.h>
+#include <net/xdp_sock_drv.h>
 /* All i40e tracepoints are defined by the include below, which
  * must be included exactly once across the whole kernel with
  * CREATE_TRACE_POINTS defined
@@ -3260,26 +3260,31 @@ static int i40e_configure_rx_ring(struct i40e_ring *ring)
        if (ring->vsi->type == I40E_VSI_MAIN)
                xdp_rxq_info_unreg_mem_model(&ring->xdp_rxq);
 
+       kfree(ring->rx_bi);
        ring->xsk_umem = i40e_xsk_umem(ring);
        if (ring->xsk_umem) {
-               ring->rx_buf_len = ring->xsk_umem->chunk_size_nohr -
-                                  XDP_PACKET_HEADROOM;
+               ret = i40e_alloc_rx_bi_zc(ring);
+               if (ret)
+                       return ret;
+               ring->rx_buf_len = xsk_umem_get_rx_frame_size(ring->xsk_umem);
                /* For AF_XDP ZC, we disallow packets to span on
                 * multiple buffers, thus letting us skip that
                 * handling in the fast-path.
                 */
                chain_len = 1;
-               ring->zca.free = i40e_zca_free;
                ret = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq,
-                                                MEM_TYPE_ZERO_COPY,
-                                                &ring->zca);
+                                                MEM_TYPE_XSK_BUFF_POOL,
+                                                NULL);
                if (ret)
                        return ret;
                dev_info(&vsi->back->pdev->dev,
-                        "Registered XDP mem model MEM_TYPE_ZERO_COPY on Rx ring %d\n",
+                        "Registered XDP mem model MEM_TYPE_XSK_BUFF_POOL on Rx ring %d\n",
                         ring->queue_index);
 
        } else {
+               ret = i40e_alloc_rx_bi(ring);
+               if (ret)
+                       return ret;
                ring->rx_buf_len = vsi->rx_buf_len;
                if (ring->vsi->type == I40E_VSI_MAIN) {
                        ret = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq,
@@ -3344,9 +3349,12 @@ static int i40e_configure_rx_ring(struct i40e_ring *ring)
        ring->tail = hw->hw_addr + I40E_QRX_TAIL(pf_q);
        writel(0, ring->tail);
 
-       ok = ring->xsk_umem ?
-            i40e_alloc_rx_buffers_zc(ring, I40E_DESC_UNUSED(ring)) :
-            !i40e_alloc_rx_buffers(ring, I40E_DESC_UNUSED(ring));
+       if (ring->xsk_umem) {
+               xsk_buff_set_rxq_info(ring->xsk_umem, &ring->xdp_rxq);
+               ok = i40e_alloc_rx_buffers_zc(ring, I40E_DESC_UNUSED(ring));
+       } else {
+               ok = !i40e_alloc_rx_buffers(ring, I40E_DESC_UNUSED(ring));
+       }
        if (!ok) {
                /* Log this in case the user has forgotten to give the kernel
                 * any buffers, even later in the application.
index a3772beffe023c65682a48e0c840fef410664992..f613782f2f56f8e07377607c5cdc58a13bcb549b 100644 (file)
@@ -521,28 +521,29 @@ int i40e_add_del_fdir(struct i40e_vsi *vsi,
 /**
  * i40e_fd_handle_status - check the Programming Status for FD
  * @rx_ring: the Rx ring for this descriptor
- * @rx_desc: the Rx descriptor for programming Status, not a packet descriptor.
+ * @qword0_raw: qword0
+ * @qword1: qword1 after le_to_cpu
  * @prog_id: the id originally used for programming
  *
  * This is used to verify if the FD programming or invalidation
  * requested by SW to the HW is successful or not and take actions accordingly.
  **/
-void i40e_fd_handle_status(struct i40e_ring *rx_ring,
-                          union i40e_rx_desc *rx_desc, u8 prog_id)
+static void i40e_fd_handle_status(struct i40e_ring *rx_ring, u64 qword0_raw,
+                                 u64 qword1, u8 prog_id)
 {
        struct i40e_pf *pf = rx_ring->vsi->back;
        struct pci_dev *pdev = pf->pdev;
+       struct i40e_32b_rx_wb_qw0 *qw0;
        u32 fcnt_prog, fcnt_avail;
        u32 error;
-       u64 qw;
 
-       qw = le64_to_cpu(rx_desc->wb.qword1.status_error_len);
-       error = (qw & I40E_RX_PROG_STATUS_DESC_QW1_ERROR_MASK) >>
+       qw0 = (struct i40e_32b_rx_wb_qw0 *)&qword0_raw;
+       error = (qword1 & I40E_RX_PROG_STATUS_DESC_QW1_ERROR_MASK) >>
                I40E_RX_PROG_STATUS_DESC_QW1_ERROR_SHIFT;
 
        if (error == BIT(I40E_RX_PROG_STATUS_DESC_FD_TBL_FULL_SHIFT)) {
-               pf->fd_inv = le32_to_cpu(rx_desc->wb.qword0.hi_dword.fd_id);
-               if ((rx_desc->wb.qword0.hi_dword.fd_id != 0) ||
+               pf->fd_inv = le32_to_cpu(qw0->hi_dword.fd_id);
+               if (qw0->hi_dword.fd_id != 0 ||
                    (I40E_DEBUG_FD & pf->hw.debug_mask))
                        dev_warn(&pdev->dev, "ntuple filter loc = %d, could not be added\n",
                                 pf->fd_inv);
@@ -560,7 +561,7 @@ void i40e_fd_handle_status(struct i40e_ring *rx_ring,
                /* store the current atr filter count */
                pf->fd_atr_cnt = i40e_get_current_atr_cnt(pf);
 
-               if ((rx_desc->wb.qword0.hi_dword.fd_id == 0) &&
+               if (qw0->hi_dword.fd_id == 0 &&
                    test_bit(__I40E_FD_SB_AUTO_DISABLED, pf->state)) {
                        /* These set_bit() calls aren't atomic with the
                         * test_bit() here, but worse case we potentially
@@ -589,7 +590,7 @@ void i40e_fd_handle_status(struct i40e_ring *rx_ring,
        } else if (error == BIT(I40E_RX_PROG_STATUS_DESC_NO_FD_ENTRY_SHIFT)) {
                if (I40E_DEBUG_FD & pf->hw.debug_mask)
                        dev_info(&pdev->dev, "ntuple filter fd_id = %d, could not be removed\n",
-                                rx_desc->wb.qword0.hi_dword.fd_id);
+                                qw0->hi_dword.fd_id);
        }
 }
 
@@ -1195,6 +1196,11 @@ clear_counts:
        rc->total_packets = 0;
 }
 
+static struct i40e_rx_buffer *i40e_rx_bi(struct i40e_ring *rx_ring, u32 idx)
+{
+       return &rx_ring->rx_bi[idx];
+}
+
 /**
  * i40e_reuse_rx_page - page flip buffer and store it back on the ring
  * @rx_ring: rx descriptor ring to store buffers on
@@ -1208,7 +1214,7 @@ static void i40e_reuse_rx_page(struct i40e_ring *rx_ring,
        struct i40e_rx_buffer *new_buff;
        u16 nta = rx_ring->next_to_alloc;
 
-       new_buff = &rx_ring->rx_bi[nta];
+       new_buff = i40e_rx_bi(rx_ring, nta);
 
        /* update, and store next to alloc */
        nta++;
@@ -1227,29 +1233,10 @@ static void i40e_reuse_rx_page(struct i40e_ring *rx_ring,
 }
 
 /**
- * i40e_rx_is_programming_status - check for programming status descriptor
- * @qw: qword representing status_error_len in CPU ordering
- *
- * The value of in the descriptor length field indicate if this
- * is a programming status descriptor for flow director or FCoE
- * by the value of I40E_RX_PROG_STATUS_DESC_LENGTH, otherwise
- * it is a packet descriptor.
- **/
-static inline bool i40e_rx_is_programming_status(u64 qw)
-{
-       /* The Rx filter programming status and SPH bit occupy the same
-        * spot in the descriptor. Since we don't support packet split we
-        * can just reuse the bit as an indication that this is a
-        * programming status descriptor.
-        */
-       return qw & I40E_RXD_QW1_LENGTH_SPH_MASK;
-}
-
-/**
- * i40e_clean_programming_status - try clean the programming status descriptor
+ * i40e_clean_programming_status - clean the programming status descriptor
  * @rx_ring: the rx ring that has this descriptor
- * @rx_desc: the rx descriptor written back by HW
- * @qw: qword representing status_error_len in CPU ordering
+ * @qword0_raw: qword0
+ * @qword1: qword1 representing status_error_len in CPU ordering
  *
  * Flow director should handle FD_FILTER_STATUS to check its filter programming
  * status being successful or not and take actions accordingly. FCoE should
@@ -1257,34 +1244,16 @@ static inline bool i40e_rx_is_programming_status(u64 qw)
  *
  * Returns an i40e_rx_buffer to reuse if the cleanup occurred, otherwise NULL.
  **/
-struct i40e_rx_buffer *i40e_clean_programming_status(
-       struct i40e_ring *rx_ring,
-       union i40e_rx_desc *rx_desc,
-       u64 qw)
+void i40e_clean_programming_status(struct i40e_ring *rx_ring, u64 qword0_raw,
+                                  u64 qword1)
 {
-       struct i40e_rx_buffer *rx_buffer;
-       u32 ntc;
        u8 id;
 
-       if (!i40e_rx_is_programming_status(qw))
-               return NULL;
-
-       ntc = rx_ring->next_to_clean;
-
-       /* fetch, update, and store next to clean */
-       rx_buffer = &rx_ring->rx_bi[ntc++];
-       ntc = (ntc < rx_ring->count) ? ntc : 0;
-       rx_ring->next_to_clean = ntc;
-
-       prefetch(I40E_RX_DESC(rx_ring, ntc));
-
-       id = (qw & I40E_RX_PROG_STATUS_DESC_QW1_PROGID_MASK) >>
+       id = (qword1 & I40E_RX_PROG_STATUS_DESC_QW1_PROGID_MASK) >>
                  I40E_RX_PROG_STATUS_DESC_QW1_PROGID_SHIFT;
 
        if (id == I40E_RX_PROG_STATUS_DESC_FD_FILTER_STATUS)
-               i40e_fd_handle_status(rx_ring, rx_desc, id);
-
-       return rx_buffer;
+               i40e_fd_handle_status(rx_ring, qword0_raw, qword1, id);
 }
 
 /**
@@ -1336,13 +1305,25 @@ err:
        return -ENOMEM;
 }
 
+int i40e_alloc_rx_bi(struct i40e_ring *rx_ring)
+{
+       unsigned long sz = sizeof(*rx_ring->rx_bi) * rx_ring->count;
+
+       rx_ring->rx_bi = kzalloc(sz, GFP_KERNEL);
+       return rx_ring->rx_bi ? 0 : -ENOMEM;
+}
+
+static void i40e_clear_rx_bi(struct i40e_ring *rx_ring)
+{
+       memset(rx_ring->rx_bi, 0, sizeof(*rx_ring->rx_bi) * rx_ring->count);
+}
+
 /**
  * i40e_clean_rx_ring - Free Rx buffers
  * @rx_ring: ring to be cleaned
  **/
 void i40e_clean_rx_ring(struct i40e_ring *rx_ring)
 {
-       unsigned long bi_size;
        u16 i;
 
        /* ring already cleared, nothing to do */
@@ -1361,7 +1342,7 @@ void i40e_clean_rx_ring(struct i40e_ring *rx_ring)
 
        /* Free all the Rx ring sk_buffs */
        for (i = 0; i < rx_ring->count; i++) {
-               struct i40e_rx_buffer *rx_bi = &rx_ring->rx_bi[i];
+               struct i40e_rx_buffer *rx_bi = i40e_rx_bi(rx_ring, i);
 
                if (!rx_bi->page)
                        continue;
@@ -1388,8 +1369,10 @@ void i40e_clean_rx_ring(struct i40e_ring *rx_ring)
        }
 
 skip_free:
-       bi_size = sizeof(struct i40e_rx_buffer) * rx_ring->count;
-       memset(rx_ring->rx_bi, 0, bi_size);
+       if (rx_ring->xsk_umem)
+               i40e_clear_rx_bi_zc(rx_ring);
+       else
+               i40e_clear_rx_bi(rx_ring);
 
        /* Zero out the descriptor ring */
        memset(rx_ring->desc, 0, rx_ring->size);
@@ -1430,15 +1413,7 @@ void i40e_free_rx_resources(struct i40e_ring *rx_ring)
 int i40e_setup_rx_descriptors(struct i40e_ring *rx_ring)
 {
        struct device *dev = rx_ring->dev;
-       int err = -ENOMEM;
-       int bi_size;
-
-       /* warn if we are about to overwrite the pointer */
-       WARN_ON(rx_ring->rx_bi);
-       bi_size = sizeof(struct i40e_rx_buffer) * rx_ring->count;
-       rx_ring->rx_bi = kzalloc(bi_size, GFP_KERNEL);
-       if (!rx_ring->rx_bi)
-               goto err;
+       int err;
 
        u64_stats_init(&rx_ring->syncp);
 
@@ -1451,7 +1426,7 @@ int i40e_setup_rx_descriptors(struct i40e_ring *rx_ring)
        if (!rx_ring->desc) {
                dev_info(dev, "Unable to allocate memory for the Rx descriptor ring, size=%d\n",
                         rx_ring->size);
-               goto err;
+               return -ENOMEM;
        }
 
        rx_ring->next_to_alloc = 0;
@@ -1463,16 +1438,12 @@ int i40e_setup_rx_descriptors(struct i40e_ring *rx_ring)
                err = xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev,
                                       rx_ring->queue_index);
                if (err < 0)
-                       goto err;
+                       return err;
        }
 
        rx_ring->xdp_prog = rx_ring->vsi->xdp_prog;
 
        return 0;
-err:
-       kfree(rx_ring->rx_bi);
-       rx_ring->rx_bi = NULL;
-       return err;
 }
 
 /**
@@ -1592,7 +1563,7 @@ bool i40e_alloc_rx_buffers(struct i40e_ring *rx_ring, u16 cleaned_count)
                return false;
 
        rx_desc = I40E_RX_DESC(rx_ring, ntu);
-       bi = &rx_ring->rx_bi[ntu];
+       bi = i40e_rx_bi(rx_ring, ntu);
 
        do {
                if (!i40e_alloc_mapped_page(rx_ring, bi))
@@ -1614,7 +1585,7 @@ bool i40e_alloc_rx_buffers(struct i40e_ring *rx_ring, u16 cleaned_count)
                ntu++;
                if (unlikely(ntu == rx_ring->count)) {
                        rx_desc = I40E_RX_DESC(rx_ring, 0);
-                       bi = rx_ring->rx_bi;
+                       bi = i40e_rx_bi(rx_ring, 0);
                        ntu = 0;
                }
 
@@ -1981,7 +1952,7 @@ static struct i40e_rx_buffer *i40e_get_rx_buffer(struct i40e_ring *rx_ring,
 {
        struct i40e_rx_buffer *rx_buffer;
 
-       rx_buffer = &rx_ring->rx_bi[rx_ring->next_to_clean];
+       rx_buffer = i40e_rx_bi(rx_ring, rx_ring->next_to_clean);
        prefetchw(rx_buffer->page);
 
        /* we are reusing so sync this buffer for CPU use */
@@ -2382,9 +2353,12 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget)
                 */
                dma_rmb();
 
-               rx_buffer = i40e_clean_programming_status(rx_ring, rx_desc,
-                                                         qword);
-               if (unlikely(rx_buffer)) {
+               if (i40e_rx_is_programming_status(qword)) {
+                       i40e_clean_programming_status(rx_ring,
+                                                     rx_desc->raw.qword[0],
+                                                     qword);
+                       rx_buffer = i40e_rx_bi(rx_ring, rx_ring->next_to_clean);
+                       i40e_inc_ntc(rx_ring);
                        i40e_reuse_rx_page(rx_ring, rx_buffer);
                        cleaned_count++;
                        continue;
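
The i40e_txrx.c rework passes the programming-status descriptor around as two plain 64-bit values (qword0_raw and qword1) and decodes them with mask/shift, instead of dereferencing the descriptor union inside i40e_fd_handle_status(). A small standalone sketch of that decode step; the shifts and masks are invented for illustration and do not match the real I40E_RX_PROG_STATUS_DESC_* definitions.

/* Extract the programming id and error bits from a writeback qword
 * passed by value; masks and shifts are illustrative only. */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define PROG_ID_SHIFT	2
#define PROG_ID_MASK	(0x7ULL << PROG_ID_SHIFT)
#define ERROR_SHIFT	19
#define ERROR_MASK	(0x3fULL << ERROR_SHIFT)

static void handle_status(uint64_t qword1)
{
	uint64_t id = (qword1 & PROG_ID_MASK) >> PROG_ID_SHIFT;
	uint64_t error = (qword1 & ERROR_MASK) >> ERROR_SHIFT;

	printf("prog id %" PRIu64 ", error bits 0x%" PRIx64 "\n", id, error);
}

int main(void)
{
	uint64_t qword1 = (1ULL << PROG_ID_SHIFT) | (4ULL << ERROR_SHIFT);

	handle_status(qword1);
	return 0;
}
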
index 36d37f31a287e2ae5309b7c9b545e552037395f1..5c255977fd5827b9362d7f14280f31dc297a4434 100644 (file)
@@ -296,17 +296,9 @@ struct i40e_tx_buffer {
 
 struct i40e_rx_buffer {
        dma_addr_t dma;
-       union {
-               struct {
-                       struct page *page;
-                       __u32 page_offset;
-                       __u16 pagecnt_bias;
-               };
-               struct {
-                       void *addr;
-                       u64 handle;
-               };
-       };
+       struct page *page;
+       __u32 page_offset;
+       __u16 pagecnt_bias;
 };
 
 struct i40e_queue_stats {
@@ -358,6 +350,7 @@ struct i40e_ring {
        union {
                struct i40e_tx_buffer *tx_bi;
                struct i40e_rx_buffer *rx_bi;
+               struct xdp_buff **rx_bi_zc;
        };
        DECLARE_BITMAP(state, __I40E_RING_STATE_NBITS);
        u16 queue_index;                /* Queue number of ring */
@@ -419,7 +412,6 @@ struct i40e_ring {
        struct i40e_channel *ch;
        struct xdp_rxq_info xdp_rxq;
        struct xdp_umem *xsk_umem;
-       struct zero_copy_allocator zca; /* ZC allocator anchor */
 } ____cacheline_internodealigned_in_smp;
 
 static inline bool ring_uses_build_skb(struct i40e_ring *ring)
@@ -495,6 +487,7 @@ int __i40e_maybe_stop_tx(struct i40e_ring *tx_ring, int size);
 bool __i40e_chk_linearize(struct sk_buff *skb);
 int i40e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames,
                  u32 flags);
+int i40e_alloc_rx_bi(struct i40e_ring *rx_ring);
 
 /**
  * i40e_get_head - Retrieve head from head writeback
index 8af0e99c6c0d89b9e9aa7f59b3c6eaaa8ccdb7fb..667c4dc4b39f79193e0fe99bfbc6d7a3daf6babc 100644 (file)
@@ -4,13 +4,9 @@
 #ifndef I40E_TXRX_COMMON_
 #define I40E_TXRX_COMMON_
 
-void i40e_fd_handle_status(struct i40e_ring *rx_ring,
-                          union i40e_rx_desc *rx_desc, u8 prog_id);
 int i40e_xmit_xdp_tx_ring(struct xdp_buff *xdp, struct i40e_ring *xdp_ring);
-struct i40e_rx_buffer *i40e_clean_programming_status(
-       struct i40e_ring *rx_ring,
-       union i40e_rx_desc *rx_desc,
-       u64 qw);
+void i40e_clean_programming_status(struct i40e_ring *rx_ring, u64 qword0_raw,
+                                  u64 qword1);
 void i40e_process_skb_fields(struct i40e_ring *rx_ring,
                             union i40e_rx_desc *rx_desc, struct sk_buff *skb);
 void i40e_xdp_ring_update_tail(struct i40e_ring *xdp_ring);
@@ -84,6 +80,38 @@ static inline void i40e_arm_wb(struct i40e_ring *tx_ring,
        }
 }
 
+/**
+ * i40e_rx_is_programming_status - check for programming status descriptor
+ * @qword1: qword1 representing status_error_len in CPU ordering
+ *
+ * A length field equal to I40E_RX_PROG_STATUS_DESC_LENGTH indicates
+ * that this is a programming status descriptor for flow director or
+ * FCoE; otherwise it is a packet descriptor.
+ **/
+static inline bool i40e_rx_is_programming_status(u64 qword1)
+{
+       /* The Rx filter programming status and SPH bit occupy the same
+        * spot in the descriptor. Since we don't support packet split we
+        * can just reuse the bit as an indication that this is a
+        * programming status descriptor.
+        */
+       return qword1 & I40E_RXD_QW1_LENGTH_SPH_MASK;
+}
+
+/**
+ * i40e_inc_ntc: Advance the next_to_clean index
+ * @rx_ring: Rx ring
+ **/
+static inline void i40e_inc_ntc(struct i40e_ring *rx_ring)
+{
+       u32 ntc = rx_ring->next_to_clean + 1;
+
+       ntc = (ntc < rx_ring->count) ? ntc : 0;
+       rx_ring->next_to_clean = ntc;
+       prefetch(I40E_RX_DESC(rx_ring, ntc));
+}
+
 void i40e_xsk_clean_rx_ring(struct i40e_ring *rx_ring);
 void i40e_xsk_clean_tx_ring(struct i40e_ring *tx_ring);
 bool i40e_xsk_any_rx_ring_enabled(struct i40e_vsi *vsi);
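
i40e_inc_ntc() moves into the shared header so the regular and zero-copy clean loops advance next_to_clean with the same wrap-around logic. A standalone sketch of that wrap, with an arbitrary ring size and the next-descriptor prefetch omitted:

/* Advance a ring index and wrap to zero at the end of the ring. */
#include <stdint.h>
#include <stdio.h>

struct ring {
	uint32_t count;
	uint32_t next_to_clean;
};

static void inc_ntc(struct ring *r)
{
	uint32_t ntc = r->next_to_clean + 1;

	r->next_to_clean = (ntc < r->count) ? ntc : 0;
}

int main(void)
{
	struct ring r = { .count = 4, .next_to_clean = 2 };

	inc_ntc(&r);	/* 3 */
	inc_ntc(&r);	/* wraps back to 0 */
	printf("next_to_clean = %u\n", r.next_to_clean);
	return 0;
}
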
index 6ea2867ff60fbedf52c5e3fd76109fb87a28f3e4..63e098f7cb63b323bfa06bb6367a57a6090435a2 100644 (file)
@@ -689,7 +689,7 @@ union i40e_32byte_rx_desc {
                __le64  rsvd2;
        } read;
        struct {
-               struct {
+               struct i40e_32b_rx_wb_qw0 {
                        struct {
                                union {
                                        __le16 mirroring_status;
@@ -727,6 +727,9 @@ union i40e_32byte_rx_desc {
                        } hi_dword;
                } qword3;
        } wb;  /* writeback */
+       struct {
+               u64 qword[4];
+       } raw;
 };
 
 enum i40e_rx_desc_status_bits {
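
Naming the writeback qword0 struct (i40e_32b_rx_wb_qw0) and adding a raw qword view lets callers pass qword 0 around as a plain u64 and reinterpret it only where the layout matters, which is what the new i40e_fd_handle_status() signature relies on. A compact sketch of the overlay idea, with an invented field layout:

/* Overlay a raw 64-bit view on a structured writeback descriptor so a
 * single value can be handed off and decoded later. */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct wb_qw0 {
	uint32_t mirroring_status;
	uint32_t fd_id;
};

union rx_desc {
	struct {
		struct wb_qw0 qword0;
		uint64_t status_error_len;
	} wb;				/* structured writeback view */
	struct {
		uint64_t qword[2];
	} raw;				/* raw view, cheap to pass by value */
};

int main(void)
{
	union rx_desc desc = { .wb = { .qword0 = { .fd_id = 42 } } };
	uint64_t qword0_raw = desc.raw.qword[0];
	struct wb_qw0 qw0;

	/* reinterpret the raw value where the layout is known */
	memcpy(&qw0, &qword0_raw, sizeof(qw0));
	printf("fd_id = %" PRIu32 "\n", qw0.fd_id);
	return 0;
}
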
index 2b9184aead5f62e2e828e1d16d4e55a673c397c8..f3953744c5053b0b322495a497eaf76ff5d59360 100644 (file)
@@ -2,68 +2,30 @@
 /* Copyright(c) 2018 Intel Corporation. */
 
 #include <linux/bpf_trace.h>
-#include <net/xdp_sock.h>
+#include <net/xdp_sock_drv.h>
 #include <net/xdp.h>
 
 #include "i40e.h"
 #include "i40e_txrx_common.h"
 #include "i40e_xsk.h"
 
-/**
- * i40e_xsk_umem_dma_map - DMA maps all UMEM memory for the netdev
- * @vsi: Current VSI
- * @umem: UMEM to DMA map
- *
- * Returns 0 on success, <0 on failure
- **/
-static int i40e_xsk_umem_dma_map(struct i40e_vsi *vsi, struct xdp_umem *umem)
+int i40e_alloc_rx_bi_zc(struct i40e_ring *rx_ring)
 {
-       struct i40e_pf *pf = vsi->back;
-       struct device *dev;
-       unsigned int i, j;
-       dma_addr_t dma;
-
-       dev = &pf->pdev->dev;
-       for (i = 0; i < umem->npgs; i++) {
-               dma = dma_map_page_attrs(dev, umem->pgs[i], 0, PAGE_SIZE,
-                                        DMA_BIDIRECTIONAL, I40E_RX_DMA_ATTR);
-               if (dma_mapping_error(dev, dma))
-                       goto out_unmap;
+       unsigned long sz = sizeof(*rx_ring->rx_bi_zc) * rx_ring->count;
 
-               umem->pages[i].dma = dma;
-       }
-
-       return 0;
-
-out_unmap:
-       for (j = 0; j < i; j++) {
-               dma_unmap_page_attrs(dev, umem->pages[i].dma, PAGE_SIZE,
-                                    DMA_BIDIRECTIONAL, I40E_RX_DMA_ATTR);
-               umem->pages[i].dma = 0;
-       }
-
-       return -1;
+       rx_ring->rx_bi_zc = kzalloc(sz, GFP_KERNEL);
+       return rx_ring->rx_bi_zc ? 0 : -ENOMEM;
 }
 
-/**
- * i40e_xsk_umem_dma_unmap - DMA unmaps all UMEM memory for the netdev
- * @vsi: Current VSI
- * @umem: UMEM to DMA map
- **/
-static void i40e_xsk_umem_dma_unmap(struct i40e_vsi *vsi, struct xdp_umem *umem)
+void i40e_clear_rx_bi_zc(struct i40e_ring *rx_ring)
 {
-       struct i40e_pf *pf = vsi->back;
-       struct device *dev;
-       unsigned int i;
-
-       dev = &pf->pdev->dev;
-
-       for (i = 0; i < umem->npgs; i++) {
-               dma_unmap_page_attrs(dev, umem->pages[i].dma, PAGE_SIZE,
-                                    DMA_BIDIRECTIONAL, I40E_RX_DMA_ATTR);
+       memset(rx_ring->rx_bi_zc, 0,
+              sizeof(*rx_ring->rx_bi_zc) * rx_ring->count);
+}
 
-               umem->pages[i].dma = 0;
-       }
+static struct xdp_buff **i40e_rx_bi(struct i40e_ring *rx_ring, u32 idx)
+{
+       return &rx_ring->rx_bi_zc[idx];
 }
 
 /**
@@ -78,7 +40,6 @@ static int i40e_xsk_umem_enable(struct i40e_vsi *vsi, struct xdp_umem *umem,
                                u16 qid)
 {
        struct net_device *netdev = vsi->netdev;
-       struct xdp_umem_fq_reuse *reuseq;
        bool if_running;
        int err;
 
@@ -92,13 +53,7 @@ static int i40e_xsk_umem_enable(struct i40e_vsi *vsi, struct xdp_umem *umem,
            qid >= netdev->real_num_tx_queues)
                return -EINVAL;
 
-       reuseq = xsk_reuseq_prepare(vsi->rx_rings[0]->count);
-       if (!reuseq)
-               return -ENOMEM;
-
-       xsk_reuseq_free(xsk_reuseq_swap(umem, reuseq));
-
-       err = i40e_xsk_umem_dma_map(vsi, umem);
+       err = xsk_buff_dma_map(umem, &vsi->back->pdev->dev, I40E_RX_DMA_ATTR);
        if (err)
                return err;
 
@@ -151,7 +106,7 @@ static int i40e_xsk_umem_disable(struct i40e_vsi *vsi, u16 qid)
        }
 
        clear_bit(qid, vsi->af_xdp_zc_qps);
-       i40e_xsk_umem_dma_unmap(vsi, umem);
+       xsk_buff_dma_unmap(umem, I40E_RX_DMA_ATTR);
 
        if (if_running) {
                err = i40e_queue_pair_enable(vsi, qid);
@@ -190,11 +145,9 @@ int i40e_xsk_umem_setup(struct i40e_vsi *vsi, struct xdp_umem *umem,
  **/
 static int i40e_run_xdp_zc(struct i40e_ring *rx_ring, struct xdp_buff *xdp)
 {
-       struct xdp_umem *umem = rx_ring->xsk_umem;
        int err, result = I40E_XDP_PASS;
        struct i40e_ring *xdp_ring;
        struct bpf_prog *xdp_prog;
-       u64 offset;
        u32 act;
 
        rcu_read_lock();
@@ -203,9 +156,6 @@ static int i40e_run_xdp_zc(struct i40e_ring *rx_ring, struct xdp_buff *xdp)
         */
        xdp_prog = READ_ONCE(rx_ring->xdp_prog);
        act = bpf_prog_run_xdp(xdp_prog, xdp);
-       offset = xdp->data - xdp->data_hard_start;
-
-       xdp->handle = xsk_umem_adjust_offset(umem, xdp->handle, offset);
 
        switch (act) {
        case XDP_PASS:
@@ -232,107 +182,26 @@ static int i40e_run_xdp_zc(struct i40e_ring *rx_ring, struct xdp_buff *xdp)
        return result;
 }
 
-/**
- * i40e_alloc_buffer_zc - Allocates an i40e_rx_buffer
- * @rx_ring: Rx ring
- * @bi: Rx buffer to populate
- *
- * This function allocates an Rx buffer. The buffer can come from fill
- * queue, or via the recycle queue (next_to_alloc).
- *
- * Returns true for a successful allocation, false otherwise
- **/
-static bool i40e_alloc_buffer_zc(struct i40e_ring *rx_ring,
-                                struct i40e_rx_buffer *bi)
-{
-       struct xdp_umem *umem = rx_ring->xsk_umem;
-       void *addr = bi->addr;
-       u64 handle, hr;
-
-       if (addr) {
-               rx_ring->rx_stats.page_reuse_count++;
-               return true;
-       }
-
-       if (!xsk_umem_peek_addr(umem, &handle)) {
-               rx_ring->rx_stats.alloc_page_failed++;
-               return false;
-       }
-
-       hr = umem->headroom + XDP_PACKET_HEADROOM;
-
-       bi->dma = xdp_umem_get_dma(umem, handle);
-       bi->dma += hr;
-
-       bi->addr = xdp_umem_get_data(umem, handle);
-       bi->addr += hr;
-
-       bi->handle = xsk_umem_adjust_offset(umem, handle, umem->headroom);
-
-       xsk_umem_release_addr(umem);
-       return true;
-}
-
-/**
- * i40e_alloc_buffer_slow_zc - Allocates an i40e_rx_buffer
- * @rx_ring: Rx ring
- * @bi: Rx buffer to populate
- *
- * This function allocates an Rx buffer. The buffer can come from fill
- * queue, or via the reuse queue.
- *
- * Returns true for a successful allocation, false otherwise
- **/
-static bool i40e_alloc_buffer_slow_zc(struct i40e_ring *rx_ring,
-                                     struct i40e_rx_buffer *bi)
-{
-       struct xdp_umem *umem = rx_ring->xsk_umem;
-       u64 handle, hr;
-
-       if (!xsk_umem_peek_addr_rq(umem, &handle)) {
-               rx_ring->rx_stats.alloc_page_failed++;
-               return false;
-       }
-
-       handle &= rx_ring->xsk_umem->chunk_mask;
-
-       hr = umem->headroom + XDP_PACKET_HEADROOM;
-
-       bi->dma = xdp_umem_get_dma(umem, handle);
-       bi->dma += hr;
-
-       bi->addr = xdp_umem_get_data(umem, handle);
-       bi->addr += hr;
-
-       bi->handle = xsk_umem_adjust_offset(umem, handle, umem->headroom);
-
-       xsk_umem_release_addr_rq(umem);
-       return true;
-}
-
-static __always_inline bool
-__i40e_alloc_rx_buffers_zc(struct i40e_ring *rx_ring, u16 count,
-                          bool alloc(struct i40e_ring *rx_ring,
-                                     struct i40e_rx_buffer *bi))
+bool i40e_alloc_rx_buffers_zc(struct i40e_ring *rx_ring, u16 count)
 {
        u16 ntu = rx_ring->next_to_use;
        union i40e_rx_desc *rx_desc;
-       struct i40e_rx_buffer *bi;
+       struct xdp_buff **bi, *xdp;
+       dma_addr_t dma;
        bool ok = true;
 
        rx_desc = I40E_RX_DESC(rx_ring, ntu);
-       bi = &rx_ring->rx_bi[ntu];
+       bi = i40e_rx_bi(rx_ring, ntu);
        do {
-               if (!alloc(rx_ring, bi)) {
+               xdp = xsk_buff_alloc(rx_ring->xsk_umem);
+               if (!xdp) {
                        ok = false;
                        goto no_buffers;
                }
-
-               dma_sync_single_range_for_device(rx_ring->dev, bi->dma, 0,
-                                                rx_ring->rx_buf_len,
-                                                DMA_BIDIRECTIONAL);
-
-               rx_desc->read.pkt_addr = cpu_to_le64(bi->dma);
+               *bi = xdp;
+               dma = xsk_buff_xdp_get_dma(xdp);
+               rx_desc->read.pkt_addr = cpu_to_le64(dma);
+               rx_desc->read.hdr_addr = 0;
 
                rx_desc++;
                bi++;
@@ -340,11 +209,10 @@ __i40e_alloc_rx_buffers_zc(struct i40e_ring *rx_ring, u16 count,
 
                if (unlikely(ntu == rx_ring->count)) {
                        rx_desc = I40E_RX_DESC(rx_ring, 0);
-                       bi = rx_ring->rx_bi;
+                       bi = i40e_rx_bi(rx_ring, 0);
                        ntu = 0;
                }
 
-               rx_desc->wb.qword1.status_error_len = 0;
                count--;
        } while (count);
 
@@ -355,128 +223,9 @@ no_buffers:
        return ok;
 }
 
-/**
- * i40e_alloc_rx_buffers_zc - Allocates a number of Rx buffers
- * @rx_ring: Rx ring
- * @count: The number of buffers to allocate
- *
- * This function allocates a number of Rx buffers from the reuse queue
- * or fill ring and places them on the Rx ring.
- *
- * Returns true for a successful allocation, false otherwise
- **/
-bool i40e_alloc_rx_buffers_zc(struct i40e_ring *rx_ring, u16 count)
-{
-       return __i40e_alloc_rx_buffers_zc(rx_ring, count,
-                                         i40e_alloc_buffer_slow_zc);
-}
-
-/**
- * i40e_alloc_rx_buffers_fast_zc - Allocates a number of Rx buffers
- * @rx_ring: Rx ring
- * @count: The number of buffers to allocate
- *
- * This function allocates a number of Rx buffers from the fill ring
- * or the internal recycle mechanism and places them on the Rx ring.
- *
- * Returns true for a successful allocation, false otherwise
- **/
-static bool i40e_alloc_rx_buffers_fast_zc(struct i40e_ring *rx_ring, u16 count)
-{
-       return __i40e_alloc_rx_buffers_zc(rx_ring, count,
-                                         i40e_alloc_buffer_zc);
-}
-
-/**
- * i40e_get_rx_buffer_zc - Return the current Rx buffer
- * @rx_ring: Rx ring
- * @size: The size of the rx buffer (read from descriptor)
- *
- * This function returns the current, received Rx buffer, and also
- * does DMA synchronization.  the Rx ring.
- *
- * Returns the received Rx buffer
- **/
-static struct i40e_rx_buffer *i40e_get_rx_buffer_zc(struct i40e_ring *rx_ring,
-                                                   const unsigned int size)
-{
-       struct i40e_rx_buffer *bi;
-
-       bi = &rx_ring->rx_bi[rx_ring->next_to_clean];
-
-       /* we are reusing so sync this buffer for CPU use */
-       dma_sync_single_range_for_cpu(rx_ring->dev,
-                                     bi->dma, 0,
-                                     size,
-                                     DMA_BIDIRECTIONAL);
-
-       return bi;
-}
-
-/**
- * i40e_reuse_rx_buffer_zc - Recycle an Rx buffer
- * @rx_ring: Rx ring
- * @old_bi: The Rx buffer to recycle
- *
- * This function recycles a finished Rx buffer, and places it on the
- * recycle queue (next_to_alloc).
- **/
-static void i40e_reuse_rx_buffer_zc(struct i40e_ring *rx_ring,
-                                   struct i40e_rx_buffer *old_bi)
-{
-       struct i40e_rx_buffer *new_bi = &rx_ring->rx_bi[rx_ring->next_to_alloc];
-       u16 nta = rx_ring->next_to_alloc;
-
-       /* update, and store next to alloc */
-       nta++;
-       rx_ring->next_to_alloc = (nta < rx_ring->count) ? nta : 0;
-
-       /* transfer page from old buffer to new buffer */
-       new_bi->dma = old_bi->dma;
-       new_bi->addr = old_bi->addr;
-       new_bi->handle = old_bi->handle;
-
-       old_bi->addr = NULL;
-}
-
-/**
- * i40e_zca_free - Free callback for MEM_TYPE_ZERO_COPY allocations
- * @alloc: Zero-copy allocator
- * @handle: Buffer handle
- **/
-void i40e_zca_free(struct zero_copy_allocator *alloc, unsigned long handle)
-{
-       struct i40e_rx_buffer *bi;
-       struct i40e_ring *rx_ring;
-       u64 hr, mask;
-       u16 nta;
-
-       rx_ring = container_of(alloc, struct i40e_ring, zca);
-       hr = rx_ring->xsk_umem->headroom + XDP_PACKET_HEADROOM;
-       mask = rx_ring->xsk_umem->chunk_mask;
-
-       nta = rx_ring->next_to_alloc;
-       bi = &rx_ring->rx_bi[nta];
-
-       nta++;
-       rx_ring->next_to_alloc = (nta < rx_ring->count) ? nta : 0;
-
-       handle &= mask;
-
-       bi->dma = xdp_umem_get_dma(rx_ring->xsk_umem, handle);
-       bi->dma += hr;
-
-       bi->addr = xdp_umem_get_data(rx_ring->xsk_umem, handle);
-       bi->addr += hr;
-
-       bi->handle = xsk_umem_adjust_offset(rx_ring->xsk_umem, (u64)handle,
-                                           rx_ring->xsk_umem->headroom);
-}
-
 /**
  * i40e_construct_skb_zc - Create skbufff from zero-copy Rx buffer
  * @rx_ring: Rx ring
- * @bi: Rx buffer
  * @xdp: xdp_buff
  *
  * This functions allocates a new skb from a zero-copy Rx buffer.
@@ -484,7 +233,6 @@ void i40e_zca_free(struct zero_copy_allocator *alloc, unsigned long handle)
  * Returns the skb, or NULL on failure.
  **/
 static struct sk_buff *i40e_construct_skb_zc(struct i40e_ring *rx_ring,
-                                            struct i40e_rx_buffer *bi,
                                             struct xdp_buff *xdp)
 {
        unsigned int metasize = xdp->data - xdp->data_meta;
@@ -503,23 +251,10 @@ static struct sk_buff *i40e_construct_skb_zc(struct i40e_ring *rx_ring,
        if (metasize)
                skb_metadata_set(skb, metasize);
 
-       i40e_reuse_rx_buffer_zc(rx_ring, bi);
+       xsk_buff_free(xdp);
        return skb;
 }
 
-/**
- * i40e_inc_ntc: Advance the next_to_clean index
- * @rx_ring: Rx ring
- **/
-static void i40e_inc_ntc(struct i40e_ring *rx_ring)
-{
-       u32 ntc = rx_ring->next_to_clean + 1;
-
-       ntc = (ntc < rx_ring->count) ? ntc : 0;
-       rx_ring->next_to_clean = ntc;
-       prefetch(I40E_RX_DESC(rx_ring, ntc));
-}
-
 /**
  * i40e_clean_rx_irq_zc - Consumes Rx packets from the hardware ring
  * @rx_ring: Rx ring
@@ -531,25 +266,20 @@ int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget)
 {
        unsigned int total_rx_bytes = 0, total_rx_packets = 0;
        u16 cleaned_count = I40E_DESC_UNUSED(rx_ring);
-       struct xdp_umem *umem = rx_ring->xsk_umem;
        unsigned int xdp_res, xdp_xmit = 0;
        bool failure = false;
        struct sk_buff *skb;
-       struct xdp_buff xdp;
-
-       xdp.rxq = &rx_ring->xdp_rxq;
-       xdp.frame_sz = xsk_umem_xdp_frame_sz(umem);
 
        while (likely(total_rx_packets < (unsigned int)budget)) {
-               struct i40e_rx_buffer *bi;
                union i40e_rx_desc *rx_desc;
+               struct xdp_buff **bi;
                unsigned int size;
                u64 qword;
 
                if (cleaned_count >= I40E_RX_BUFFER_WRITE) {
                        failure = failure ||
-                                 !i40e_alloc_rx_buffers_fast_zc(rx_ring,
-                                                                cleaned_count);
+                                 !i40e_alloc_rx_buffers_zc(rx_ring,
+                                                           cleaned_count);
                        cleaned_count = 0;
                }
 
@@ -562,35 +292,36 @@ int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget)
                 */
                dma_rmb();
 
-               bi = i40e_clean_programming_status(rx_ring, rx_desc,
-                                                  qword);
-               if (unlikely(bi)) {
-                       i40e_reuse_rx_buffer_zc(rx_ring, bi);
+               if (i40e_rx_is_programming_status(qword)) {
+                       i40e_clean_programming_status(rx_ring,
+                                                     rx_desc->raw.qword[0],
+                                                     qword);
+                       bi = i40e_rx_bi(rx_ring, rx_ring->next_to_clean);
+                       xsk_buff_free(*bi);
+                       *bi = NULL;
                        cleaned_count++;
+                       i40e_inc_ntc(rx_ring);
                        continue;
                }
 
+               bi = i40e_rx_bi(rx_ring, rx_ring->next_to_clean);
                size = (qword & I40E_RXD_QW1_LENGTH_PBUF_MASK) >>
                       I40E_RXD_QW1_LENGTH_PBUF_SHIFT;
                if (!size)
                        break;
 
-               bi = i40e_get_rx_buffer_zc(rx_ring, size);
-               xdp.data = bi->addr;
-               xdp.data_meta = xdp.data;
-               xdp.data_hard_start = xdp.data - XDP_PACKET_HEADROOM;
-               xdp.data_end = xdp.data + size;
-               xdp.handle = bi->handle;
+               bi = i40e_rx_bi(rx_ring, rx_ring->next_to_clean);
+               (*bi)->data_end = (*bi)->data + size;
+               xsk_buff_dma_sync_for_cpu(*bi);
 
-               xdp_res = i40e_run_xdp_zc(rx_ring, &xdp);
+               xdp_res = i40e_run_xdp_zc(rx_ring, *bi);
                if (xdp_res) {
-                       if (xdp_res & (I40E_XDP_TX | I40E_XDP_REDIR)) {
+                       if (xdp_res & (I40E_XDP_TX | I40E_XDP_REDIR))
                                xdp_xmit |= xdp_res;
-                               bi->addr = NULL;
-                       } else {
-                               i40e_reuse_rx_buffer_zc(rx_ring, bi);
-                       }
+                       else
+                               xsk_buff_free(*bi);
 
+                       *bi = NULL;
                        total_rx_bytes += size;
                        total_rx_packets++;
 
@@ -606,7 +337,8 @@ int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget)
                 * BIT(I40E_RXD_QW1_ERROR_SHIFT). This is due to that
                 * SBP is *not* set in PRT_SBPVSI (default not set).
                 */
-               skb = i40e_construct_skb_zc(rx_ring, bi, &xdp);
+               skb = i40e_construct_skb_zc(rx_ring, *bi);
+               *bi = NULL;
                if (!skb) {
                        rx_ring->rx_stats.alloc_buff_failed++;
                        break;
@@ -664,10 +396,9 @@ static bool i40e_xmit_zc(struct i40e_ring *xdp_ring, unsigned int budget)
                if (!xsk_umem_consume_tx(xdp_ring->xsk_umem, &desc))
                        break;
 
-               dma = xdp_umem_get_dma(xdp_ring->xsk_umem, desc.addr);
-
-               dma_sync_single_for_device(xdp_ring->dev, dma, desc.len,
-                                          DMA_BIDIRECTIONAL);
+               dma = xsk_buff_raw_get_dma(xdp_ring->xsk_umem, desc.addr);
+               xsk_buff_raw_dma_sync_for_device(xdp_ring->xsk_umem, dma,
+                                                desc.len);
 
                tx_bi = &xdp_ring->tx_bi[xdp_ring->next_to_use];
                tx_bi->bytecount = desc.len;
@@ -826,13 +557,13 @@ void i40e_xsk_clean_rx_ring(struct i40e_ring *rx_ring)
        u16 i;
 
        for (i = 0; i < rx_ring->count; i++) {
-               struct i40e_rx_buffer *rx_bi = &rx_ring->rx_bi[i];
+               struct xdp_buff *rx_bi = *i40e_rx_bi(rx_ring, i);
 
-               if (!rx_bi->addr)
+               if (!rx_bi)
                        continue;
 
-               xsk_umem_fq_reuse(rx_ring->xsk_umem, rx_bi->handle);
-               rx_bi->addr = NULL;
+               xsk_buff_free(rx_bi);
+               rx_bi = NULL;
        }
 }
 
index 9ed59c14eb55f931c05bca8f01a772bff390fa71..ea919a7d60ec19aacbd0c45e0249fb0cf6221624 100644 (file)
@@ -12,12 +12,13 @@ int i40e_queue_pair_disable(struct i40e_vsi *vsi, int queue_pair);
 int i40e_queue_pair_enable(struct i40e_vsi *vsi, int queue_pair);
 int i40e_xsk_umem_setup(struct i40e_vsi *vsi, struct xdp_umem *umem,
                        u16 qid);
-void i40e_zca_free(struct zero_copy_allocator *alloc, unsigned long handle);
 bool i40e_alloc_rx_buffers_zc(struct i40e_ring *rx_ring, u16 cleaned_count);
 int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget);
 
 bool i40e_clean_xdp_tx_irq(struct i40e_vsi *vsi,
                           struct i40e_ring *tx_ring, int napi_budget);
 int i40e_xsk_wakeup(struct net_device *dev, u32 queue_id, u32 flags);
+int i40e_alloc_rx_bi_zc(struct i40e_ring *rx_ring);
+void i40e_clear_rx_bi_zc(struct i40e_ring *rx_ring);
 
 #endif /* _I40E_XSK_H_ */
index 00c072f61a32c7f421284c3742ee13117940529e..94d833b4e745b251dd392ef2d450a7da67bf050f 100644 (file)
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 /* Copyright (c) 2019, Intel Corporation. */
 
+#include <net/xdp_sock_drv.h>
 #include "ice_base.h"
 #include "ice_dcb_lib.h"
 
@@ -309,24 +310,23 @@ int ice_setup_rx_ctx(struct ice_ring *ring)
                if (ring->xsk_umem) {
                        xdp_rxq_info_unreg_mem_model(&ring->xdp_rxq);
 
-                       ring->rx_buf_len = ring->xsk_umem->chunk_size_nohr -
-                                          XDP_PACKET_HEADROOM;
+                       ring->rx_buf_len =
+                               xsk_umem_get_rx_frame_size(ring->xsk_umem);
                        /* For AF_XDP ZC, we disallow packets to span on
                         * multiple buffers, thus letting us skip that
                         * handling in the fast-path.
                         */
                        chain_len = 1;
-                       ring->zca.free = ice_zca_free;
                        err = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq,
-                                                        MEM_TYPE_ZERO_COPY,
-                                                        &ring->zca);
+                                                        MEM_TYPE_XSK_BUFF_POOL,
+                                                        NULL);
                        if (err)
                                return err;
+                       xsk_buff_set_rxq_info(ring->xsk_umem, &ring->xdp_rxq);
 
-                       dev_info(ice_pf_to_dev(vsi->back), "Registered XDP mem model MEM_TYPE_ZERO_COPY on Rx ring %d\n",
+                       dev_info(ice_pf_to_dev(vsi->back), "Registered XDP mem model MEM_TYPE_XSK_BUFF_POOL on Rx ring %d\n",
                                 ring->q_index);
                } else {
-                       ring->zca.free = NULL;
                        if (!xdp_rxq_info_is_reg(&ring->xdp_rxq))
                                /* coverity[check_return] */
                                xdp_rxq_info_reg(&ring->xdp_rxq,
@@ -427,7 +427,7 @@ int ice_setup_rx_ctx(struct ice_ring *ring)
        writel(0, ring->tail);
 
        err = ring->xsk_umem ?
-             ice_alloc_rx_bufs_slow_zc(ring, ICE_DESC_UNUSED(ring)) :
+             ice_alloc_rx_bufs_zc(ring, ICE_DESC_UNUSED(ring)) :
              ice_alloc_rx_bufs(ring, ICE_DESC_UNUSED(ring));
        if (err)
                dev_info(ice_pf_to_dev(vsi->back), "Failed allocate some buffers on %sRx ring %d (pf_q %d)\n",
index 7c4030caeea4dab2fc0479bd98a907adda16c451..cf21b4fe928a01d60f31d5183692f4679b3d4479 100644 (file)
@@ -162,17 +162,16 @@ struct ice_tx_offload_params {
 };
 
 struct ice_rx_buf {
-       struct sk_buff *skb;
-       dma_addr_t dma;
        union {
                struct {
+                       struct sk_buff *skb;
+                       dma_addr_t dma;
                        struct page *page;
                        unsigned int page_offset;
                        u16 pagecnt_bias;
                };
                struct {
-                       void *addr;
-                       u64 handle;
+                       struct xdp_buff *xdp;
                };
        };
 };
@@ -296,7 +295,6 @@ struct ice_ring {
        struct rcu_head rcu;            /* to avoid race on free */
        struct bpf_prog *xdp_prog;
        struct xdp_umem *xsk_umem;
-       struct zero_copy_allocator zca;
        /* CL3 - 3rd cacheline starts here */
        struct xdp_rxq_info xdp_rxq;
        /* CLX - the below items are only accessed infrequently and should be
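
The reworked ice_rx_buf keeps either the page-based fields (skb, dma, page, offset, bias) or a single xdp_buff pointer, with the active arm implied by whether the ring has an AF_XDP umem attached. A small illustrative sketch of that union layout; the types and field names are invented, not the driver's:

/* Per-buffer union whose active member is implied by ring state. */
#include <stdbool.h>
#include <stdio.h>

struct page_buf { void *page; unsigned int page_offset; };
struct xdp_buf  { void *data; };

struct rx_buf {
	union {
		struct page_buf pg;	/* ring is page based */
		struct xdp_buf *xdp;	/* ring uses an AF_XDP buffer pool */
	};
};

static void describe(const struct rx_buf *buf, bool zero_copy)
{
	if (zero_copy)
		printf("zc buffer at %p\n", (void *)buf->xdp);
	else
		printf("page %p offset %u\n", buf->pg.page, buf->pg.page_offset);
}

int main(void)
{
	struct rx_buf buf = { .pg = { .page = (void *)0x1000, .page_offset = 64 } };

	describe(&buf, false);
	return 0;
}
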
index 20ac54e3156dcc4d72765004617c8ddc6b0cc57f..b6f928c9e9c98cd7cc396967e1383aba09c0e41e 100644 (file)
@@ -2,7 +2,7 @@
 /* Copyright (c) 2019, Intel Corporation. */
 
 #include <linux/bpf_trace.h>
-#include <net/xdp_sock.h>
+#include <net/xdp_sock_drv.h>
 #include <net/xdp.h>
 #include "ice.h"
 #include "ice_base.h"
@@ -279,28 +279,6 @@ static int ice_xsk_alloc_umems(struct ice_vsi *vsi)
        return 0;
 }
 
-/**
- * ice_xsk_add_umem - add a UMEM region for XDP sockets
- * @vsi: VSI to which the UMEM will be added
- * @umem: pointer to a requested UMEM region
- * @qid: queue ID
- *
- * Returns 0 on success, negative on error
- */
-static int ice_xsk_add_umem(struct ice_vsi *vsi, struct xdp_umem *umem, u16 qid)
-{
-       int err;
-
-       err = ice_xsk_alloc_umems(vsi);
-       if (err)
-               return err;
-
-       vsi->xsk_umems[qid] = umem;
-       vsi->num_xsk_umems_used++;
-
-       return 0;
-}
-
 /**
  * ice_xsk_remove_umem - Remove an UMEM for a certain ring/qid
  * @vsi: VSI from which the VSI will be removed
@@ -318,65 +296,6 @@ static void ice_xsk_remove_umem(struct ice_vsi *vsi, u16 qid)
        }
 }
 
-/**
- * ice_xsk_umem_dma_map - DMA map UMEM region for XDP sockets
- * @vsi: VSI to map the UMEM region
- * @umem: UMEM to map
- *
- * Returns 0 on success, negative on error
- */
-static int ice_xsk_umem_dma_map(struct ice_vsi *vsi, struct xdp_umem *umem)
-{
-       struct ice_pf *pf = vsi->back;
-       struct device *dev;
-       unsigned int i;
-
-       dev = ice_pf_to_dev(pf);
-       for (i = 0; i < umem->npgs; i++) {
-               dma_addr_t dma = dma_map_page_attrs(dev, umem->pgs[i], 0,
-                                                   PAGE_SIZE,
-                                                   DMA_BIDIRECTIONAL,
-                                                   ICE_RX_DMA_ATTR);
-               if (dma_mapping_error(dev, dma)) {
-                       dev_dbg(dev, "XSK UMEM DMA mapping error on page num %d\n",
-                               i);
-                       goto out_unmap;
-               }
-
-               umem->pages[i].dma = dma;
-       }
-
-       return 0;
-
-out_unmap:
-       for (; i > 0; i--) {
-               dma_unmap_page_attrs(dev, umem->pages[i].dma, PAGE_SIZE,
-                                    DMA_BIDIRECTIONAL, ICE_RX_DMA_ATTR);
-               umem->pages[i].dma = 0;
-       }
-
-       return -EFAULT;
-}
-
-/**
- * ice_xsk_umem_dma_unmap - DMA unmap UMEM region for XDP sockets
- * @vsi: VSI from which the UMEM will be unmapped
- * @umem: UMEM to unmap
- */
-static void ice_xsk_umem_dma_unmap(struct ice_vsi *vsi, struct xdp_umem *umem)
-{
-       struct ice_pf *pf = vsi->back;
-       struct device *dev;
-       unsigned int i;
-
-       dev = ice_pf_to_dev(pf);
-       for (i = 0; i < umem->npgs; i++) {
-               dma_unmap_page_attrs(dev, umem->pages[i].dma, PAGE_SIZE,
-                                    DMA_BIDIRECTIONAL, ICE_RX_DMA_ATTR);
-
-               umem->pages[i].dma = 0;
-       }
-}
 
 /**
  * ice_xsk_umem_disable - disable a UMEM region
@@ -391,7 +310,7 @@ static int ice_xsk_umem_disable(struct ice_vsi *vsi, u16 qid)
            !vsi->xsk_umems[qid])
                return -EINVAL;
 
-       ice_xsk_umem_dma_unmap(vsi, vsi->xsk_umems[qid]);
+       xsk_buff_dma_unmap(vsi->xsk_umems[qid], ICE_RX_DMA_ATTR);
        ice_xsk_remove_umem(vsi, qid);
 
        return 0;
@@ -408,7 +327,6 @@ static int ice_xsk_umem_disable(struct ice_vsi *vsi, u16 qid)
 static int
 ice_xsk_umem_enable(struct ice_vsi *vsi, struct xdp_umem *umem, u16 qid)
 {
-       struct xdp_umem_fq_reuse *reuseq;
        int err;
 
        if (vsi->type != ICE_VSI_PF)
@@ -419,20 +337,18 @@ ice_xsk_umem_enable(struct ice_vsi *vsi, struct xdp_umem *umem, u16 qid)
        if (qid >= vsi->num_xsk_umems)
                return -EINVAL;
 
+       err = ice_xsk_alloc_umems(vsi);
+       if (err)
+               return err;
+
        if (vsi->xsk_umems && vsi->xsk_umems[qid])
                return -EBUSY;
 
-       reuseq = xsk_reuseq_prepare(vsi->rx_rings[0]->count);
-       if (!reuseq)
-               return -ENOMEM;
-
-       xsk_reuseq_free(xsk_reuseq_swap(umem, reuseq));
-
-       err = ice_xsk_umem_dma_map(vsi, umem);
-       if (err)
-               return err;
+       vsi->xsk_umems[qid] = umem;
+       vsi->num_xsk_umems_used++;
 
-       err = ice_xsk_add_umem(vsi, umem, qid);
+       err = xsk_buff_dma_map(vsi->xsk_umems[qid], ice_pf_to_dev(vsi->back),
+                              ICE_RX_DMA_ATTR);
        if (err)
                return err;
 
@@ -483,138 +399,23 @@ xsk_umem_if_up:
        return ret;
 }
 
-/**
- * ice_zca_free - Callback for MEM_TYPE_ZERO_COPY allocations
- * @zca: zero-cpoy allocator
- * @handle: Buffer handle
- */
-void ice_zca_free(struct zero_copy_allocator *zca, unsigned long handle)
-{
-       struct ice_rx_buf *rx_buf;
-       struct ice_ring *rx_ring;
-       struct xdp_umem *umem;
-       u64 hr, mask;
-       u16 nta;
-
-       rx_ring = container_of(zca, struct ice_ring, zca);
-       umem = rx_ring->xsk_umem;
-       hr = umem->headroom + XDP_PACKET_HEADROOM;
-
-       mask = umem->chunk_mask;
-
-       nta = rx_ring->next_to_alloc;
-       rx_buf = &rx_ring->rx_buf[nta];
-
-       nta++;
-       rx_ring->next_to_alloc = (nta < rx_ring->count) ? nta : 0;
-
-       handle &= mask;
-
-       rx_buf->dma = xdp_umem_get_dma(umem, handle);
-       rx_buf->dma += hr;
-
-       rx_buf->addr = xdp_umem_get_data(umem, handle);
-       rx_buf->addr += hr;
-
-       rx_buf->handle = (u64)handle + umem->headroom;
-}
-
-/**
- * ice_alloc_buf_fast_zc - Retrieve buffer address from XDP umem
- * @rx_ring: ring with an xdp_umem bound to it
- * @rx_buf: buffer to which xsk page address will be assigned
- *
- * This function allocates an Rx buffer in the hot path.
- * The buffer can come from fill queue or recycle queue.
- *
- * Returns true if an assignment was successful, false if not.
- */
-static __always_inline bool
-ice_alloc_buf_fast_zc(struct ice_ring *rx_ring, struct ice_rx_buf *rx_buf)
-{
-       struct xdp_umem *umem = rx_ring->xsk_umem;
-       void *addr = rx_buf->addr;
-       u64 handle, hr;
-
-       if (addr) {
-               rx_ring->rx_stats.page_reuse_count++;
-               return true;
-       }
-
-       if (!xsk_umem_peek_addr(umem, &handle)) {
-               rx_ring->rx_stats.alloc_page_failed++;
-               return false;
-       }
-
-       hr = umem->headroom + XDP_PACKET_HEADROOM;
-
-       rx_buf->dma = xdp_umem_get_dma(umem, handle);
-       rx_buf->dma += hr;
-
-       rx_buf->addr = xdp_umem_get_data(umem, handle);
-       rx_buf->addr += hr;
-
-       rx_buf->handle = handle + umem->headroom;
-
-       xsk_umem_release_addr(umem);
-       return true;
-}
-
-/**
- * ice_alloc_buf_slow_zc - Retrieve buffer address from XDP umem
- * @rx_ring: ring with an xdp_umem bound to it
- * @rx_buf: buffer to which xsk page address will be assigned
- *
- * This function allocates an Rx buffer in the slow path.
- * The buffer can come from fill queue or recycle queue.
- *
- * Returns true if an assignment was successful, false if not.
- */
-static __always_inline bool
-ice_alloc_buf_slow_zc(struct ice_ring *rx_ring, struct ice_rx_buf *rx_buf)
-{
-       struct xdp_umem *umem = rx_ring->xsk_umem;
-       u64 handle, headroom;
-
-       if (!xsk_umem_peek_addr_rq(umem, &handle)) {
-               rx_ring->rx_stats.alloc_page_failed++;
-               return false;
-       }
-
-       handle &= umem->chunk_mask;
-       headroom = umem->headroom + XDP_PACKET_HEADROOM;
-
-       rx_buf->dma = xdp_umem_get_dma(umem, handle);
-       rx_buf->dma += headroom;
-
-       rx_buf->addr = xdp_umem_get_data(umem, handle);
-       rx_buf->addr += headroom;
-
-       rx_buf->handle = handle + umem->headroom;
-
-       xsk_umem_release_addr_rq(umem);
-       return true;
-}
-
 /**
  * ice_alloc_rx_bufs_zc - allocate a number of Rx buffers
  * @rx_ring: Rx ring
  * @count: The number of buffers to allocate
- * @alloc: the function pointer to call for allocation
  *
  * This function allocates a number of Rx buffers from the fill ring
  * or the internal recycle mechanism and places them on the Rx ring.
  *
  * Returns false if all allocations were successful, true if any fail.
  */
-static bool
-ice_alloc_rx_bufs_zc(struct ice_ring *rx_ring, int count,
-                    bool (*alloc)(struct ice_ring *, struct ice_rx_buf *))
+bool ice_alloc_rx_bufs_zc(struct ice_ring *rx_ring, u16 count)
 {
        union ice_32b_rx_flex_desc *rx_desc;
        u16 ntu = rx_ring->next_to_use;
        struct ice_rx_buf *rx_buf;
        bool ret = false;
+       dma_addr_t dma;
 
        if (!count)
                return false;
@@ -623,16 +424,14 @@ ice_alloc_rx_bufs_zc(struct ice_ring *rx_ring, int count,
        rx_buf = &rx_ring->rx_buf[ntu];
 
        do {
-               if (!alloc(rx_ring, rx_buf)) {
+               rx_buf->xdp = xsk_buff_alloc(rx_ring->xsk_umem);
+               if (!rx_buf->xdp) {
                        ret = true;
                        break;
                }
 
-               dma_sync_single_range_for_device(rx_ring->dev, rx_buf->dma, 0,
-                                                rx_ring->rx_buf_len,
-                                                DMA_BIDIRECTIONAL);
-
-               rx_desc->read.pkt_addr = cpu_to_le64(rx_buf->dma);
+               dma = xsk_buff_xdp_get_dma(rx_buf->xdp);
+               rx_desc->read.pkt_addr = cpu_to_le64(dma);
                rx_desc->wb.status_error0 = 0;
 
                rx_desc++;
@@ -652,32 +451,6 @@ ice_alloc_rx_bufs_zc(struct ice_ring *rx_ring, int count,
        return ret;
 }
 
-/**
- * ice_alloc_rx_bufs_fast_zc - allocate zero copy bufs in the hot path
- * @rx_ring: Rx ring
- * @count: number of bufs to allocate
- *
- * Returns false on success, true on failure.
- */
-static bool ice_alloc_rx_bufs_fast_zc(struct ice_ring *rx_ring, u16 count)
-{
-       return ice_alloc_rx_bufs_zc(rx_ring, count,
-                                   ice_alloc_buf_fast_zc);
-}
-
-/**
- * ice_alloc_rx_bufs_slow_zc - allocate zero copy bufs in the slow path
- * @rx_ring: Rx ring
- * @count: number of bufs to allocate
- *
- * Returns false on success, true on failure.
- */
-bool ice_alloc_rx_bufs_slow_zc(struct ice_ring *rx_ring, u16 count)
-{
-       return ice_alloc_rx_bufs_zc(rx_ring, count,
-                                   ice_alloc_buf_slow_zc);
-}
-
 /**
  * ice_bump_ntc - Bump the next_to_clean counter of an Rx ring
  * @rx_ring: Rx ring
@@ -691,77 +464,22 @@ static void ice_bump_ntc(struct ice_ring *rx_ring)
        prefetch(ICE_RX_DESC(rx_ring, ntc));
 }
 
-/**
- * ice_get_rx_buf_zc - Fetch the current Rx buffer
- * @rx_ring: Rx ring
- * @size: size of a buffer
- *
- * This function returns the current, received Rx buffer and does
- * DMA synchronization.
- *
- * Returns a pointer to the received Rx buffer.
- */
-static struct ice_rx_buf *ice_get_rx_buf_zc(struct ice_ring *rx_ring, int size)
-{
-       struct ice_rx_buf *rx_buf;
-
-       rx_buf = &rx_ring->rx_buf[rx_ring->next_to_clean];
-
-       dma_sync_single_range_for_cpu(rx_ring->dev, rx_buf->dma, 0,
-                                     size, DMA_BIDIRECTIONAL);
-
-       return rx_buf;
-}
-
-/**
- * ice_reuse_rx_buf_zc - reuse an Rx buffer
- * @rx_ring: Rx ring
- * @old_buf: The buffer to recycle
- *
- * This function recycles a finished Rx buffer, and places it on the recycle
- * queue (next_to_alloc).
- */
-static void
-ice_reuse_rx_buf_zc(struct ice_ring *rx_ring, struct ice_rx_buf *old_buf)
-{
-       unsigned long mask = (unsigned long)rx_ring->xsk_umem->chunk_mask;
-       u64 hr = rx_ring->xsk_umem->headroom + XDP_PACKET_HEADROOM;
-       u16 nta = rx_ring->next_to_alloc;
-       struct ice_rx_buf *new_buf;
-
-       new_buf = &rx_ring->rx_buf[nta++];
-       rx_ring->next_to_alloc = (nta < rx_ring->count) ? nta : 0;
-
-       new_buf->dma = old_buf->dma & mask;
-       new_buf->dma += hr;
-
-       new_buf->addr = (void *)((unsigned long)old_buf->addr & mask);
-       new_buf->addr += hr;
-
-       new_buf->handle = old_buf->handle & mask;
-       new_buf->handle += rx_ring->xsk_umem->headroom;
-
-       old_buf->addr = NULL;
-}
-
 /**
  * ice_construct_skb_zc - Create an sk_buff from zero-copy buffer
  * @rx_ring: Rx ring
  * @rx_buf: zero-copy Rx buffer
- * @xdp: XDP buffer
  *
  * This function allocates a new skb from a zero-copy Rx buffer.
  *
  * Returns the skb on success, NULL on failure.
  */
 static struct sk_buff *
-ice_construct_skb_zc(struct ice_ring *rx_ring, struct ice_rx_buf *rx_buf,
-                    struct xdp_buff *xdp)
+ice_construct_skb_zc(struct ice_ring *rx_ring, struct ice_rx_buf *rx_buf)
 {
-       unsigned int metasize = xdp->data - xdp->data_meta;
-       unsigned int datasize = xdp->data_end - xdp->data;
-       unsigned int datasize_hard = xdp->data_end -
-                                    xdp->data_hard_start;
+       unsigned int metasize = rx_buf->xdp->data - rx_buf->xdp->data_meta;
+       unsigned int datasize = rx_buf->xdp->data_end - rx_buf->xdp->data;
+       unsigned int datasize_hard = rx_buf->xdp->data_end -
+                                    rx_buf->xdp->data_hard_start;
        struct sk_buff *skb;
 
        skb = __napi_alloc_skb(&rx_ring->q_vector->napi, datasize_hard,
@@ -769,13 +487,13 @@ ice_construct_skb_zc(struct ice_ring *rx_ring, struct ice_rx_buf *rx_buf,
        if (unlikely(!skb))
                return NULL;
 
-       skb_reserve(skb, xdp->data - xdp->data_hard_start);
-       memcpy(__skb_put(skb, datasize), xdp->data, datasize);
+       skb_reserve(skb, rx_buf->xdp->data - rx_buf->xdp->data_hard_start);
+       memcpy(__skb_put(skb, datasize), rx_buf->xdp->data, datasize);
        if (metasize)
                skb_metadata_set(skb, metasize);
 
-       ice_reuse_rx_buf_zc(rx_ring, rx_buf);
-
+       xsk_buff_free(rx_buf->xdp);
+       rx_buf->xdp = NULL;
        return skb;
 }
 
@@ -802,7 +520,6 @@ ice_run_xdp_zc(struct ice_ring *rx_ring, struct xdp_buff *xdp)
        }
 
        act = bpf_prog_run_xdp(xdp_prog, xdp);
-       xdp->handle += xdp->data - xdp->data_hard_start;
        switch (act) {
        case XDP_PASS:
                break;
@@ -840,13 +557,8 @@ int ice_clean_rx_irq_zc(struct ice_ring *rx_ring, int budget)
 {
        unsigned int total_rx_bytes = 0, total_rx_packets = 0;
        u16 cleaned_count = ICE_DESC_UNUSED(rx_ring);
-       struct xdp_umem *umem = rx_ring->xsk_umem;
        unsigned int xdp_xmit = 0;
        bool failure = false;
-       struct xdp_buff xdp;
-
-       xdp.rxq = &rx_ring->xdp_rxq;
-       xdp.frame_sz = xsk_umem_xdp_frame_sz(umem);
 
        while (likely(total_rx_packets < (unsigned int)budget)) {
                union ice_32b_rx_flex_desc *rx_desc;
@@ -858,8 +570,8 @@ int ice_clean_rx_irq_zc(struct ice_ring *rx_ring, int budget)
                u8 rx_ptype;
 
                if (cleaned_count >= ICE_RX_BUF_WRITE) {
-                       failure |= ice_alloc_rx_bufs_fast_zc(rx_ring,
-                                                            cleaned_count);
+                       failure |= ice_alloc_rx_bufs_zc(rx_ring,
+                                                       cleaned_count);
                        cleaned_count = 0;
                }
 
@@ -880,25 +592,19 @@ int ice_clean_rx_irq_zc(struct ice_ring *rx_ring, int budget)
                if (!size)
                        break;
 
-               rx_buf = ice_get_rx_buf_zc(rx_ring, size);
-               if (!rx_buf->addr)
-                       break;
 
-               xdp.data = rx_buf->addr;
-               xdp.data_meta = xdp.data;
-               xdp.data_hard_start = xdp.data - XDP_PACKET_HEADROOM;
-               xdp.data_end = xdp.data + size;
-               xdp.handle = rx_buf->handle;
+               rx_buf = &rx_ring->rx_buf[rx_ring->next_to_clean];
+               rx_buf->xdp->data_end = rx_buf->xdp->data + size;
+               xsk_buff_dma_sync_for_cpu(rx_buf->xdp);
 
-               xdp_res = ice_run_xdp_zc(rx_ring, &xdp);
+               xdp_res = ice_run_xdp_zc(rx_ring, rx_buf->xdp);
                if (xdp_res) {
-                       if (xdp_res & (ICE_XDP_TX | ICE_XDP_REDIR)) {
+                       if (xdp_res & (ICE_XDP_TX | ICE_XDP_REDIR))
                                xdp_xmit |= xdp_res;
-                               rx_buf->addr = NULL;
-                       } else {
-                               ice_reuse_rx_buf_zc(rx_ring, rx_buf);
-                       }
+                       else
+                               xsk_buff_free(rx_buf->xdp);
 
+                       rx_buf->xdp = NULL;
                        total_rx_bytes += size;
                        total_rx_packets++;
                        cleaned_count++;
@@ -908,7 +614,7 @@ int ice_clean_rx_irq_zc(struct ice_ring *rx_ring, int budget)
                }
 
                /* XDP_PASS path */
-               skb = ice_construct_skb_zc(rx_ring, rx_buf, &xdp);
+               skb = ice_construct_skb_zc(rx_ring, rx_buf);
                if (!skb) {
                        rx_ring->rx_stats.alloc_buf_failed++;
                        break;
@@ -979,10 +685,9 @@ static bool ice_xmit_zc(struct ice_ring *xdp_ring, int budget)
                if (!xsk_umem_consume_tx(xdp_ring->xsk_umem, &desc))
                        break;
 
-               dma = xdp_umem_get_dma(xdp_ring->xsk_umem, desc.addr);
-
-               dma_sync_single_for_device(xdp_ring->dev, dma, desc.len,
-                                          DMA_BIDIRECTIONAL);
+               dma = xsk_buff_raw_get_dma(xdp_ring->xsk_umem, desc.addr);
+               xsk_buff_raw_dma_sync_for_device(xdp_ring->xsk_umem, dma,
+                                                desc.len);
 
                tx_buf->bytecount = desc.len;
 
@@ -1165,11 +870,10 @@ void ice_xsk_clean_rx_ring(struct ice_ring *rx_ring)
        for (i = 0; i < rx_ring->count; i++) {
                struct ice_rx_buf *rx_buf = &rx_ring->rx_buf[i];
 
-               if (!rx_buf->addr)
+               if (!rx_buf->xdp)
                        continue;
 
-               xsk_umem_fq_reuse(rx_ring->xsk_umem, rx_buf->handle);
-               rx_buf->addr = NULL;
+               rx_buf->xdp = NULL;
        }
 }
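
The ice_xsk.c hunks above drop the driver's private UMEM DMA mapping, recycle queue and duplicate fast/slow allocators in favour of the core xsk_buff helpers now used throughout the file: xsk_buff_dma_map()/xsk_buff_dma_unmap() map the UMEM once, xsk_buff_alloc() hands out premapped xdp_buff entries, xsk_buff_xdp_get_dma() yields the address to program into the Rx descriptor, and xsk_buff_free() returns a buffer to the pool. A minimal sketch of that refill pattern, using placeholder ring structures rather than the driver's real types, could look like:

#include <linux/types.h>
#include <net/xdp_sock_drv.h>   /* xsk_buff_alloc(), xsk_buff_xdp_get_dma(), xsk_buff_free() */

/* Placeholder ring layout; only the xsk_buff_* calls mirror the driver. */
struct example_rx_ring {
        struct xdp_umem *xsk_umem;      /* already mapped with xsk_buff_dma_map() */
        struct xdp_buff **bufs;         /* one slot per Rx descriptor */
        __le64 *pkt_addr;               /* per-descriptor packet address field */
        u16 count;
        u16 next_to_use;
};

/* Returns false if every requested buffer was posted, true on shortfall. */
static bool example_alloc_rx_bufs_zc(struct example_rx_ring *ring, u16 count)
{
        u16 ntu = ring->next_to_use;

        while (count--) {
                struct xdp_buff *xdp = xsk_buff_alloc(ring->xsk_umem);

                if (!xdp)
                        return true;    /* fill queue empty; retry on the next poll */

                ring->bufs[ntu] = xdp;
                /* The UMEM is premapped, so the DMA address is a plain lookup. */
                ring->pkt_addr[ntu] = cpu_to_le64(xsk_buff_xdp_get_dma(xdp));

                ntu = (ntu + 1 < ring->count) ? ntu + 1 : 0;
        }

        ring->next_to_use = ntu;
        return false;
}

Note that the explicit dma_sync_single_range_for_device() call also disappears from the refill path; the only sync the Rx path keeps is xsk_buff_dma_sync_for_cpu() before running XDP.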
 
index 8a4ba7c6d549b656ce47f05d44dfe8f2f7c5d720..fc1a06b4df364cd971647e640f9546937d11e700 100644 (file)
@@ -10,11 +10,10 @@ struct ice_vsi;
 
 #ifdef CONFIG_XDP_SOCKETS
 int ice_xsk_umem_setup(struct ice_vsi *vsi, struct xdp_umem *umem, u16 qid);
-void ice_zca_free(struct zero_copy_allocator *zca, unsigned long handle);
 int ice_clean_rx_irq_zc(struct ice_ring *rx_ring, int budget);
 bool ice_clean_tx_irq_zc(struct ice_ring *xdp_ring, int budget);
 int ice_xsk_wakeup(struct net_device *netdev, u32 queue_id, u32 flags);
-bool ice_alloc_rx_bufs_slow_zc(struct ice_ring *rx_ring, u16 count);
+bool ice_alloc_rx_bufs_zc(struct ice_ring *rx_ring, u16 count);
 bool ice_xsk_any_rx_ring_ena(struct ice_vsi *vsi);
 void ice_xsk_clean_rx_ring(struct ice_ring *rx_ring);
 void ice_xsk_clean_xdp_ring(struct ice_ring *xdp_ring);
@@ -27,12 +26,6 @@ ice_xsk_umem_setup(struct ice_vsi __always_unused *vsi,
        return -EOPNOTSUPP;
 }
 
-static inline void
-ice_zca_free(struct zero_copy_allocator __always_unused *zca,
-            unsigned long __always_unused handle)
-{
-}
-
 static inline int
 ice_clean_rx_irq_zc(struct ice_ring __always_unused *rx_ring,
                    int __always_unused budget)
@@ -48,8 +41,8 @@ ice_clean_tx_irq_zc(struct ice_ring __always_unused *xdp_ring,
 }
 
 static inline bool
-ice_alloc_rx_bufs_slow_zc(struct ice_ring __always_unused *rx_ring,
-                         u16 __always_unused count)
+ice_alloc_rx_bufs_zc(struct ice_ring __always_unused *rx_ring,
+                    u16 __always_unused count)
 {
        return false;
 }
index 39d3b76a6f5d4da80bb888c4d1c9c4ff1a42c23a..2cd003c5ad4319be1a2eae345e103075b95c9905 100644 (file)
@@ -143,7 +143,8 @@ static int igb_get_link_ksettings(struct net_device *netdev,
        u32 speed;
        u32 supported, advertising;
 
-       status = rd32(E1000_STATUS);
+       status = pm_runtime_suspended(&adapter->pdev->dev) ?
+                0 : rd32(E1000_STATUS);
        if (hw->phy.media_type == e1000_media_type_copper) {
 
                supported = (SUPPORTED_10baseT_Half |
index 812e1cd695cf766660db027981a7a4f090b9855e..14f9edaaaf834a4490b242aa5778f57ee1f5a71d 100644 (file)
@@ -16,8 +16,7 @@
 
 #include "igc_hw.h"
 
-/* forward declaration */
-void igc_set_ethtool_ops(struct net_device *);
+void igc_ethtool_set_ops(struct net_device *);
 
 /* Transmit and receive queues */
 #define IGC_MAX_RX_QUEUES              4
@@ -29,6 +28,11 @@ void igc_set_ethtool_ops(struct net_device *);
 #define MAX_ETYPE_FILTER               8
 #define IGC_RETA_SIZE                  128
 
+enum igc_mac_filter_type {
+       IGC_MAC_FILTER_TYPE_DST = 0,
+       IGC_MAC_FILTER_TYPE_SRC
+};
+
 struct igc_tx_queue_stats {
        u64 packets;
        u64 bytes;
@@ -183,14 +187,12 @@ struct igc_adapter {
        u32 rss_queues;
        u32 rss_indir_tbl_init;
 
-       /* RX network flow classification support */
-       struct hlist_head nfc_filter_list;
-       unsigned int nfc_filter_count;
-
-       /* lock for RX network flow classification filter */
-       spinlock_t nfc_lock;
-
-       struct igc_mac_addr *mac_table;
+       /* Any access to elements in nfc_rule_list is protected by the
+        * nfc_rule_lock.
+        */
+       struct mutex nfc_rule_lock;
+       struct list_head nfc_rule_list;
+       unsigned int nfc_rule_count;
 
        u8 rss_indir_tbl[IGC_RETA_SIZE];
 
@@ -230,15 +232,6 @@ void igc_write_rss_indir_tbl(struct igc_adapter *adapter);
 bool igc_has_link(struct igc_adapter *adapter);
 void igc_reset(struct igc_adapter *adapter);
 int igc_set_spd_dplx(struct igc_adapter *adapter, u32 spd, u8 dplx);
-int igc_add_mac_filter(struct igc_adapter *adapter, const u8 *addr,
-                      const s8 queue, const u8 flags);
-int igc_del_mac_filter(struct igc_adapter *adapter, const u8 *addr,
-                      const u8 flags);
-int igc_add_vlan_prio_filter(struct igc_adapter *adapter, int prio,
-                            int queue);
-void igc_del_vlan_prio_filter(struct igc_adapter *adapter, int prio);
-int igc_add_etype_filter(struct igc_adapter *adapter, u16 etype, int queue);
-int igc_del_etype_filter(struct igc_adapter *adapter, u16 etype);
 void igc_update_stats(struct igc_adapter *adapter);
 
 /* igc_dump declarations */
@@ -449,39 +442,22 @@ enum igc_filter_match_flags {
        IGC_FILTER_FLAG_DST_MAC_ADDR =  0x8,
 };
 
-/* RX network flow classification data structure */
-struct igc_nfc_input {
-       /* Byte layout in order, all values with MSB first:
-        * match_flags - 1 byte
-        * etype - 2 bytes
-        * vlan_tci - 2 bytes
-        */
+struct igc_nfc_filter {
        u8 match_flags;
-       __be16 etype;
-       __be16 vlan_tci;
+       u16 etype;
+       u16 vlan_tci;
        u8 src_addr[ETH_ALEN];
        u8 dst_addr[ETH_ALEN];
 };
 
-struct igc_nfc_filter {
-       struct hlist_node nfc_node;
-       struct igc_nfc_input filter;
-       unsigned long cookie;
-       u16 sw_idx;
+struct igc_nfc_rule {
+       struct list_head list;
+       struct igc_nfc_filter filter;
+       u32 location;
        u16 action;
 };
 
-struct igc_mac_addr {
-       u8 addr[ETH_ALEN];
-       s8 queue;
-       u8 state; /* bitmask */
-};
-
-#define IGC_MAC_STATE_DEFAULT          0x1
-#define IGC_MAC_STATE_IN_USE           0x2
-#define IGC_MAC_STATE_SRC_ADDR         0x4
-
-#define IGC_MAX_RXNFC_FILTERS          16
+#define IGC_MAX_RXNFC_RULES            16
 
 /* igc_desc_unused - calculate if we have unused descriptors */
 static inline u16 igc_desc_unused(const struct igc_ring *ring)
@@ -557,12 +533,11 @@ static inline s32 igc_read_phy_reg(struct igc_hw *hw, u32 offset, u16 *data)
        return 0;
 }
 
-/* forward declaration */
 void igc_reinit_locked(struct igc_adapter *);
-int igc_add_filter(struct igc_adapter *adapter,
-                  struct igc_nfc_filter *input);
-int igc_erase_filter(struct igc_adapter *adapter,
-                    struct igc_nfc_filter *input);
+struct igc_nfc_rule *igc_get_nfc_rule(struct igc_adapter *adapter,
+                                     u32 location);
+int igc_add_nfc_rule(struct igc_adapter *adapter, struct igc_nfc_rule *rule);
+void igc_del_nfc_rule(struct igc_adapter *adapter, struct igc_nfc_rule *rule);
 
 void igc_ptp_init(struct igc_adapter *adapter);
 void igc_ptp_reset(struct igc_adapter *adapter);
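
The igc.h changes above retire the driver-private MAC address table and the hlist-based NFC filters in favour of a list of igc_nfc_rule entries serialized by the new nfc_rule_lock mutex, with get/add/del helpers declared at the bottom of the header. Purely as a hedged sketch (the real igc_get_nfc_rule() is only declared here; its implementation is outside this diff and may differ), a location lookup under that lock could be as simple as:

#include <linux/list.h>

/* Hypothetical lookup by rule location; the caller is assumed to hold
 * adapter->nfc_rule_lock, matching the locking comment added above.
 */
static struct igc_nfc_rule *example_get_nfc_rule(struct igc_adapter *adapter,
                                                 u32 location)
{
        struct igc_nfc_rule *rule;

        list_for_each_entry(rule, &adapter->nfc_rule_list, list)
                if (rule->location == location)
                        return rule;

        return NULL;
}

The ethtool paths later in this diff take the same mutex around rule lookup, insertion and removal.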
index 51d8a15e239c19593f3cdff4e90c4562d4f99bcf..3d8d40d6fa3f3bf8f7b134d43ecd0cf28480f33a 100644 (file)
@@ -62,6 +62,9 @@
  * (RAR[15]) for our directed address used by controllers with
  * manageability enabled, allowing us room for 15 multicast addresses.
  */
+#define IGC_RAH_RAH_MASK       0x0000FFFF
+#define IGC_RAH_ASEL_MASK      0x00030000
+#define IGC_RAH_ASEL_SRC_ADDR  BIT(16)
 #define IGC_RAH_QSEL_MASK      0x000C0000
 #define IGC_RAH_QSEL_SHIFT     18
 #define IGC_RAH_QSEL_ENABLE    BIT(28)
 #define IGC_TXD_POPTS_IXSM     0x01       /* Insert IP checksum */
 #define IGC_TXD_POPTS_TXSM     0x02       /* Insert TCP/UDP checksum */
 #define IGC_TXD_CMD_EOP                0x01000000 /* End of Packet */
-#define IGC_TXD_CMD_IFCS       0x02000000 /* Insert FCS (Ethernet CRC) */
 #define IGC_TXD_CMD_IC         0x04000000 /* Insert Checksum */
-#define IGC_TXD_CMD_RS         0x08000000 /* Report Status */
-#define IGC_TXD_CMD_RPS                0x10000000 /* Report Packet Sent */
 #define IGC_TXD_CMD_DEXT       0x20000000 /* Desc extension (0 = legacy) */
 #define IGC_TXD_CMD_VLE                0x40000000 /* Add VLAN tag */
-#define IGC_TXD_CMD_IDE                0x80000000 /* Enable Tidv register */
 #define IGC_TXD_STAT_DD                0x00000001 /* Descriptor Done */
 #define IGC_TXD_STAT_EC                0x00000002 /* Excess Collisions */
 #define IGC_TXD_STAT_LC                0x00000004 /* Late Collisions */
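
The new IGC_RAH_ASEL bits above allow a receive address register to match on the source MAC instead of the destination MAC, which backs the IGC_MAC_FILTER_TYPE_SRC filters introduced in igc.h. Purely as an illustration of how the masks compose (this is not driver code, and it assumes the usual igb/igc RAR layout in which MAC bytes 4 and 5 occupy the low 16 bits of RAH, as IGC_RAH_RAH_MASK suggests):

/* Sketch: derive the RAH value for a source-address filter from the
 * masks added above; addr[] is the 6-byte MAC in its usual byte order.
 */
static u32 example_rah_for_src_filter(const u8 *addr)
{
        u32 rah = ((u32)addr[5] << 8) | addr[4];        /* bits 15:0 hold MAC bytes 4-5 */

        rah &= ~IGC_RAH_ASEL_MASK;                      /* clear the address-select field */
        rah |= IGC_RAH_ASEL_SRC_ADDR;                   /* select matching on source address */

        return rah;
}

The driver's real programming of these bits happens in igc_set_mac_filter_hw(), whose updated signature appears at the end of this diff.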
index c6586e2be3a8396211bcd4564c7c154f24dc1504..946e775e34aea6d60535374fa8613ce659d042f6 100644 (file)
@@ -124,8 +124,8 @@ static const char igc_priv_flags_strings[][ETH_GSTRING_LEN] = {
 
 #define IGC_PRIV_FLAGS_STR_LEN ARRAY_SIZE(igc_priv_flags_strings)
 
-static void igc_get_drvinfo(struct net_device *netdev,
-                           struct ethtool_drvinfo *drvinfo)
+static void igc_ethtool_get_drvinfo(struct net_device *netdev,
+                                   struct ethtool_drvinfo *drvinfo)
 {
        struct igc_adapter *adapter = netdev_priv(netdev);
 
@@ -139,13 +139,13 @@ static void igc_get_drvinfo(struct net_device *netdev,
        drvinfo->n_priv_flags = IGC_PRIV_FLAGS_STR_LEN;
 }
 
-static int igc_get_regs_len(struct net_device *netdev)
+static int igc_ethtool_get_regs_len(struct net_device *netdev)
 {
        return IGC_REGS_LEN * sizeof(u32);
 }
 
-static void igc_get_regs(struct net_device *netdev,
-                        struct ethtool_regs *regs, void *p)
+static void igc_ethtool_get_regs(struct net_device *netdev,
+                                struct ethtool_regs *regs, void *p)
 {
        struct igc_adapter *adapter = netdev_priv(netdev);
        struct igc_hw *hw = &adapter->hw;
@@ -323,7 +323,8 @@ static void igc_get_regs(struct net_device *netdev,
                regs_buff[205 + i] = rd32(IGC_ETQF(i));
 }
 
-static void igc_get_wol(struct net_device *netdev, struct ethtool_wolinfo *wol)
+static void igc_ethtool_get_wol(struct net_device *netdev,
+                               struct ethtool_wolinfo *wol)
 {
        struct igc_adapter *adapter = netdev_priv(netdev);
 
@@ -354,7 +355,8 @@ static void igc_get_wol(struct net_device *netdev, struct ethtool_wolinfo *wol)
                wol->wolopts |= WAKE_PHY;
 }
 
-static int igc_set_wol(struct net_device *netdev, struct ethtool_wolinfo *wol)
+static int igc_ethtool_set_wol(struct net_device *netdev,
+                              struct ethtool_wolinfo *wol)
 {
        struct igc_adapter *adapter = netdev_priv(netdev);
 
@@ -382,21 +384,21 @@ static int igc_set_wol(struct net_device *netdev, struct ethtool_wolinfo *wol)
        return 0;
 }
 
-static u32 igc_get_msglevel(struct net_device *netdev)
+static u32 igc_ethtool_get_msglevel(struct net_device *netdev)
 {
        struct igc_adapter *adapter = netdev_priv(netdev);
 
        return adapter->msg_enable;
 }
 
-static void igc_set_msglevel(struct net_device *netdev, u32 data)
+static void igc_ethtool_set_msglevel(struct net_device *netdev, u32 data)
 {
        struct igc_adapter *adapter = netdev_priv(netdev);
 
        adapter->msg_enable = data;
 }
 
-static int igc_nway_reset(struct net_device *netdev)
+static int igc_ethtool_nway_reset(struct net_device *netdev)
 {
        struct igc_adapter *adapter = netdev_priv(netdev);
 
@@ -405,7 +407,7 @@ static int igc_nway_reset(struct net_device *netdev)
        return 0;
 }
 
-static u32 igc_get_link(struct net_device *netdev)
+static u32 igc_ethtool_get_link(struct net_device *netdev)
 {
        struct igc_adapter *adapter = netdev_priv(netdev);
        struct igc_mac_info *mac = &adapter->hw.mac;
@@ -422,15 +424,15 @@ static u32 igc_get_link(struct net_device *netdev)
        return igc_has_link(adapter);
 }
 
-static int igc_get_eeprom_len(struct net_device *netdev)
+static int igc_ethtool_get_eeprom_len(struct net_device *netdev)
 {
        struct igc_adapter *adapter = netdev_priv(netdev);
 
        return adapter->hw.nvm.word_size * 2;
 }
 
-static int igc_get_eeprom(struct net_device *netdev,
-                         struct ethtool_eeprom *eeprom, u8 *bytes)
+static int igc_ethtool_get_eeprom(struct net_device *netdev,
+                                 struct ethtool_eeprom *eeprom, u8 *bytes)
 {
        struct igc_adapter *adapter = netdev_priv(netdev);
        struct igc_hw *hw = &adapter->hw;
@@ -476,8 +478,8 @@ static int igc_get_eeprom(struct net_device *netdev,
        return ret_val;
 }
 
-static int igc_set_eeprom(struct net_device *netdev,
-                         struct ethtool_eeprom *eeprom, u8 *bytes)
+static int igc_ethtool_set_eeprom(struct net_device *netdev,
+                                 struct ethtool_eeprom *eeprom, u8 *bytes)
 {
        struct igc_adapter *adapter = netdev_priv(netdev);
        struct igc_hw *hw = &adapter->hw;
@@ -544,8 +546,8 @@ static int igc_set_eeprom(struct net_device *netdev,
        return ret_val;
 }
 
-static void igc_get_ringparam(struct net_device *netdev,
-                             struct ethtool_ringparam *ring)
+static void igc_ethtool_get_ringparam(struct net_device *netdev,
+                                     struct ethtool_ringparam *ring)
 {
        struct igc_adapter *adapter = netdev_priv(netdev);
 
@@ -555,8 +557,8 @@ static void igc_get_ringparam(struct net_device *netdev,
        ring->tx_pending = adapter->tx_ring_count;
 }
 
-static int igc_set_ringparam(struct net_device *netdev,
-                            struct ethtool_ringparam *ring)
+static int igc_ethtool_set_ringparam(struct net_device *netdev,
+                                    struct ethtool_ringparam *ring)
 {
        struct igc_adapter *adapter = netdev_priv(netdev);
        struct igc_ring *temp_ring;
@@ -670,8 +672,8 @@ clear_reset:
        return err;
 }
 
-static void igc_get_pauseparam(struct net_device *netdev,
-                              struct ethtool_pauseparam *pause)
+static void igc_ethtool_get_pauseparam(struct net_device *netdev,
+                                      struct ethtool_pauseparam *pause)
 {
        struct igc_adapter *adapter = netdev_priv(netdev);
        struct igc_hw *hw = &adapter->hw;
@@ -689,8 +691,8 @@ static void igc_get_pauseparam(struct net_device *netdev,
        }
 }
 
-static int igc_set_pauseparam(struct net_device *netdev,
-                             struct ethtool_pauseparam *pause)
+static int igc_ethtool_set_pauseparam(struct net_device *netdev,
+                                     struct ethtool_pauseparam *pause)
 {
        struct igc_adapter *adapter = netdev_priv(netdev);
        struct igc_hw *hw = &adapter->hw;
@@ -729,7 +731,8 @@ static int igc_set_pauseparam(struct net_device *netdev,
        return retval;
 }
 
-static void igc_get_strings(struct net_device *netdev, u32 stringset, u8 *data)
+static void igc_ethtool_get_strings(struct net_device *netdev, u32 stringset,
+                                   u8 *data)
 {
        struct igc_adapter *adapter = netdev_priv(netdev);
        u8 *p = data;
@@ -780,7 +783,7 @@ static void igc_get_strings(struct net_device *netdev, u32 stringset, u8 *data)
        }
 }
 
-static int igc_get_sset_count(struct net_device *netdev, int sset)
+static int igc_ethtool_get_sset_count(struct net_device *netdev, int sset)
 {
        switch (sset) {
        case ETH_SS_STATS:
@@ -794,7 +797,7 @@ static int igc_get_sset_count(struct net_device *netdev, int sset)
        }
 }
 
-static void igc_get_ethtool_stats(struct net_device *netdev,
+static void igc_ethtool_get_stats(struct net_device *netdev,
                                  struct ethtool_stats *stats, u64 *data)
 {
        struct igc_adapter *adapter = netdev_priv(netdev);
@@ -850,8 +853,8 @@ static void igc_get_ethtool_stats(struct net_device *netdev,
        spin_unlock(&adapter->stats64_lock);
 }
 
-static int igc_get_coalesce(struct net_device *netdev,
-                           struct ethtool_coalesce *ec)
+static int igc_ethtool_get_coalesce(struct net_device *netdev,
+                                   struct ethtool_coalesce *ec)
 {
        struct igc_adapter *adapter = netdev_priv(netdev);
 
@@ -870,8 +873,8 @@ static int igc_get_coalesce(struct net_device *netdev,
        return 0;
 }
 
-static int igc_set_coalesce(struct net_device *netdev,
-                           struct ethtool_coalesce *ec)
+static int igc_ethtool_set_coalesce(struct net_device *netdev,
+                                   struct ethtool_coalesce *ec)
 {
        struct igc_adapter *adapter = netdev_priv(netdev);
        int i;
@@ -928,81 +931,83 @@ static int igc_set_coalesce(struct net_device *netdev,
 }
 
 #define ETHER_TYPE_FULL_MASK ((__force __be16)~0)
-static int igc_get_ethtool_nfc_entry(struct igc_adapter *adapter,
-                                    struct ethtool_rxnfc *cmd)
+static int igc_ethtool_get_nfc_rule(struct igc_adapter *adapter,
+                                   struct ethtool_rxnfc *cmd)
 {
        struct ethtool_rx_flow_spec *fsp = &cmd->fs;
-       struct igc_nfc_filter *rule = NULL;
+       struct igc_nfc_rule *rule = NULL;
 
-       /* report total rule count */
-       cmd->data = IGC_MAX_RXNFC_FILTERS;
+       cmd->data = IGC_MAX_RXNFC_RULES;
 
-       hlist_for_each_entry(rule, &adapter->nfc_filter_list, nfc_node) {
-               if (fsp->location <= rule->sw_idx)
-                       break;
+       mutex_lock(&adapter->nfc_rule_lock);
+
+       rule = igc_get_nfc_rule(adapter, fsp->location);
+       if (!rule)
+               goto out;
+
+       fsp->flow_type = ETHER_FLOW;
+       fsp->ring_cookie = rule->action;
+
+       if (rule->filter.match_flags & IGC_FILTER_FLAG_ETHER_TYPE) {
+               fsp->h_u.ether_spec.h_proto = htons(rule->filter.etype);
+               fsp->m_u.ether_spec.h_proto = ETHER_TYPE_FULL_MASK;
        }
 
-       if (!rule || fsp->location != rule->sw_idx)
-               return -EINVAL;
+       if (rule->filter.match_flags & IGC_FILTER_FLAG_VLAN_TCI) {
+               fsp->flow_type |= FLOW_EXT;
+               fsp->h_ext.vlan_tci = htons(rule->filter.vlan_tci);
+               fsp->m_ext.vlan_tci = htons(VLAN_PRIO_MASK);
+       }
 
-       if (rule->filter.match_flags) {
-               fsp->flow_type = ETHER_FLOW;
-               fsp->ring_cookie = rule->action;
-               if (rule->filter.match_flags & IGC_FILTER_FLAG_ETHER_TYPE) {
-                       fsp->h_u.ether_spec.h_proto = rule->filter.etype;
-                       fsp->m_u.ether_spec.h_proto = ETHER_TYPE_FULL_MASK;
-               }
-               if (rule->filter.match_flags & IGC_FILTER_FLAG_VLAN_TCI) {
-                       fsp->flow_type |= FLOW_EXT;
-                       fsp->h_ext.vlan_tci = rule->filter.vlan_tci;
-                       fsp->m_ext.vlan_tci = htons(VLAN_PRIO_MASK);
-               }
-               if (rule->filter.match_flags & IGC_FILTER_FLAG_DST_MAC_ADDR) {
-                       ether_addr_copy(fsp->h_u.ether_spec.h_dest,
-                                       rule->filter.dst_addr);
-                       /* As we only support matching by the full
-                        * mask, return the mask to userspace
-                        */
-                       eth_broadcast_addr(fsp->m_u.ether_spec.h_dest);
-               }
-               if (rule->filter.match_flags & IGC_FILTER_FLAG_SRC_MAC_ADDR) {
-                       ether_addr_copy(fsp->h_u.ether_spec.h_source,
-                                       rule->filter.src_addr);
-                       /* As we only support matching by the full
-                        * mask, return the mask to userspace
-                        */
-                       eth_broadcast_addr(fsp->m_u.ether_spec.h_source);
-               }
+       if (rule->filter.match_flags & IGC_FILTER_FLAG_DST_MAC_ADDR) {
+               ether_addr_copy(fsp->h_u.ether_spec.h_dest,
+                               rule->filter.dst_addr);
+               eth_broadcast_addr(fsp->m_u.ether_spec.h_dest);
+       }
 
-               return 0;
+       if (rule->filter.match_flags & IGC_FILTER_FLAG_SRC_MAC_ADDR) {
+               ether_addr_copy(fsp->h_u.ether_spec.h_source,
+                               rule->filter.src_addr);
+               eth_broadcast_addr(fsp->m_u.ether_spec.h_source);
        }
+
+       mutex_unlock(&adapter->nfc_rule_lock);
+       return 0;
+
+out:
+       mutex_unlock(&adapter->nfc_rule_lock);
        return -EINVAL;
 }
 
-static int igc_get_ethtool_nfc_all(struct igc_adapter *adapter,
-                                  struct ethtool_rxnfc *cmd,
-                                  u32 *rule_locs)
+static int igc_ethtool_get_nfc_rules(struct igc_adapter *adapter,
+                                    struct ethtool_rxnfc *cmd,
+                                    u32 *rule_locs)
 {
-       struct igc_nfc_filter *rule;
+       struct igc_nfc_rule *rule;
        int cnt = 0;
 
-       /* report total rule count */
-       cmd->data = IGC_MAX_RXNFC_FILTERS;
+       cmd->data = IGC_MAX_RXNFC_RULES;
+
+       mutex_lock(&adapter->nfc_rule_lock);
 
-       hlist_for_each_entry(rule, &adapter->nfc_filter_list, nfc_node) {
-               if (cnt == cmd->rule_cnt)
+       list_for_each_entry(rule, &adapter->nfc_rule_list, list) {
+               if (cnt == cmd->rule_cnt) {
+                       mutex_unlock(&adapter->nfc_rule_lock);
                        return -EMSGSIZE;
-               rule_locs[cnt] = rule->sw_idx;
+               }
+               rule_locs[cnt] = rule->location;
                cnt++;
        }
 
+       mutex_unlock(&adapter->nfc_rule_lock);
+
        cmd->rule_cnt = cnt;
 
        return 0;
 }
 
-static int igc_get_rss_hash_opts(struct igc_adapter *adapter,
-                                struct ethtool_rxnfc *cmd)
+static int igc_ethtool_get_rss_hash_opts(struct igc_adapter *adapter,
+                                        struct ethtool_rxnfc *cmd)
 {
        cmd->data = 0;
 
@@ -1051,41 +1056,33 @@ static int igc_get_rss_hash_opts(struct igc_adapter *adapter,
        return 0;
 }
 
-static int igc_get_rxnfc(struct net_device *dev, struct ethtool_rxnfc *cmd,
-                        u32 *rule_locs)
+static int igc_ethtool_get_rxnfc(struct net_device *dev,
+                                struct ethtool_rxnfc *cmd, u32 *rule_locs)
 {
        struct igc_adapter *adapter = netdev_priv(dev);
-       int ret = -EOPNOTSUPP;
 
        switch (cmd->cmd) {
        case ETHTOOL_GRXRINGS:
                cmd->data = adapter->num_rx_queues;
-               ret = 0;
-               break;
+               return 0;
        case ETHTOOL_GRXCLSRLCNT:
-               cmd->rule_cnt = adapter->nfc_filter_count;
-               ret = 0;
-               break;
+               cmd->rule_cnt = adapter->nfc_rule_count;
+               return 0;
        case ETHTOOL_GRXCLSRULE:
-               ret = igc_get_ethtool_nfc_entry(adapter, cmd);
-               break;
+               return igc_ethtool_get_nfc_rule(adapter, cmd);
        case ETHTOOL_GRXCLSRLALL:
-               ret = igc_get_ethtool_nfc_all(adapter, cmd, rule_locs);
-               break;
+               return igc_ethtool_get_nfc_rules(adapter, cmd, rule_locs);
        case ETHTOOL_GRXFH:
-               ret = igc_get_rss_hash_opts(adapter, cmd);
-               break;
+               return igc_ethtool_get_rss_hash_opts(adapter, cmd);
        default:
-               break;
+               return -EOPNOTSUPP;
        }
-
-       return ret;
 }
 
 #define UDP_RSS_FLAGS (IGC_FLAG_RSS_FIELD_IPV4_UDP | \
                       IGC_FLAG_RSS_FIELD_IPV6_UDP)
-static int igc_set_rss_hash_opt(struct igc_adapter *adapter,
-                               struct ethtool_rxnfc *nfc)
+static int igc_ethtool_set_rss_hash_opt(struct igc_adapter *adapter,
+                                       struct ethtool_rxnfc *nfc)
 {
        u32 flags = adapter->flags;
 
@@ -1186,252 +1183,185 @@ static int igc_set_rss_hash_opt(struct igc_adapter *adapter,
        return 0;
 }
 
-int igc_add_filter(struct igc_adapter *adapter, struct igc_nfc_filter *input)
+static void igc_ethtool_init_nfc_rule(struct igc_nfc_rule *rule,
+                                     const struct ethtool_rx_flow_spec *fsp)
 {
-       struct igc_hw *hw = &adapter->hw;
-       int err = -EINVAL;
-
-       if (hw->mac.type == igc_i225 &&
-           !(input->filter.match_flags & ~IGC_FILTER_FLAG_SRC_MAC_ADDR)) {
-               netdev_err(adapter->netdev,
-                          "i225 doesn't support flow classification rules specifying only source addresses\n");
-               return -EOPNOTSUPP;
-       }
+       INIT_LIST_HEAD(&rule->list);
 
-       if (input->filter.match_flags & IGC_FILTER_FLAG_ETHER_TYPE) {
-               u16 etype = ntohs(input->filter.etype);
+       rule->action = fsp->ring_cookie;
+       rule->location = fsp->location;
 
-               err = igc_add_etype_filter(adapter, etype, input->action);
-               if (err)
-                       return err;
+       if ((fsp->flow_type & FLOW_EXT) && fsp->m_ext.vlan_tci) {
+               rule->filter.vlan_tci = ntohs(fsp->h_ext.vlan_tci);
+               rule->filter.match_flags |= IGC_FILTER_FLAG_VLAN_TCI;
        }
 
-       if (input->filter.match_flags & IGC_FILTER_FLAG_DST_MAC_ADDR) {
-               err = igc_add_mac_filter(adapter, input->filter.dst_addr,
-                                        input->action, 0);
-               if (err)
-                       return err;
+       if (fsp->m_u.ether_spec.h_proto == ETHER_TYPE_FULL_MASK) {
+               rule->filter.etype = ntohs(fsp->h_u.ether_spec.h_proto);
+               rule->filter.match_flags = IGC_FILTER_FLAG_ETHER_TYPE;
        }
 
-       if (input->filter.match_flags & IGC_FILTER_FLAG_SRC_MAC_ADDR) {
-               err = igc_add_mac_filter(adapter, input->filter.src_addr,
-                                        input->action,
-                                        IGC_MAC_STATE_SRC_ADDR);
-               if (err)
-                       return err;
+       /* Both source and destination address filters only support the full
+        * mask.
+        */
+       if (is_broadcast_ether_addr(fsp->m_u.ether_spec.h_source)) {
+               rule->filter.match_flags |= IGC_FILTER_FLAG_SRC_MAC_ADDR;
+               ether_addr_copy(rule->filter.src_addr,
+                               fsp->h_u.ether_spec.h_source);
        }
 
-       if (input->filter.match_flags & IGC_FILTER_FLAG_VLAN_TCI) {
-               int prio = (ntohs(input->filter.vlan_tci) & VLAN_PRIO_MASK) >>
-                          VLAN_PRIO_SHIFT;
-               err = igc_add_vlan_prio_filter(adapter, prio, input->action);
-               if (err)
-                       return err;
+       if (is_broadcast_ether_addr(fsp->m_u.ether_spec.h_dest)) {
+               rule->filter.match_flags |= IGC_FILTER_FLAG_DST_MAC_ADDR;
+               ether_addr_copy(rule->filter.dst_addr,
+                               fsp->h_u.ether_spec.h_dest);
        }
-
-       return 0;
 }
 
-int igc_erase_filter(struct igc_adapter *adapter, struct igc_nfc_filter *input)
+/**
+ * igc_ethtool_check_nfc_rule() - Check if NFC rule is valid
+ * @adapter: Pointer to adapter
+ * @rule: Rule under evaluation
+ *
+ * Rules with both destination and source MAC addresses are considered invalid
+ * since the driver doesn't support them.
+ *
+ * Also, if there is already another rule with the same filter in a different
+ * location, @rule is considered invalid.
+ *
+ * Context: Expects adapter->nfc_rule_lock to be held by caller.
+ *
+ * Return: 0 in case of success, negative errno code otherwise.
+ */
+static int igc_ethtool_check_nfc_rule(struct igc_adapter *adapter,
+                                     struct igc_nfc_rule *rule)
 {
-       if (input->filter.match_flags & IGC_FILTER_FLAG_ETHER_TYPE) {
-               u16 etype = ntohs(input->filter.etype);
-
-               igc_del_etype_filter(adapter, etype);
-       }
+       struct net_device *dev = adapter->netdev;
+       u8 flags = rule->filter.match_flags;
+       struct igc_nfc_rule *tmp;
 
-       if (input->filter.match_flags & IGC_FILTER_FLAG_VLAN_TCI) {
-               int prio = (ntohs(input->filter.vlan_tci) & VLAN_PRIO_MASK) >>
-                          VLAN_PRIO_SHIFT;
-               igc_del_vlan_prio_filter(adapter, prio);
+       if (!flags) {
+               netdev_dbg(dev, "Rule with no match\n");
+               return -EINVAL;
        }
 
-       if (input->filter.match_flags & IGC_FILTER_FLAG_SRC_MAC_ADDR)
-               igc_del_mac_filter(adapter, input->filter.src_addr,
-                                  IGC_MAC_STATE_SRC_ADDR);
-
-       if (input->filter.match_flags & IGC_FILTER_FLAG_DST_MAC_ADDR)
-               igc_del_mac_filter(adapter, input->filter.dst_addr, 0);
-
-       return 0;
-}
-
-static int igc_update_ethtool_nfc_entry(struct igc_adapter *adapter,
-                                       struct igc_nfc_filter *input,
-                                       u16 sw_idx)
-{
-       struct igc_nfc_filter *rule, *parent;
-       int err = -EINVAL;
-
-       parent = NULL;
-       rule = NULL;
-
-       hlist_for_each_entry(rule, &adapter->nfc_filter_list, nfc_node) {
-               /* hash found, or no matching entry */
-               if (rule->sw_idx >= sw_idx)
-                       break;
-               parent = rule;
+       if (flags & IGC_FILTER_FLAG_DST_MAC_ADDR &&
+           flags & IGC_FILTER_FLAG_SRC_MAC_ADDR) {
+               netdev_dbg(dev, "Filters with both dst and src are not supported\n");
+               return -EOPNOTSUPP;
        }
 
-       /* if there is an old rule occupying our place remove it */
-       if (rule && rule->sw_idx == sw_idx) {
-               if (!input)
-                       err = igc_erase_filter(adapter, rule);
-
-               hlist_del(&rule->nfc_node);
-               kfree(rule);
-               adapter->nfc_filter_count--;
+       list_for_each_entry(tmp, &adapter->nfc_rule_list, list) {
+               if (!memcmp(&rule->filter, &tmp->filter,
+                           sizeof(rule->filter)) &&
+                   tmp->location != rule->location) {
+                       netdev_dbg(dev, "Rule already exists\n");
+                       return -EEXIST;
+               }
        }
 
-       /* If no input this was a delete, err should be 0 if a rule was
-        * successfully found and removed from the list else -EINVAL
-        */
-       if (!input)
-               return err;
-
-       /* initialize node */
-       INIT_HLIST_NODE(&input->nfc_node);
-
-       /* add filter to the list */
-       if (parent)
-               hlist_add_behind(&input->nfc_node, &parent->nfc_node);
-       else
-               hlist_add_head(&input->nfc_node, &adapter->nfc_filter_list);
-
-       /* update counts */
-       adapter->nfc_filter_count++;
-
        return 0;
 }
 
-static int igc_add_ethtool_nfc_entry(struct igc_adapter *adapter,
-                                    struct ethtool_rxnfc *cmd)
+static int igc_ethtool_add_nfc_rule(struct igc_adapter *adapter,
+                                   struct ethtool_rxnfc *cmd)
 {
        struct net_device *netdev = adapter->netdev;
        struct ethtool_rx_flow_spec *fsp =
                (struct ethtool_rx_flow_spec *)&cmd->fs;
-       struct igc_nfc_filter *input, *rule;
-       int err = 0;
+       struct igc_nfc_rule *rule, *old_rule;
+       int err;
 
-       if (!(netdev->hw_features & NETIF_F_NTUPLE))
+       if (!(netdev->hw_features & NETIF_F_NTUPLE)) {
+               netdev_dbg(netdev, "N-tuple filters disabled\n");
                return -EOPNOTSUPP;
+       }
 
-       /* Don't allow programming if the action is a queue greater than
-        * the number of online Rx queues.
-        */
-       if (fsp->ring_cookie == RX_CLS_FLOW_DISC ||
-           fsp->ring_cookie >= adapter->num_rx_queues) {
-               netdev_err(netdev,
-                          "ethtool -N: The specified action is invalid\n");
-               return -EINVAL;
+       if ((fsp->flow_type & ~FLOW_EXT) != ETHER_FLOW) {
+               netdev_dbg(netdev, "Only ethernet flow type is supported\n");
+               return -EOPNOTSUPP;
        }
 
-       /* Don't allow indexes to exist outside of available space */
-       if (fsp->location >= IGC_MAX_RXNFC_FILTERS) {
-               netdev_err(netdev, "Location out of range\n");
-               return -EINVAL;
+       if ((fsp->flow_type & FLOW_EXT) &&
+           fsp->m_ext.vlan_tci != htons(VLAN_PRIO_MASK)) {
+               netdev_dbg(netdev, "VLAN mask not supported\n");
+               return -EOPNOTSUPP;
        }
 
-       if ((fsp->flow_type & ~FLOW_EXT) != ETHER_FLOW)
+       if (fsp->ring_cookie >= adapter->num_rx_queues) {
+               netdev_dbg(netdev, "Invalid action\n");
                return -EINVAL;
-
-       input = kzalloc(sizeof(*input), GFP_KERNEL);
-       if (!input)
-               return -ENOMEM;
-
-       if (fsp->m_u.ether_spec.h_proto == ETHER_TYPE_FULL_MASK) {
-               input->filter.etype = fsp->h_u.ether_spec.h_proto;
-               input->filter.match_flags = IGC_FILTER_FLAG_ETHER_TYPE;
        }
 
-       /* Only support matching addresses by the full mask */
-       if (is_broadcast_ether_addr(fsp->m_u.ether_spec.h_source)) {
-               input->filter.match_flags |= IGC_FILTER_FLAG_SRC_MAC_ADDR;
-               ether_addr_copy(input->filter.src_addr,
-                               fsp->h_u.ether_spec.h_source);
+       if (fsp->location >= IGC_MAX_RXNFC_RULES) {
+               netdev_dbg(netdev, "Invalid location\n");
+               return -EINVAL;
        }
 
-       /* Only support matching addresses by the full mask */
-       if (is_broadcast_ether_addr(fsp->m_u.ether_spec.h_dest)) {
-               input->filter.match_flags |= IGC_FILTER_FLAG_DST_MAC_ADDR;
-               ether_addr_copy(input->filter.dst_addr,
-                               fsp->h_u.ether_spec.h_dest);
-       }
+       rule = kzalloc(sizeof(*rule), GFP_KERNEL);
+       if (!rule)
+               return -ENOMEM;
 
-       if ((fsp->flow_type & FLOW_EXT) && fsp->m_ext.vlan_tci) {
-               if (fsp->m_ext.vlan_tci != htons(VLAN_PRIO_MASK)) {
-                       netdev_dbg(netdev, "VLAN mask not supported\n");
-                       err = -EOPNOTSUPP;
-                       goto err_out;
-               }
-               input->filter.vlan_tci = fsp->h_ext.vlan_tci;
-               input->filter.match_flags |= IGC_FILTER_FLAG_VLAN_TCI;
-       }
+       igc_ethtool_init_nfc_rule(rule, fsp);
 
-       input->action = fsp->ring_cookie;
-       input->sw_idx = fsp->location;
+       mutex_lock(&adapter->nfc_rule_lock);
 
-       spin_lock(&adapter->nfc_lock);
+       err = igc_ethtool_check_nfc_rule(adapter, rule);
+       if (err)
+               goto err;
 
-       hlist_for_each_entry(rule, &adapter->nfc_filter_list, nfc_node) {
-               if (!memcmp(&input->filter, &rule->filter,
-                           sizeof(input->filter))) {
-                       err = -EEXIST;
-                       netdev_err(netdev,
-                                  "ethtool: this filter is already set\n");
-                       goto err_out_w_lock;
-               }
-       }
+       old_rule = igc_get_nfc_rule(adapter, fsp->location);
+       if (old_rule)
+               igc_del_nfc_rule(adapter, old_rule);
 
-       err = igc_add_filter(adapter, input);
+       err = igc_add_nfc_rule(adapter, rule);
        if (err)
-               goto err_out_w_lock;
-
-       igc_update_ethtool_nfc_entry(adapter, input, input->sw_idx);
+               goto err;
 
-       spin_unlock(&adapter->nfc_lock);
+       mutex_unlock(&adapter->nfc_rule_lock);
        return 0;
 
-err_out_w_lock:
-       spin_unlock(&adapter->nfc_lock);
-err_out:
-       kfree(input);
+err:
+       mutex_unlock(&adapter->nfc_rule_lock);
+       kfree(rule);
        return err;
 }
 
-static int igc_del_ethtool_nfc_entry(struct igc_adapter *adapter,
-                                    struct ethtool_rxnfc *cmd)
+static int igc_ethtool_del_nfc_rule(struct igc_adapter *adapter,
+                                   struct ethtool_rxnfc *cmd)
 {
        struct ethtool_rx_flow_spec *fsp =
                (struct ethtool_rx_flow_spec *)&cmd->fs;
-       int err;
+       struct igc_nfc_rule *rule;
 
-       spin_lock(&adapter->nfc_lock);
-       err = igc_update_ethtool_nfc_entry(adapter, NULL, fsp->location);
-       spin_unlock(&adapter->nfc_lock);
+       mutex_lock(&adapter->nfc_rule_lock);
 
-       return err;
+       rule = igc_get_nfc_rule(adapter, fsp->location);
+       if (!rule) {
+               mutex_unlock(&adapter->nfc_rule_lock);
+               return -EINVAL;
+       }
+
+       igc_del_nfc_rule(adapter, rule);
+
+       mutex_unlock(&adapter->nfc_rule_lock);
+       return 0;
 }
 
-static int igc_set_rxnfc(struct net_device *dev, struct ethtool_rxnfc *cmd)
+static int igc_ethtool_set_rxnfc(struct net_device *dev,
+                                struct ethtool_rxnfc *cmd)
 {
        struct igc_adapter *adapter = netdev_priv(dev);
-       int ret = -EOPNOTSUPP;
 
        switch (cmd->cmd) {
        case ETHTOOL_SRXFH:
-               ret = igc_set_rss_hash_opt(adapter, cmd);
-               break;
+               return igc_ethtool_set_rss_hash_opt(adapter, cmd);
        case ETHTOOL_SRXCLSRLINS:
-               ret = igc_add_ethtool_nfc_entry(adapter, cmd);
-               break;
+               return igc_ethtool_add_nfc_rule(adapter, cmd);
        case ETHTOOL_SRXCLSRLDEL:
-               ret = igc_del_ethtool_nfc_entry(adapter, cmd);
+               return igc_ethtool_del_nfc_rule(adapter, cmd);
        default:
-               break;
+               return -EOPNOTSUPP;
        }
-
-       return ret;
 }
 
 void igc_write_rss_indir_tbl(struct igc_adapter *adapter)
@@ -1456,13 +1386,13 @@ void igc_write_rss_indir_tbl(struct igc_adapter *adapter)
        }
 }
 
-static u32 igc_get_rxfh_indir_size(struct net_device *netdev)
+static u32 igc_ethtool_get_rxfh_indir_size(struct net_device *netdev)
 {
        return IGC_RETA_SIZE;
 }
 
-static int igc_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key,
-                       u8 *hfunc)
+static int igc_ethtool_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key,
+                               u8 *hfunc)
 {
        struct igc_adapter *adapter = netdev_priv(netdev);
        int i;
@@ -1477,8 +1407,8 @@ static int igc_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key,
        return 0;
 }
 
-static int igc_set_rxfh(struct net_device *netdev, const u32 *indir,
-                       const u8 *key, const u8 hfunc)
+static int igc_ethtool_set_rxfh(struct net_device *netdev, const u32 *indir,
+                               const u8 *key, const u8 hfunc)
 {
        struct igc_adapter *adapter = netdev_priv(netdev);
        u32 num_queues;
@@ -1506,18 +1436,13 @@ static int igc_set_rxfh(struct net_device *netdev, const u32 *indir,
        return 0;
 }
 
-static unsigned int igc_max_channels(struct igc_adapter *adapter)
-{
-       return igc_get_max_rss_queues(adapter);
-}
-
-static void igc_get_channels(struct net_device *netdev,
-                            struct ethtool_channels *ch)
+static void igc_ethtool_get_channels(struct net_device *netdev,
+                                    struct ethtool_channels *ch)
 {
        struct igc_adapter *adapter = netdev_priv(netdev);
 
        /* Report maximum channels */
-       ch->max_combined = igc_max_channels(adapter);
+       ch->max_combined = igc_get_max_rss_queues(adapter);
 
        /* Report info for other vector */
        if (adapter->flags & IGC_FLAG_HAS_MSIX) {
@@ -1528,8 +1453,8 @@ static void igc_get_channels(struct net_device *netdev,
        ch->combined_count = adapter->rss_queues;
 }
 
-static int igc_set_channels(struct net_device *netdev,
-                           struct ethtool_channels *ch)
+static int igc_ethtool_set_channels(struct net_device *netdev,
+                                   struct ethtool_channels *ch)
 {
        struct igc_adapter *adapter = netdev_priv(netdev);
        unsigned int count = ch->combined_count;
@@ -1544,7 +1469,7 @@ static int igc_set_channels(struct net_device *netdev,
                return -EINVAL;
 
        /* Verify the number of channels doesn't exceed hw limits */
-       max_combined = igc_max_channels(adapter);
+       max_combined = igc_get_max_rss_queues(adapter);
        if (count > max_combined)
                return -EINVAL;
 
@@ -1561,8 +1486,8 @@ static int igc_set_channels(struct net_device *netdev,
        return 0;
 }
 
-static int igc_get_ts_info(struct net_device *dev,
-                          struct ethtool_ts_info *info)
+static int igc_ethtool_get_ts_info(struct net_device *dev,
+                                  struct ethtool_ts_info *info)
 {
        struct igc_adapter *adapter = netdev_priv(dev);
 
@@ -1594,7 +1519,7 @@ static int igc_get_ts_info(struct net_device *dev,
        }
 }
 
-static u32 igc_get_priv_flags(struct net_device *netdev)
+static u32 igc_ethtool_get_priv_flags(struct net_device *netdev)
 {
        struct igc_adapter *adapter = netdev_priv(netdev);
        u32 priv_flags = 0;
@@ -1605,7 +1530,7 @@ static u32 igc_get_priv_flags(struct net_device *netdev)
        return priv_flags;
 }
 
-static int igc_set_priv_flags(struct net_device *netdev, u32 priv_flags)
+static int igc_ethtool_set_priv_flags(struct net_device *netdev, u32 priv_flags)
 {
        struct igc_adapter *adapter = netdev_priv(netdev);
        unsigned int flags = adapter->flags;
@@ -1640,8 +1565,8 @@ static void igc_ethtool_complete(struct net_device *netdev)
        pm_runtime_put(&adapter->pdev->dev);
 }
 
-static int igc_get_link_ksettings(struct net_device *netdev,
-                                 struct ethtool_link_ksettings *cmd)
+static int igc_ethtool_get_link_ksettings(struct net_device *netdev,
+                                         struct ethtool_link_ksettings *cmd)
 {
        struct igc_adapter *adapter = netdev_priv(netdev);
        struct igc_hw *hw = &adapter->hw;
@@ -1747,8 +1672,9 @@ static int igc_get_link_ksettings(struct net_device *netdev,
        return 0;
 }
 
-static int igc_set_link_ksettings(struct net_device *netdev,
-                                 const struct ethtool_link_ksettings *cmd)
+static int
+igc_ethtool_set_link_ksettings(struct net_device *netdev,
+                              const struct ethtool_link_ksettings *cmd)
 {
        struct igc_adapter *adapter = netdev_priv(netdev);
        struct net_device *dev = adapter->netdev;
@@ -1814,8 +1740,8 @@ static int igc_set_link_ksettings(struct net_device *netdev,
        return 0;
 }
 
-static void igc_diag_test(struct net_device *netdev,
-                         struct ethtool_test *eth_test, u64 *data)
+static void igc_ethtool_diag_test(struct net_device *netdev,
+                                 struct ethtool_test *eth_test, u64 *data)
 {
        struct igc_adapter *adapter = netdev_priv(netdev);
        bool if_running = netif_running(netdev);
@@ -1874,45 +1800,45 @@ static void igc_diag_test(struct net_device *netdev,
 
 static const struct ethtool_ops igc_ethtool_ops = {
        .supported_coalesce_params = ETHTOOL_COALESCE_USECS,
-       .get_drvinfo            = igc_get_drvinfo,
-       .get_regs_len           = igc_get_regs_len,
-       .get_regs               = igc_get_regs,
-       .get_wol                = igc_get_wol,
-       .set_wol                = igc_set_wol,
-       .get_msglevel           = igc_get_msglevel,
-       .set_msglevel           = igc_set_msglevel,
-       .nway_reset             = igc_nway_reset,
-       .get_link               = igc_get_link,
-       .get_eeprom_len         = igc_get_eeprom_len,
-       .get_eeprom             = igc_get_eeprom,
-       .set_eeprom             = igc_set_eeprom,
-       .get_ringparam          = igc_get_ringparam,
-       .set_ringparam          = igc_set_ringparam,
-       .get_pauseparam         = igc_get_pauseparam,
-       .set_pauseparam         = igc_set_pauseparam,
-       .get_strings            = igc_get_strings,
-       .get_sset_count         = igc_get_sset_count,
-       .get_ethtool_stats      = igc_get_ethtool_stats,
-       .get_coalesce           = igc_get_coalesce,
-       .set_coalesce           = igc_set_coalesce,
-       .get_rxnfc              = igc_get_rxnfc,
-       .set_rxnfc              = igc_set_rxnfc,
-       .get_rxfh_indir_size    = igc_get_rxfh_indir_size,
-       .get_rxfh               = igc_get_rxfh,
-       .set_rxfh               = igc_set_rxfh,
-       .get_ts_info            = igc_get_ts_info,
-       .get_channels           = igc_get_channels,
-       .set_channels           = igc_set_channels,
-       .get_priv_flags         = igc_get_priv_flags,
-       .set_priv_flags         = igc_set_priv_flags,
+       .get_drvinfo            = igc_ethtool_get_drvinfo,
+       .get_regs_len           = igc_ethtool_get_regs_len,
+       .get_regs               = igc_ethtool_get_regs,
+       .get_wol                = igc_ethtool_get_wol,
+       .set_wol                = igc_ethtool_set_wol,
+       .get_msglevel           = igc_ethtool_get_msglevel,
+       .set_msglevel           = igc_ethtool_set_msglevel,
+       .nway_reset             = igc_ethtool_nway_reset,
+       .get_link               = igc_ethtool_get_link,
+       .get_eeprom_len         = igc_ethtool_get_eeprom_len,
+       .get_eeprom             = igc_ethtool_get_eeprom,
+       .set_eeprom             = igc_ethtool_set_eeprom,
+       .get_ringparam          = igc_ethtool_get_ringparam,
+       .set_ringparam          = igc_ethtool_set_ringparam,
+       .get_pauseparam         = igc_ethtool_get_pauseparam,
+       .set_pauseparam         = igc_ethtool_set_pauseparam,
+       .get_strings            = igc_ethtool_get_strings,
+       .get_sset_count         = igc_ethtool_get_sset_count,
+       .get_ethtool_stats      = igc_ethtool_get_stats,
+       .get_coalesce           = igc_ethtool_get_coalesce,
+       .set_coalesce           = igc_ethtool_set_coalesce,
+       .get_rxnfc              = igc_ethtool_get_rxnfc,
+       .set_rxnfc              = igc_ethtool_set_rxnfc,
+       .get_rxfh_indir_size    = igc_ethtool_get_rxfh_indir_size,
+       .get_rxfh               = igc_ethtool_get_rxfh,
+       .set_rxfh               = igc_ethtool_set_rxfh,
+       .get_ts_info            = igc_ethtool_get_ts_info,
+       .get_channels           = igc_ethtool_get_channels,
+       .set_channels           = igc_ethtool_set_channels,
+       .get_priv_flags         = igc_ethtool_get_priv_flags,
+       .set_priv_flags         = igc_ethtool_set_priv_flags,
        .begin                  = igc_ethtool_begin,
        .complete               = igc_ethtool_complete,
-       .get_link_ksettings     = igc_get_link_ksettings,
-       .set_link_ksettings     = igc_set_link_ksettings,
-       .self_test              = igc_diag_test,
+       .get_link_ksettings     = igc_ethtool_get_link_ksettings,
+       .set_link_ksettings     = igc_ethtool_set_link_ksettings,
+       .self_test              = igc_ethtool_diag_test,
 };
 
-void igc_set_ethtool_ops(struct net_device *netdev)
+void igc_ethtool_set_ops(struct net_device *netdev)
 {
        netdev->ethtool_ops = &igc_ethtool_ops;
 }
index 12aa6b5fcb5d9543005b1560350956e7749c66eb..89445ab02a986faed5c1cd5d891bb7df2a4d48f3 100644 (file)
@@ -307,12 +307,8 @@ void igc_clear_hw_cntrs_base(struct igc_hw *hw)
        rd32(IGC_ICTXQMTC);
        rd32(IGC_ICRXDMTC);
 
-       rd32(IGC_CBTMPC);
-       rd32(IGC_HTDPMC);
-       rd32(IGC_CBRMPC);
        rd32(IGC_RPTHC);
        rd32(IGC_HGPTC);
-       rd32(IGC_HTCBDPC);
        rd32(IGC_HGORCL);
        rd32(IGC_HGORCH);
        rd32(IGC_HGOTCL);
index 0df5617eb9d0908d8aa996179e75485fc026918f..97d26991c87e92168eb1c71cd270bef8c470c0be 100644 (file)
@@ -766,12 +766,14 @@ static void igc_setup_tctl(struct igc_adapter *adapter)
  * igc_set_mac_filter_hw() - Set MAC address filter in hardware
  * @adapter: Pointer to adapter where the filter should be set
  * @index: Filter index
- * @addr: Destination MAC address
+ * @type: MAC address filter type (source or destination)
+ * @addr: MAC address
  * @queue: If non-negative, queue assignment feature is enabled and frames
  *         matching the filter are enqueued onto 'queue'. Otherwise, queue
  *         assignment is disabled.
  */
 static void igc_set_mac_filter_hw(struct igc_adapter *adapter, int index,
+                                 enum igc_mac_filter_type type,
                                  const u8 *addr, int queue)
 {
        struct net_device *dev = adapter->netdev;
@@ -784,6 +786,11 @@ static void igc_set_mac_filter_hw(struct igc_adapter *adapter, int index,
        ral = le32_to_cpup((__le32 *)(addr));
        rah = le16_to_cpup((__le16 *)(addr + 4));
 
+       if (type == IGC_MAC_FILTER_TYPE_SRC) {
+               rah &= ~IGC_RAH_ASEL_MASK;
+               rah |= IGC_RAH_ASEL_SRC_ADDR;
+       }
+
        if (queue >= 0) {
                rah &= ~IGC_RAH_QSEL_MASK;
                rah |= (queue << IGC_RAH_QSEL_SHIFT);
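
For reference, a hypothetical inverse helper (not in the driver) that recovers the stored address from a RAL/RAH pair written by igc_set_mac_filter_hw() above; it makes the byte layout explicit — addr[0..3] in RAL, addr[4..5] in the low 16 bits of RAH, the remaining RAH bits carrying the filter attributes:

	/* Hypothetical, for illustration only. */
	static void igc_example_rar_to_addr(u32 ral, u32 rah, u8 *addr)
	{
		put_unaligned_le32(ral, addr);			/* addr[0..3] */
		put_unaligned_le16(rah & IGC_RAH_RAH_MASK,	/* addr[4..5] */
				   addr + 4);
	}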
@@ -820,17 +827,12 @@ static void igc_clear_mac_filter_hw(struct igc_adapter *adapter, int index)
 /* Set default MAC address for the PF in the first RAR entry */
 static void igc_set_default_mac_filter(struct igc_adapter *adapter)
 {
-       struct igc_mac_addr *mac_table = &adapter->mac_table[0];
        struct net_device *dev = adapter->netdev;
        u8 *addr = adapter->hw.mac.addr;
 
        netdev_dbg(dev, "Set default MAC address filter: address %pM", addr);
 
-       ether_addr_copy(mac_table->addr, addr);
-       mac_table->state = IGC_MAC_STATE_DEFAULT | IGC_MAC_STATE_IN_USE;
-       mac_table->queue = -1;
-
-       igc_set_mac_filter_hw(adapter, 0, addr, mac_table->queue);
+       igc_set_mac_filter_hw(adapter, 0, IGC_MAC_FILTER_TYPE_DST, addr, -1);
 }
 
 /**
@@ -2172,34 +2174,26 @@ static bool igc_clean_tx_irq(struct igc_q_vector *q_vector, int napi_budget)
        return !!budget;
 }
 
-static void igc_nfc_filter_restore(struct igc_adapter *adapter)
-{
-       struct igc_nfc_filter *rule;
-
-       spin_lock(&adapter->nfc_lock);
-
-       hlist_for_each_entry(rule, &adapter->nfc_filter_list, nfc_node)
-               igc_add_filter(adapter, rule);
-
-       spin_unlock(&adapter->nfc_lock);
-}
-
-static int igc_find_mac_filter(struct igc_adapter *adapter, const u8 *addr,
-                              u8 flags)
+static int igc_find_mac_filter(struct igc_adapter *adapter,
+                              enum igc_mac_filter_type type, const u8 *addr)
 {
-       int max_entries = adapter->hw.mac.rar_entry_count;
-       struct igc_mac_addr *entry;
+       struct igc_hw *hw = &adapter->hw;
+       int max_entries = hw->mac.rar_entry_count;
+       u32 ral, rah;
        int i;
 
        for (i = 0; i < max_entries; i++) {
-               entry = &adapter->mac_table[i];
+               ral = rd32(IGC_RAL(i));
+               rah = rd32(IGC_RAH(i));
 
-               if (!(entry->state & IGC_MAC_STATE_IN_USE))
+               if (!(rah & IGC_RAH_AV))
                        continue;
-               if (!ether_addr_equal(addr, entry->addr))
+               if (!!(rah & IGC_RAH_ASEL_SRC_ADDR) != type)
                        continue;
-               if ((entry->state & IGC_MAC_STATE_SRC_ADDR) !=
-                   (flags & IGC_MAC_STATE_SRC_ADDR))
+               if ((rah & IGC_RAH_RAH_MASK) !=
+                   le16_to_cpup((__le16 *)(addr + 4)))
+                       continue;
+               if (ral != le32_to_cpup((__le32 *)(addr)))
                        continue;
 
                return i;
@@ -2210,14 +2204,15 @@ static int igc_find_mac_filter(struct igc_adapter *adapter, const u8 *addr,
 
 static int igc_get_avail_mac_filter_slot(struct igc_adapter *adapter)
 {
-       int max_entries = adapter->hw.mac.rar_entry_count;
-       struct igc_mac_addr *entry;
+       struct igc_hw *hw = &adapter->hw;
+       int max_entries = hw->mac.rar_entry_count;
+       u32 rah;
        int i;
 
        for (i = 0; i < max_entries; i++) {
-               entry = &adapter->mac_table[i];
+               rah = rd32(IGC_RAH(i));
 
-               if (!(entry->state & IGC_MAC_STATE_IN_USE))
+               if (!(rah & IGC_RAH_AV))
                        return i;
        }
 
@@ -2227,91 +2222,70 @@ static int igc_get_avail_mac_filter_slot(struct igc_adapter *adapter)
 /**
  * igc_add_mac_filter() - Add MAC address filter
  * @adapter: Pointer to adapter where the filter should be added
+ * @type: MAC address filter type (source or destination)
  * @addr: MAC address
  * @queue: If non-negative, queue assignment feature is enabled and frames
  *         matching the filter are enqueued onto 'queue'. Otherwise, queue
  *         assignment is disabled.
- * @flags: Set IGC_MAC_STATE_SRC_ADDR bit to indicate @address is a source
- *         address
  *
  * Return: 0 in case of success, negative errno code otherwise.
  */
-int igc_add_mac_filter(struct igc_adapter *adapter, const u8 *addr,
-                      const s8 queue, const u8 flags)
+static int igc_add_mac_filter(struct igc_adapter *adapter,
+                             enum igc_mac_filter_type type, const u8 *addr,
+                             int queue)
 {
        struct net_device *dev = adapter->netdev;
        int index;
 
-       if (!is_valid_ether_addr(addr))
-               return -EINVAL;
-       if (flags & IGC_MAC_STATE_SRC_ADDR)
-               return -ENOTSUPP;
-
-       index = igc_find_mac_filter(adapter, addr, flags);
+       index = igc_find_mac_filter(adapter, type, addr);
        if (index >= 0)
-               goto update_queue_assignment;
+               goto update_filter;
 
        index = igc_get_avail_mac_filter_slot(adapter);
        if (index < 0)
                return -ENOSPC;
 
-       netdev_dbg(dev, "Add MAC address filter: index %d address %pM queue %d",
-                  index, addr, queue);
+       netdev_dbg(dev, "Add MAC address filter: index %d type %s address %pM queue %d\n",
+                  index, type == IGC_MAC_FILTER_TYPE_DST ? "dst" : "src",
+                  addr, queue);
 
-       ether_addr_copy(adapter->mac_table[index].addr, addr);
-       adapter->mac_table[index].state |= IGC_MAC_STATE_IN_USE | flags;
-update_queue_assignment:
-       adapter->mac_table[index].queue = queue;
-
-       igc_set_mac_filter_hw(adapter, index, addr, queue);
+update_filter:
+       igc_set_mac_filter_hw(adapter, index, type, addr, queue);
        return 0;
 }
 
 /**
  * igc_del_mac_filter() - Delete MAC address filter
  * @adapter: Pointer to adapter where the filter should be deleted from
+ * @type: MAC address filter type (source or destination)
  * @addr: MAC address
- * @flags: Set IGC_MAC_STATE_SRC_ADDR bit to indicate @address is a source
- *         address
- *
- * Return: 0 in case of success, negative errno code otherwise.
  */
-int igc_del_mac_filter(struct igc_adapter *adapter, const u8 *addr,
-                      const u8 flags)
+static void igc_del_mac_filter(struct igc_adapter *adapter,
+                              enum igc_mac_filter_type type, const u8 *addr)
 {
        struct net_device *dev = adapter->netdev;
-       struct igc_mac_addr *entry;
        int index;
 
-       if (!is_valid_ether_addr(addr))
-               return -EINVAL;
-
-       index = igc_find_mac_filter(adapter, addr, flags);
+       index = igc_find_mac_filter(adapter, type, addr);
        if (index < 0)
-               return -ENOENT;
-
-       entry = &adapter->mac_table[index];
+               return;
 
-       if (entry->state & IGC_MAC_STATE_DEFAULT) {
+       if (index == 0) {
                /* If this is the default filter, we don't actually delete it.
                 * We just reset to its default value i.e. disable queue
                 * assignment.
                 */
                netdev_dbg(dev, "Disable default MAC filter queue assignment");
 
-               entry->queue = -1;
-               igc_set_mac_filter_hw(adapter, 0, addr, entry->queue);
+               igc_set_mac_filter_hw(adapter, 0, type, addr, -1);
        } else {
-               netdev_dbg(dev, "Delete MAC address filter: index %d address %pM",
-                          index, addr);
+               netdev_dbg(dev, "Delete MAC address filter: index %d type %s address %pM\n",
+                          index,
+                          type == IGC_MAC_FILTER_TYPE_DST ? "dst" : "src",
+                          addr);
 
-               entry->state = 0;
-               entry->queue = -1;
-               memset(entry->addr, 0, ETH_ALEN);
                igc_clear_mac_filter_hw(adapter, index);
        }
-
-       return 0;
 }
 
 /**
@@ -2322,7 +2296,8 @@ int igc_del_mac_filter(struct igc_adapter *adapter, const u8 *addr,
  *
  * Return: 0 in case of success, negative errno code otherwise.
  */
-int igc_add_vlan_prio_filter(struct igc_adapter *adapter, int prio, int queue)
+static int igc_add_vlan_prio_filter(struct igc_adapter *adapter, int prio,
+                                   int queue)
 {
        struct net_device *dev = adapter->netdev;
        struct igc_hw *hw = &adapter->hw;
@@ -2350,7 +2325,7 @@ int igc_add_vlan_prio_filter(struct igc_adapter *adapter, int prio, int queue)
  * @adapter: Pointer to adapter where the filter should be deleted from
  * @prio: VLAN priority value
  */
-void igc_del_vlan_prio_filter(struct igc_adapter *adapter, int prio)
+static void igc_del_vlan_prio_filter(struct igc_adapter *adapter, int prio)
 {
        struct igc_hw *hw = &adapter->hw;
        u32 vlanpqf;
@@ -2391,7 +2366,8 @@ static int igc_get_avail_etype_filter_slot(struct igc_adapter *adapter)
  *
  * Return: 0 in case of success, negative errno code otherwise.
  */
-int igc_add_etype_filter(struct igc_adapter *adapter, u16 etype, int queue)
+static int igc_add_etype_filter(struct igc_adapter *adapter, u16 etype,
+                               int queue)
 {
        struct igc_hw *hw = &adapter->hw;
        int index;
@@ -2440,37 +2416,194 @@ static int igc_find_etype_filter(struct igc_adapter *adapter, u16 etype)
  * igc_del_etype_filter() - Delete ethertype filter
  * @adapter: Pointer to adapter where the filter should be deleted from
  * @etype: Ethertype value
- *
- * Return: 0 in case of success, negative errno code otherwise.
  */
-int igc_del_etype_filter(struct igc_adapter *adapter, u16 etype)
+static void igc_del_etype_filter(struct igc_adapter *adapter, u16 etype)
 {
        struct igc_hw *hw = &adapter->hw;
        int index;
 
        index = igc_find_etype_filter(adapter, etype);
        if (index < 0)
-               return -ENOENT;
+               return;
 
        wr32(IGC_ETQF(index), 0);
 
        netdev_dbg(adapter->netdev, "Delete ethertype filter: etype %04x\n",
                   etype);
+}
+
+static int igc_enable_nfc_rule(struct igc_adapter *adapter,
+                              const struct igc_nfc_rule *rule)
+{
+       int err;
+
+       if (rule->filter.match_flags & IGC_FILTER_FLAG_ETHER_TYPE) {
+               err = igc_add_etype_filter(adapter, rule->filter.etype,
+                                          rule->action);
+               if (err)
+                       return err;
+       }
+
+       if (rule->filter.match_flags & IGC_FILTER_FLAG_SRC_MAC_ADDR) {
+               err = igc_add_mac_filter(adapter, IGC_MAC_FILTER_TYPE_SRC,
+                                        rule->filter.src_addr, rule->action);
+               if (err)
+                       return err;
+       }
+
+       if (rule->filter.match_flags & IGC_FILTER_FLAG_DST_MAC_ADDR) {
+               err = igc_add_mac_filter(adapter, IGC_MAC_FILTER_TYPE_DST,
+                                        rule->filter.dst_addr, rule->action);
+               if (err)
+                       return err;
+       }
+
+       if (rule->filter.match_flags & IGC_FILTER_FLAG_VLAN_TCI) {
+               int prio = (rule->filter.vlan_tci & VLAN_PRIO_MASK) >>
+                          VLAN_PRIO_SHIFT;
+
+               err = igc_add_vlan_prio_filter(adapter, prio, rule->action);
+               if (err)
+                       return err;
+       }
+
        return 0;
 }
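
As a usage illustration (assumed example, not taken from this patch), an ntuple rule steering PTP-over-Ethernet frames to RX queue 1 would carry exactly the fields igc_enable_nfc_rule() consumes above; allocation, validation and locking remain the caller's responsibility:

	struct igc_nfc_rule *rule;

	rule = kzalloc(sizeof(*rule), GFP_KERNEL);
	if (!rule)
		return -ENOMEM;

	rule->filter.match_flags = IGC_FILTER_FLAG_ETHER_TYPE;
	rule->filter.etype = ETH_P_1588;	/* 0x88f7 */
	rule->action = 1;			/* target RX queue */
	rule->location = 0;			/* slot, also the sort key */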
 
+static void igc_disable_nfc_rule(struct igc_adapter *adapter,
+                                const struct igc_nfc_rule *rule)
+{
+       if (rule->filter.match_flags & IGC_FILTER_FLAG_ETHER_TYPE)
+               igc_del_etype_filter(adapter, rule->filter.etype);
+
+       if (rule->filter.match_flags & IGC_FILTER_FLAG_VLAN_TCI) {
+               int prio = (rule->filter.vlan_tci & VLAN_PRIO_MASK) >>
+                          VLAN_PRIO_SHIFT;
+
+               igc_del_vlan_prio_filter(adapter, prio);
+       }
+
+       if (rule->filter.match_flags & IGC_FILTER_FLAG_SRC_MAC_ADDR)
+               igc_del_mac_filter(adapter, IGC_MAC_FILTER_TYPE_SRC,
+                                  rule->filter.src_addr);
+
+       if (rule->filter.match_flags & IGC_FILTER_FLAG_DST_MAC_ADDR)
+               igc_del_mac_filter(adapter, IGC_MAC_FILTER_TYPE_DST,
+                                  rule->filter.dst_addr);
+}
+
+/**
+ * igc_get_nfc_rule() - Get NFC rule
+ * @adapter: Pointer to adapter
+ * @location: Rule location
+ *
+ * Context: Expects adapter->nfc_rule_lock to be held by caller.
+ *
+ * Return: Pointer to NFC rule at @location. If not found, NULL.
+ */
+struct igc_nfc_rule *igc_get_nfc_rule(struct igc_adapter *adapter,
+                                     u32 location)
+{
+       struct igc_nfc_rule *rule;
+
+       list_for_each_entry(rule, &adapter->nfc_rule_list, list) {
+               if (rule->location == location)
+                       return rule;
+               if (rule->location > location)
+                       break;
+       }
+
+       return NULL;
+}
+
+/**
+ * igc_del_nfc_rule() - Delete NFC rule
+ * @adapter: Pointer to adapter
+ * @rule: Pointer to rule to be deleted
+ *
+ * Disable NFC rule in hardware and delete it from adapter.
+ *
+ * Context: Expects adapter->nfc_rule_lock to be held by caller.
+ */
+void igc_del_nfc_rule(struct igc_adapter *adapter, struct igc_nfc_rule *rule)
+{
+       igc_disable_nfc_rule(adapter, rule);
+
+       list_del(&rule->list);
+       adapter->nfc_rule_count--;
+
+       kfree(rule);
+}
+
+static void igc_flush_nfc_rules(struct igc_adapter *adapter)
+{
+       struct igc_nfc_rule *rule, *tmp;
+
+       mutex_lock(&adapter->nfc_rule_lock);
+
+       list_for_each_entry_safe(rule, tmp, &adapter->nfc_rule_list, list)
+               igc_del_nfc_rule(adapter, rule);
+
+       mutex_unlock(&adapter->nfc_rule_lock);
+}
+
+/**
+ * igc_add_nfc_rule() - Add NFC rule
+ * @adapter: Pointer to adapter
+ * @rule: Pointer to rule to be added
+ *
+ * Enable NFC rule in hardware and add it to adapter.
+ *
+ * Context: Expects adapter->nfc_rule_lock to be held by caller.
+ *
+ * Return: 0 on success, negative errno on failure.
+ */
+int igc_add_nfc_rule(struct igc_adapter *adapter, struct igc_nfc_rule *rule)
+{
+       struct igc_nfc_rule *pred, *cur;
+       int err;
+
+       err = igc_enable_nfc_rule(adapter, rule);
+       if (err)
+               return err;
+
+       pred = NULL;
+       list_for_each_entry(cur, &adapter->nfc_rule_list, list) {
+               if (cur->location >= rule->location)
+                       break;
+               pred = cur;
+       }
+
+       list_add(&rule->list, pred ? &pred->list : &adapter->nfc_rule_list);
+       adapter->nfc_rule_count++;
+       return 0;
+}
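
A minimal caller sketch, assuming the locking contract documented above (nfc_rule_lock held around lookup, delete and insert); illustrative only, mirroring what an ethtool set_rxnfc handler would be expected to do:

	struct igc_nfc_rule *old_rule;
	int err;

	mutex_lock(&adapter->nfc_rule_lock);

	/* Replace whatever currently occupies this location. */
	old_rule = igc_get_nfc_rule(adapter, rule->location);
	if (old_rule)
		igc_del_nfc_rule(adapter, old_rule);

	err = igc_add_nfc_rule(adapter, rule);

	mutex_unlock(&adapter->nfc_rule_lock);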
+
+static void igc_restore_nfc_rules(struct igc_adapter *adapter)
+{
+       struct igc_nfc_rule *rule;
+
+       mutex_lock(&adapter->nfc_rule_lock);
+
+       list_for_each_entry_reverse(rule, &adapter->nfc_rule_list, list)
+               igc_enable_nfc_rule(adapter, rule);
+
+       mutex_unlock(&adapter->nfc_rule_lock);
+}
+
 static int igc_uc_sync(struct net_device *netdev, const unsigned char *addr)
 {
        struct igc_adapter *adapter = netdev_priv(netdev);
 
-       return igc_add_mac_filter(adapter, addr, -1, 0);
+       return igc_add_mac_filter(adapter, IGC_MAC_FILTER_TYPE_DST, addr, -1);
 }
 
 static int igc_uc_unsync(struct net_device *netdev, const unsigned char *addr)
 {
        struct igc_adapter *adapter = netdev_priv(netdev);
 
-       return igc_del_mac_filter(adapter, addr, 0);
+       igc_del_mac_filter(adapter, IGC_MAC_FILTER_TYPE_DST, addr);
+       return 0;
 }
 
 /**
@@ -2541,7 +2674,7 @@ static void igc_configure(struct igc_adapter *adapter)
        igc_setup_rctl(adapter);
 
        igc_set_default_mac_filter(adapter);
-       igc_nfc_filter_restore(adapter);
+       igc_restore_nfc_rules(adapter);
 
        igc_configure_tx(adapter);
        igc_configure_rx(adapter);
@@ -2735,12 +2868,7 @@ void igc_set_flag_queue_pairs(struct igc_adapter *adapter,
 
 unsigned int igc_get_max_rss_queues(struct igc_adapter *adapter)
 {
-       unsigned int max_rss_queues;
-
-       /* Determine the maximum number of RSS queues supported. */
-       max_rss_queues = IGC_MAX_RX_QUEUES;
-
-       return max_rss_queues;
+       return IGC_MAX_RX_QUEUES;
 }
 
 static void igc_init_queue_configuration(struct igc_adapter *adapter)
@@ -3415,8 +3543,6 @@ static int igc_sw_init(struct igc_adapter *adapter)
        struct pci_dev *pdev = adapter->pdev;
        struct igc_hw *hw = &adapter->hw;
 
-       int size = sizeof(struct igc_mac_addr) * hw->mac.rar_entry_count;
-
        pci_read_config_word(pdev, PCI_COMMAND, &hw->bus.pci_cmd_word);
 
        /* set default ring sizes */
@@ -3435,15 +3561,14 @@ static int igc_sw_init(struct igc_adapter *adapter)
                                VLAN_HLEN;
        adapter->min_frame_size = ETH_ZLEN + ETH_FCS_LEN;
 
-       spin_lock_init(&adapter->nfc_lock);
+       mutex_init(&adapter->nfc_rule_lock);
+       INIT_LIST_HEAD(&adapter->nfc_rule_list);
+       adapter->nfc_rule_count = 0;
+
        spin_lock_init(&adapter->stats64_lock);
        /* Assume MSI-X interrupts, will be checked during IRQ allocation */
        adapter->flags |= IGC_FLAG_HAS_MSIX;
 
-       adapter->mac_table = kzalloc(size, GFP_ATOMIC);
-       if (!adapter->mac_table)
-               return -ENOMEM;
-
        igc_init_queue_configuration(adapter);
 
        /* This call may decrease the number of queues */
@@ -3666,18 +3791,6 @@ void igc_update_stats(struct igc_adapter *adapter)
        adapter->stats.mgpdc += rd32(IGC_MGTPDC);
 }
 
-static void igc_nfc_filter_exit(struct igc_adapter *adapter)
-{
-       struct igc_nfc_filter *rule;
-
-       spin_lock(&adapter->nfc_lock);
-
-       hlist_for_each_entry(rule, &adapter->nfc_filter_list, nfc_node)
-               igc_erase_filter(adapter, rule);
-
-       spin_unlock(&adapter->nfc_lock);
-}
-
 /**
  * igc_down - Close the interface
  * @adapter: board private structure
@@ -3696,8 +3809,6 @@ void igc_down(struct igc_adapter *adapter)
        wr32(IGC_RCTL, rctl & ~IGC_RCTL_EN);
        /* flush and sleep below */
 
-       igc_nfc_filter_exit(adapter);
-
        /* set trans_start so we don't get spurious watchdogs during reset */
        netif_trans_update(netdev);
 
@@ -3846,20 +3957,8 @@ static int igc_set_features(struct net_device *netdev,
        if (!(changed & (NETIF_F_RXALL | NETIF_F_NTUPLE)))
                return 0;
 
-       if (!(features & NETIF_F_NTUPLE)) {
-               struct hlist_node *node2;
-               struct igc_nfc_filter *rule;
-
-               spin_lock(&adapter->nfc_lock);
-               hlist_for_each_entry_safe(rule, node2,
-                                         &adapter->nfc_filter_list, nfc_node) {
-                       igc_erase_filter(adapter, rule);
-                       hlist_del(&rule->nfc_node);
-                       kfree(rule);
-               }
-               spin_unlock(&adapter->nfc_lock);
-               adapter->nfc_filter_count = 0;
-       }
+       if (!(features & NETIF_F_NTUPLE))
+               igc_flush_nfc_rules(adapter);
 
        netdev->features = features;
 
@@ -4947,7 +5046,7 @@ static int igc_probe(struct pci_dev *pdev,
        hw->hw_addr = adapter->io_addr;
 
        netdev->netdev_ops = &igc_netdev_ops;
-       igc_set_ethtool_ops(netdev);
+       igc_ethtool_set_ops(netdev);
        netdev->watchdog_timeo = 5 * HZ;
 
        netdev->mem_start = pci_resource_start(pdev, 0);
@@ -5126,6 +5225,8 @@ static void igc_remove(struct pci_dev *pdev)
 
        pm_runtime_get_noresume(&pdev->dev);
 
+       igc_flush_nfc_rules(adapter);
+
        igc_ptp_stop(adapter);
 
        set_bit(__IGC_DOWN, &adapter->state);
@@ -5146,7 +5247,6 @@ static void igc_remove(struct pci_dev *pdev)
        pci_iounmap(pdev, adapter->io_addr);
        pci_release_mem_regions(pdev);
 
-       kfree(adapter->mac_table);
        free_netdev(netdev);
 
        pci_disable_pcie_error_reporting(pdev);
index 61db951f0947b7f679ea2f00e113c62d0d28cc3a..7f999cfc9b3922937b8716d882e42535743f09fc 100644 (file)
 #define IGC_ICRXDMTC           0x04120  /* Rx Descriptor Min Threshold Count */
 #define IGC_ICRXOC             0x04124  /* Receiver Overrun Count */
 
-#define IGC_CBTMPC             0x0402C  /* Circuit Breaker TX Packet Count */
-#define IGC_HTDPMC             0x0403C  /* Host Transmit Discarded Packets */
-#define IGC_CBRMPC             0x040FC  /* Circuit Breaker RX Packet Count */
-#define IGC_RPTHC              0x04104  /* Rx Packets To Host */
-#define IGC_HGPTC              0x04118  /* Host Good Packets TX Count */
-#define IGC_HTCBDPC            0x04124  /* Host TX Circ.Breaker Drop Count */
-
 /* MSI-X Table Register Descriptions */
 #define IGC_PBACL              0x05B68  /* MSIx PBA Clear - R/W 1 to clear */
 
 #define IGC_MMDAC              13 /* MMD Access Control */
 #define IGC_MMDAAD             14 /* MMD Access Address/Data */
 
-/* Good transmitted packets counter registers */
-#define IGC_PQGPTC(_n)         (0x010014 + (0x100 * (_n)))
-
 /* Statistics Register Descriptions */
 #define IGC_CRCERRS    0x04000  /* CRC Error Count - R/clr */
 #define IGC_ALGNERRC   0x04004  /* Alignment Error Count - R/clr */
 #define IGC_HGOTCL     0x04130  /* Host Good Octets Transmit Count Low */
 #define IGC_HGOTCH     0x04134  /* Host Good Octets Transmit Count High */
 #define IGC_LENERRS    0x04138  /* Length Errors Count */
-#define IGC_HRMPC      0x0A018  /* Header Redirection Missed Packet Count */
 
 /* Time sync registers */
 #define IGC_TSICR      0x0B66C  /* Time Sync Interrupt Cause */
index 2833e4f041ce066608342074ebd9756da070551e..5ddfc83a1e46e82eb54ea5daef4b8acd15f36f83 100644 (file)
@@ -224,17 +224,17 @@ struct ixgbe_tx_buffer {
 };
 
 struct ixgbe_rx_buffer {
-       struct sk_buff *skb;
-       dma_addr_t dma;
        union {
                struct {
+                       struct sk_buff *skb;
+                       dma_addr_t dma;
                        struct page *page;
                        __u32 page_offset;
                        __u16 pagecnt_bias;
                };
                struct {
-                       void *addr;
-                       u64 handle;
+                       bool discard;
+                       struct xdp_buff *xdp;
                };
        };
 };
@@ -351,7 +351,6 @@ struct ixgbe_ring {
        };
        struct xdp_rxq_info xdp_rxq;
        struct xdp_umem *xsk_umem;
-       struct zero_copy_allocator zca; /* ZC allocator anchor */
        u16 ring_idx;           /* {rx,tx,xdp}_ring back reference idx */
        u16 rx_buf_len;
 } ____cacheline_internodealigned_in_smp;
index eab5934b04f55015f1acf8d4785e97a75027ff77..45fc7ce1a54344bff6b1a7a9a9ff7f2af094ab32 100644 (file)
@@ -35,7 +35,7 @@
 #include <net/tc_act/tc_mirred.h>
 #include <net/vxlan.h>
 #include <net/mpls.h>
-#include <net/xdp_sock.h>
+#include <net/xdp_sock_drv.h>
 #include <net/xfrm.h>
 
 #include "ixgbe.h"
@@ -3745,8 +3745,7 @@ static void ixgbe_configure_srrctl(struct ixgbe_adapter *adapter,
 
        /* configure the packet buffer length */
        if (rx_ring->xsk_umem) {
-               u32 xsk_buf_len = rx_ring->xsk_umem->chunk_size_nohr -
-                                 XDP_PACKET_HEADROOM;
+               u32 xsk_buf_len = xsk_umem_get_rx_frame_size(rx_ring->xsk_umem);
 
                /* If the MAC support setting RXDCTL.RLPML, the
                 * SRRCTL[n].BSIZEPKT is set to PAGE_SIZE and
@@ -4093,11 +4092,10 @@ void ixgbe_configure_rx_ring(struct ixgbe_adapter *adapter,
        xdp_rxq_info_unreg_mem_model(&ring->xdp_rxq);
        ring->xsk_umem = ixgbe_xsk_umem(adapter, ring);
        if (ring->xsk_umem) {
-               ring->zca.free = ixgbe_zca_free;
                WARN_ON(xdp_rxq_info_reg_mem_model(&ring->xdp_rxq,
-                                                  MEM_TYPE_ZERO_COPY,
-                                                  &ring->zca));
-
+                                                  MEM_TYPE_XSK_BUFF_POOL,
+                                                  NULL));
+               xsk_buff_set_rxq_info(ring->xsk_umem, &ring->xdp_rxq);
        } else {
                WARN_ON(xdp_rxq_info_reg_mem_model(&ring->xdp_rxq,
                                                   MEM_TYPE_PAGE_SHARED, NULL));
@@ -4153,8 +4151,7 @@ void ixgbe_configure_rx_ring(struct ixgbe_adapter *adapter,
        }
 
        if (ring->xsk_umem && hw->mac.type != ixgbe_mac_82599EB) {
-               u32 xsk_buf_len = ring->xsk_umem->chunk_size_nohr -
-                                 XDP_PACKET_HEADROOM;
+               u32 xsk_buf_len = xsk_umem_get_rx_frame_size(ring->xsk_umem);
 
                rxdctl &= ~(IXGBE_RXDCTL_RLPMLMASK |
                            IXGBE_RXDCTL_RLPML_EN);
index 6d01700b46bc3d8b42886f98931117456a65a783..7887ae4aaf4f5bfb6302f55e4cf62f7f9499efec 100644 (file)
@@ -35,7 +35,7 @@ int ixgbe_xsk_umem_setup(struct ixgbe_adapter *adapter, struct xdp_umem *umem,
 
 void ixgbe_zca_free(struct zero_copy_allocator *alloc, unsigned long handle);
 
-void ixgbe_alloc_rx_buffers_zc(struct ixgbe_ring *rx_ring, u16 cleaned_count);
+bool ixgbe_alloc_rx_buffers_zc(struct ixgbe_ring *rx_ring, u16 cleaned_count);
 int ixgbe_clean_rx_irq_zc(struct ixgbe_q_vector *q_vector,
                          struct ixgbe_ring *rx_ring,
                          const int budget);
index a656ee9a1faeab805f92c125743a5e505ff52405..86add9fbd36c46b7fb61363ee175fb9fbb9479f5 100644 (file)
@@ -2,7 +2,7 @@
 /* Copyright(c) 2018 Intel Corporation. */
 
 #include <linux/bpf_trace.h>
-#include <net/xdp_sock.h>
+#include <net/xdp_sock_drv.h>
 #include <net/xdp.h>
 
 #include "ixgbe.h"
@@ -20,54 +20,11 @@ struct xdp_umem *ixgbe_xsk_umem(struct ixgbe_adapter *adapter,
        return xdp_get_umem_from_qid(adapter->netdev, qid);
 }
 
-static int ixgbe_xsk_umem_dma_map(struct ixgbe_adapter *adapter,
-                                 struct xdp_umem *umem)
-{
-       struct device *dev = &adapter->pdev->dev;
-       unsigned int i, j;
-       dma_addr_t dma;
-
-       for (i = 0; i < umem->npgs; i++) {
-               dma = dma_map_page_attrs(dev, umem->pgs[i], 0, PAGE_SIZE,
-                                        DMA_BIDIRECTIONAL, IXGBE_RX_DMA_ATTR);
-               if (dma_mapping_error(dev, dma))
-                       goto out_unmap;
-
-               umem->pages[i].dma = dma;
-       }
-
-       return 0;
-
-out_unmap:
-       for (j = 0; j < i; j++) {
-               dma_unmap_page_attrs(dev, umem->pages[i].dma, PAGE_SIZE,
-                                    DMA_BIDIRECTIONAL, IXGBE_RX_DMA_ATTR);
-               umem->pages[i].dma = 0;
-       }
-
-       return -1;
-}
-
-static void ixgbe_xsk_umem_dma_unmap(struct ixgbe_adapter *adapter,
-                                    struct xdp_umem *umem)
-{
-       struct device *dev = &adapter->pdev->dev;
-       unsigned int i;
-
-       for (i = 0; i < umem->npgs; i++) {
-               dma_unmap_page_attrs(dev, umem->pages[i].dma, PAGE_SIZE,
-                                    DMA_BIDIRECTIONAL, IXGBE_RX_DMA_ATTR);
-
-               umem->pages[i].dma = 0;
-       }
-}
-
 static int ixgbe_xsk_umem_enable(struct ixgbe_adapter *adapter,
                                 struct xdp_umem *umem,
                                 u16 qid)
 {
        struct net_device *netdev = adapter->netdev;
-       struct xdp_umem_fq_reuse *reuseq;
        bool if_running;
        int err;
 
@@ -78,13 +35,7 @@ static int ixgbe_xsk_umem_enable(struct ixgbe_adapter *adapter,
            qid >= netdev->real_num_tx_queues)
                return -EINVAL;
 
-       reuseq = xsk_reuseq_prepare(adapter->rx_ring[0]->count);
-       if (!reuseq)
-               return -ENOMEM;
-
-       xsk_reuseq_free(xsk_reuseq_swap(umem, reuseq));
-
-       err = ixgbe_xsk_umem_dma_map(adapter, umem);
+       err = xsk_buff_dma_map(umem, &adapter->pdev->dev, IXGBE_RX_DMA_ATTR);
        if (err)
                return err;
 
@@ -124,7 +75,7 @@ static int ixgbe_xsk_umem_disable(struct ixgbe_adapter *adapter, u16 qid)
                ixgbe_txrx_ring_disable(adapter, qid);
 
        clear_bit(qid, adapter->af_xdp_zc_qps);
-       ixgbe_xsk_umem_dma_unmap(adapter, umem);
+       xsk_buff_dma_unmap(umem, IXGBE_RX_DMA_ATTR);
 
        if (if_running)
                ixgbe_txrx_ring_enable(adapter, qid);
@@ -143,19 +94,14 @@ static int ixgbe_run_xdp_zc(struct ixgbe_adapter *adapter,
                            struct ixgbe_ring *rx_ring,
                            struct xdp_buff *xdp)
 {
-       struct xdp_umem *umem = rx_ring->xsk_umem;
        int err, result = IXGBE_XDP_PASS;
        struct bpf_prog *xdp_prog;
        struct xdp_frame *xdpf;
-       u64 offset;
        u32 act;
 
        rcu_read_lock();
        xdp_prog = READ_ONCE(rx_ring->xdp_prog);
        act = bpf_prog_run_xdp(xdp_prog, xdp);
-       offset = xdp->data - xdp->data_hard_start;
-
-       xdp->handle = xsk_umem_adjust_offset(umem, xdp->handle, offset);
 
        switch (act) {
        case XDP_PASS:
@@ -186,140 +132,16 @@ static int ixgbe_run_xdp_zc(struct ixgbe_adapter *adapter,
        return result;
 }
 
-static struct
-ixgbe_rx_buffer *ixgbe_get_rx_buffer_zc(struct ixgbe_ring *rx_ring,
-                                       unsigned int size)
-{
-       struct ixgbe_rx_buffer *bi;
-
-       bi = &rx_ring->rx_buffer_info[rx_ring->next_to_clean];
-
-       /* we are reusing so sync this buffer for CPU use */
-       dma_sync_single_range_for_cpu(rx_ring->dev,
-                                     bi->dma, 0,
-                                     size,
-                                     DMA_BIDIRECTIONAL);
-
-       return bi;
-}
-
-static void ixgbe_reuse_rx_buffer_zc(struct ixgbe_ring *rx_ring,
-                                    struct ixgbe_rx_buffer *obi)
-{
-       u16 nta = rx_ring->next_to_alloc;
-       struct ixgbe_rx_buffer *nbi;
-
-       nbi = &rx_ring->rx_buffer_info[rx_ring->next_to_alloc];
-       /* update, and store next to alloc */
-       nta++;
-       rx_ring->next_to_alloc = (nta < rx_ring->count) ? nta : 0;
-
-       /* transfer page from old buffer to new buffer */
-       nbi->dma = obi->dma;
-       nbi->addr = obi->addr;
-       nbi->handle = obi->handle;
-
-       obi->addr = NULL;
-       obi->skb = NULL;
-}
-
-void ixgbe_zca_free(struct zero_copy_allocator *alloc, unsigned long handle)
-{
-       struct ixgbe_rx_buffer *bi;
-       struct ixgbe_ring *rx_ring;
-       u64 hr, mask;
-       u16 nta;
-
-       rx_ring = container_of(alloc, struct ixgbe_ring, zca);
-       hr = rx_ring->xsk_umem->headroom + XDP_PACKET_HEADROOM;
-       mask = rx_ring->xsk_umem->chunk_mask;
-
-       nta = rx_ring->next_to_alloc;
-       bi = rx_ring->rx_buffer_info;
-
-       nta++;
-       rx_ring->next_to_alloc = (nta < rx_ring->count) ? nta : 0;
-
-       handle &= mask;
-
-       bi->dma = xdp_umem_get_dma(rx_ring->xsk_umem, handle);
-       bi->dma += hr;
-
-       bi->addr = xdp_umem_get_data(rx_ring->xsk_umem, handle);
-       bi->addr += hr;
-
-       bi->handle = xsk_umem_adjust_offset(rx_ring->xsk_umem, (u64)handle,
-                                           rx_ring->xsk_umem->headroom);
-}
-
-static bool ixgbe_alloc_buffer_zc(struct ixgbe_ring *rx_ring,
-                                 struct ixgbe_rx_buffer *bi)
-{
-       struct xdp_umem *umem = rx_ring->xsk_umem;
-       void *addr = bi->addr;
-       u64 handle, hr;
-
-       if (addr)
-               return true;
-
-       if (!xsk_umem_peek_addr(umem, &handle)) {
-               rx_ring->rx_stats.alloc_rx_page_failed++;
-               return false;
-       }
-
-       hr = umem->headroom + XDP_PACKET_HEADROOM;
-
-       bi->dma = xdp_umem_get_dma(umem, handle);
-       bi->dma += hr;
-
-       bi->addr = xdp_umem_get_data(umem, handle);
-       bi->addr += hr;
-
-       bi->handle = xsk_umem_adjust_offset(umem, handle, umem->headroom);
-
-       xsk_umem_release_addr(umem);
-       return true;
-}
-
-static bool ixgbe_alloc_buffer_slow_zc(struct ixgbe_ring *rx_ring,
-                                      struct ixgbe_rx_buffer *bi)
-{
-       struct xdp_umem *umem = rx_ring->xsk_umem;
-       u64 handle, hr;
-
-       if (!xsk_umem_peek_addr_rq(umem, &handle)) {
-               rx_ring->rx_stats.alloc_rx_page_failed++;
-               return false;
-       }
-
-       handle &= rx_ring->xsk_umem->chunk_mask;
-
-       hr = umem->headroom + XDP_PACKET_HEADROOM;
-
-       bi->dma = xdp_umem_get_dma(umem, handle);
-       bi->dma += hr;
-
-       bi->addr = xdp_umem_get_data(umem, handle);
-       bi->addr += hr;
-
-       bi->handle = xsk_umem_adjust_offset(umem, handle, umem->headroom);
-
-       xsk_umem_release_addr_rq(umem);
-       return true;
-}
-
-static __always_inline bool
-__ixgbe_alloc_rx_buffers_zc(struct ixgbe_ring *rx_ring, u16 cleaned_count,
-                           bool alloc(struct ixgbe_ring *rx_ring,
-                                      struct ixgbe_rx_buffer *bi))
+bool ixgbe_alloc_rx_buffers_zc(struct ixgbe_ring *rx_ring, u16 count)
 {
        union ixgbe_adv_rx_desc *rx_desc;
        struct ixgbe_rx_buffer *bi;
        u16 i = rx_ring->next_to_use;
+       dma_addr_t dma;
        bool ok = true;
 
        /* nothing to do */
-       if (!cleaned_count)
+       if (!count)
                return true;
 
        rx_desc = IXGBE_RX_DESC(rx_ring, i);
@@ -327,21 +149,18 @@ __ixgbe_alloc_rx_buffers_zc(struct ixgbe_ring *rx_ring, u16 cleaned_count,
        i -= rx_ring->count;
 
        do {
-               if (!alloc(rx_ring, bi)) {
+               bi->xdp = xsk_buff_alloc(rx_ring->xsk_umem);
+               if (!bi->xdp) {
                        ok = false;
                        break;
                }
 
-               /* sync the buffer for use by the device */
-               dma_sync_single_range_for_device(rx_ring->dev, bi->dma,
-                                                bi->page_offset,
-                                                rx_ring->rx_buf_len,
-                                                DMA_BIDIRECTIONAL);
+               dma = xsk_buff_xdp_get_dma(bi->xdp);
 
                /* Refresh the desc even if buffer_addrs didn't change
                 * because each write-back erases this info.
                 */
-               rx_desc->read.pkt_addr = cpu_to_le64(bi->dma);
+               rx_desc->read.pkt_addr = cpu_to_le64(dma);
 
                rx_desc++;
                bi++;
@@ -355,17 +174,14 @@ __ixgbe_alloc_rx_buffers_zc(struct ixgbe_ring *rx_ring, u16 cleaned_count,
                /* clear the length for the next_to_use descriptor */
                rx_desc->wb.upper.length = 0;
 
-               cleaned_count--;
-       } while (cleaned_count);
+               count--;
+       } while (count);
 
        i += rx_ring->count;
 
        if (rx_ring->next_to_use != i) {
                rx_ring->next_to_use = i;
 
-               /* update next to alloc since we have filled the ring */
-               rx_ring->next_to_alloc = i;
-
                /* Force memory writes to complete before letting h/w
                 * know there are new descriptors to fetch.  (Only
                 * applicable for weak-ordered memory model archs,
@@ -378,40 +194,27 @@ __ixgbe_alloc_rx_buffers_zc(struct ixgbe_ring *rx_ring, u16 cleaned_count,
        return ok;
 }
 
-void ixgbe_alloc_rx_buffers_zc(struct ixgbe_ring *rx_ring, u16 count)
-{
-       __ixgbe_alloc_rx_buffers_zc(rx_ring, count,
-                                   ixgbe_alloc_buffer_slow_zc);
-}
-
-static bool ixgbe_alloc_rx_buffers_fast_zc(struct ixgbe_ring *rx_ring,
-                                          u16 count)
-{
-       return __ixgbe_alloc_rx_buffers_zc(rx_ring, count,
-                                          ixgbe_alloc_buffer_zc);
-}
-
 static struct sk_buff *ixgbe_construct_skb_zc(struct ixgbe_ring *rx_ring,
-                                             struct ixgbe_rx_buffer *bi,
-                                             struct xdp_buff *xdp)
+                                             struct ixgbe_rx_buffer *bi)
 {
-       unsigned int metasize = xdp->data - xdp->data_meta;
-       unsigned int datasize = xdp->data_end - xdp->data;
+       unsigned int metasize = bi->xdp->data - bi->xdp->data_meta;
+       unsigned int datasize = bi->xdp->data_end - bi->xdp->data;
        struct sk_buff *skb;
 
        /* allocate a skb to store the frags */
        skb = __napi_alloc_skb(&rx_ring->q_vector->napi,
-                              xdp->data_end - xdp->data_hard_start,
+                              bi->xdp->data_end - bi->xdp->data_hard_start,
                               GFP_ATOMIC | __GFP_NOWARN);
        if (unlikely(!skb))
                return NULL;
 
-       skb_reserve(skb, xdp->data - xdp->data_hard_start);
-       memcpy(__skb_put(skb, datasize), xdp->data, datasize);
+       skb_reserve(skb, bi->xdp->data - bi->xdp->data_hard_start);
+       memcpy(__skb_put(skb, datasize), bi->xdp->data, datasize);
        if (metasize)
                skb_metadata_set(skb, metasize);
 
-       ixgbe_reuse_rx_buffer_zc(rx_ring, bi);
+       xsk_buff_free(bi->xdp);
+       bi->xdp = NULL;
        return skb;
 }
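
Taken together, the conversion replaces the driver-private recycling scheme with the core buffer-pool helpers. A condensed, illustrative sketch of one zero-copy RX buffer's life cycle using only the calls this patch switches to (not a literal excerpt from the driver):

	static bool ixgbe_example_zc_rx_cycle(struct ixgbe_ring *rx_ring,
					      struct ixgbe_rx_buffer *bi,
					      union ixgbe_adv_rx_desc *rx_desc,
					      unsigned int size)
	{
		/* Refill: the core hands out a frame from the UMEM fill queue. */
		bi->xdp = xsk_buff_alloc(rx_ring->xsk_umem);
		if (!bi->xdp)
			return false;
		rx_desc->read.pkt_addr = cpu_to_le64(xsk_buff_xdp_get_dma(bi->xdp));

		/* Receive: trim to the written length, sync for CPU access. */
		bi->xdp->data_end = bi->xdp->data + size;
		xsk_buff_dma_sync_for_cpu(bi->xdp);

		/* Consume: XDP_TX/REDIRECT pass the buffer on; every other
		 * outcome returns it to the pool.
		 */
		xsk_buff_free(bi->xdp);
		bi->xdp = NULL;
		return true;
	}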
 
@@ -431,14 +234,9 @@ int ixgbe_clean_rx_irq_zc(struct ixgbe_q_vector *q_vector,
        unsigned int total_rx_bytes = 0, total_rx_packets = 0;
        struct ixgbe_adapter *adapter = q_vector->adapter;
        u16 cleaned_count = ixgbe_desc_unused(rx_ring);
-       struct xdp_umem *umem = rx_ring->xsk_umem;
        unsigned int xdp_res, xdp_xmit = 0;
        bool failure = false;
        struct sk_buff *skb;
-       struct xdp_buff xdp;
-
-       xdp.rxq = &rx_ring->xdp_rxq;
-       xdp.frame_sz = xsk_umem_xdp_frame_sz(umem);
 
        while (likely(total_rx_packets < budget)) {
                union ixgbe_adv_rx_desc *rx_desc;
@@ -448,8 +246,8 @@ int ixgbe_clean_rx_irq_zc(struct ixgbe_q_vector *q_vector,
                /* return some buffers to hardware, one at a time is too slow */
                if (cleaned_count >= IXGBE_RX_BUFFER_WRITE) {
                        failure = failure ||
-                                 !ixgbe_alloc_rx_buffers_fast_zc(rx_ring,
-                                                                cleaned_count);
+                                 !ixgbe_alloc_rx_buffers_zc(rx_ring,
+                                                            cleaned_count);
                        cleaned_count = 0;
                }
 
@@ -464,42 +262,40 @@ int ixgbe_clean_rx_irq_zc(struct ixgbe_q_vector *q_vector,
                 */
                dma_rmb();
 
-               bi = ixgbe_get_rx_buffer_zc(rx_ring, size);
+               bi = &rx_ring->rx_buffer_info[rx_ring->next_to_clean];
 
                if (unlikely(!ixgbe_test_staterr(rx_desc,
                                                 IXGBE_RXD_STAT_EOP))) {
                        struct ixgbe_rx_buffer *next_bi;
 
-                       ixgbe_reuse_rx_buffer_zc(rx_ring, bi);
+                       xsk_buff_free(bi->xdp);
+                       bi->xdp = NULL;
                        ixgbe_inc_ntc(rx_ring);
                        next_bi =
                               &rx_ring->rx_buffer_info[rx_ring->next_to_clean];
-                       next_bi->skb = ERR_PTR(-EINVAL);
+                       next_bi->discard = true;
                        continue;
                }
 
-               if (unlikely(bi->skb)) {
-                       ixgbe_reuse_rx_buffer_zc(rx_ring, bi);
+               if (unlikely(bi->discard)) {
+                       xsk_buff_free(bi->xdp);
+                       bi->xdp = NULL;
+                       bi->discard = false;
                        ixgbe_inc_ntc(rx_ring);
                        continue;
                }
 
-               xdp.data = bi->addr;
-               xdp.data_meta = xdp.data;
-               xdp.data_hard_start = xdp.data - XDP_PACKET_HEADROOM;
-               xdp.data_end = xdp.data + size;
-               xdp.handle = bi->handle;
-
-               xdp_res = ixgbe_run_xdp_zc(adapter, rx_ring, &xdp);
+               bi->xdp->data_end = bi->xdp->data + size;
+               xsk_buff_dma_sync_for_cpu(bi->xdp);
+               xdp_res = ixgbe_run_xdp_zc(adapter, rx_ring, bi->xdp);
 
                if (xdp_res) {
-                       if (xdp_res & (IXGBE_XDP_TX | IXGBE_XDP_REDIR)) {
+                       if (xdp_res & (IXGBE_XDP_TX | IXGBE_XDP_REDIR))
                                xdp_xmit |= xdp_res;
-                               bi->addr = NULL;
-                               bi->skb = NULL;
-                       } else {
-                               ixgbe_reuse_rx_buffer_zc(rx_ring, bi);
-                       }
+                       else
+                               xsk_buff_free(bi->xdp);
+
+                       bi->xdp = NULL;
                        total_rx_packets++;
                        total_rx_bytes += size;
 
@@ -509,7 +305,7 @@ int ixgbe_clean_rx_irq_zc(struct ixgbe_q_vector *q_vector,
                }
 
                /* XDP_PASS path */
-               skb = ixgbe_construct_skb_zc(rx_ring, bi, &xdp);
+               skb = ixgbe_construct_skb_zc(rx_ring, bi);
                if (!skb) {
                        rx_ring->rx_stats.alloc_rx_buff_failed++;
                        break;
@@ -561,17 +357,17 @@ int ixgbe_clean_rx_irq_zc(struct ixgbe_q_vector *q_vector,
 
 void ixgbe_xsk_clean_rx_ring(struct ixgbe_ring *rx_ring)
 {
-       u16 i = rx_ring->next_to_clean;
-       struct ixgbe_rx_buffer *bi = &rx_ring->rx_buffer_info[i];
+       struct ixgbe_rx_buffer *bi;
+       u16 i;
 
-       while (i != rx_ring->next_to_alloc) {
-               xsk_umem_fq_reuse(rx_ring->xsk_umem, bi->handle);
-               i++;
-               bi++;
-               if (i == rx_ring->count) {
-                       i = 0;
-                       bi = rx_ring->rx_buffer_info;
-               }
+       for (i = 0; i < rx_ring->count; i++) {
+               bi = &rx_ring->rx_buffer_info[i];
+
+               if (!bi->xdp)
+                       continue;
+
+               xsk_buff_free(bi->xdp);
+               bi->xdp = NULL;
        }
 }
 
@@ -594,10 +390,9 @@ static bool ixgbe_xmit_zc(struct ixgbe_ring *xdp_ring, unsigned int budget)
                if (!xsk_umem_consume_tx(xdp_ring->xsk_umem, &desc))
                        break;
 
-               dma = xdp_umem_get_dma(xdp_ring->xsk_umem, desc.addr);
-
-               dma_sync_single_for_device(xdp_ring->dev, dma, desc.len,
-                                          DMA_BIDIRECTIONAL);
+               dma = xsk_buff_raw_get_dma(xdp_ring->xsk_umem, desc.addr);
+               xsk_buff_raw_dma_sync_for_device(xdp_ring->xsk_umem, dma,
+                                                desc.len);
 
                tx_bi = &xdp_ring->tx_buffer_info[xdp_ring->next_to_use];
                tx_bi->bytecount = desc.len;
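
The transmit side follows the same pattern: descriptor DMA addresses now come from the pool and the device-direction sync is delegated to the core. A condensed illustration only, using the helpers shown above:

	struct xdp_desc desc;
	dma_addr_t dma;

	while (budget-- && xsk_umem_consume_tx(xdp_ring->xsk_umem, &desc)) {
		dma = xsk_buff_raw_get_dma(xdp_ring->xsk_umem, desc.addr);
		xsk_buff_raw_dma_sync_for_device(xdp_ring->xsk_umem, dma,
						 desc.len);
		/* ... fill the next TX descriptor with dma / desc.len ... */
	}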
index 41d2a0eac5faa2b1ed047a1ee5f2e52a1e46c9bf..37947949345c710abcc03a199fcfe86370c7aad4 100644 (file)
@@ -3566,10 +3566,6 @@ static void mvneta_start_dev(struct mvneta_port *pp)
                    MVNETA_CAUSE_LINK_CHANGE);
 
        phylink_start(pp->phylink);
-
-       /* We may have called phy_speed_down before */
-       phy_speed_up(pp->dev->phydev);
-
        netif_tx_start_all_queues(pp->dev);
 }
 
@@ -3577,9 +3573,6 @@ static void mvneta_stop_dev(struct mvneta_port *pp)
 {
        unsigned int cpu;
 
-       if (device_may_wakeup(&pp->dev->dev))
-               phy_speed_down(pp->dev->phydev, false);
-
        phylink_stop(pp->phylink);
 
        if (!pp->neta_armada3700) {
@@ -4052,10 +4045,6 @@ static int mvneta_mdio_probe(struct mvneta_port *pp)
        phylink_ethtool_get_wol(pp->phylink, &wol);
        device_set_wakeup_capable(&pp->dev->dev, !!wol.supported);
 
-       /* PHY WoL may be enabled but device wakeup disabled */
-       if (wol.supported)
-               device_set_wakeup_enable(&pp->dev->dev, !!wol.wolopts);
-
        return err;
 }
 
index 4968352ba188feb68cc9ac734dc280fdad1c1745..500c15e7ea4a982ac081c9d0d4c726944e93c1f3 100644 (file)
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0-only
 config NET_VENDOR_MEDIATEK
-       bool "MediaTek ethernet driver"
+       bool "MediaTek devices"
        depends on ARCH_MEDIATEK || SOC_MT7621 || SOC_MT7620
        ---help---
          If you have a Mediatek SoC with ethernet, say Y.
@@ -14,4 +14,11 @@ config NET_MEDIATEK_SOC
          This driver supports the gigabit ethernet MACs in the
          MediaTek SoC family.
 
+config NET_MEDIATEK_STAR_EMAC
+       tristate "MediaTek STAR Ethernet MAC support"
+       select PHYLIB
+       help
+         This driver supports the ethernet MAC IP first used on
+         MediaTek MT85** SoCs.
+
 endif #NET_VENDOR_MEDIATEK
index 2d8362f9341bc541f15133499ef8b9b0b4f71938..3a777b4a6cd337b73b9bea1f66f1b0f04c0e5bcf 100644 (file)
@@ -3,5 +3,6 @@
 # Makefile for the Mediatek SoCs built-in ethernet macs
 #
 
-obj-$(CONFIG_NET_MEDIATEK_SOC)                 += mtk_eth.o
+obj-$(CONFIG_NET_MEDIATEK_SOC) += mtk_eth.o
 mtk_eth-y := mtk_eth_soc.o mtk_sgmii.o mtk_eth_path.o
+obj-$(CONFIG_NET_MEDIATEK_STAR_EMAC) += mtk_star_emac.o
diff --git a/drivers/net/ethernet/mediatek/mtk_star_emac.c b/drivers/net/ethernet/mediatek/mtk_star_emac.c
new file mode 100644 (file)
index 0000000..789c77a
--- /dev/null
@@ -0,0 +1,1678 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2020 MediaTek Corporation
+ * Copyright (c) 2020 BayLibre SAS
+ *
+ * Author: Bartosz Golaszewski <bgolaszewski@baylibre.com>
+ */
+
+#include <linux/bits.h>
+#include <linux/clk.h>
+#include <linux/compiler.h>
+#include <linux/dma-mapping.h>
+#include <linux/etherdevice.h>
+#include <linux/kernel.h>
+#include <linux/mfd/syscon.h>
+#include <linux/mii.h>
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/of.h>
+#include <linux/of_mdio.h>
+#include <linux/of_net.h>
+#include <linux/platform_device.h>
+#include <linux/pm.h>
+#include <linux/regmap.h>
+#include <linux/skbuff.h>
+#include <linux/spinlock.h>
+#include <linux/workqueue.h>
+
+#define MTK_STAR_DRVNAME                       "mtk_star_emac"
+
+#define MTK_STAR_WAIT_TIMEOUT                  300
+#define MTK_STAR_MAX_FRAME_SIZE                        1514
+#define MTK_STAR_SKB_ALIGNMENT                 16
+#define MTK_STAR_NAPI_WEIGHT                   64
+#define MTK_STAR_HASHTABLE_MC_LIMIT            256
+#define MTK_STAR_HASHTABLE_SIZE_MAX            512
+
+/* Normally we'd use NET_IP_ALIGN but on arm64 its value is 0 and it doesn't
+ * work for this controller.
+ */
+#define MTK_STAR_IP_ALIGN                      2
+
+static const char *const mtk_star_clk_names[] = { "core", "reg", "trans" };
+#define MTK_STAR_NCLKS ARRAY_SIZE(mtk_star_clk_names)
+
+/* PHY Control Register 0 */
+#define MTK_STAR_REG_PHY_CTRL0                 0x0000
+#define MTK_STAR_BIT_PHY_CTRL0_WTCMD           BIT(13)
+#define MTK_STAR_BIT_PHY_CTRL0_RDCMD           BIT(14)
+#define MTK_STAR_BIT_PHY_CTRL0_RWOK            BIT(15)
+#define MTK_STAR_MSK_PHY_CTRL0_PREG            GENMASK(12, 8)
+#define MTK_STAR_OFF_PHY_CTRL0_PREG            8
+#define MTK_STAR_MSK_PHY_CTRL0_RWDATA          GENMASK(31, 16)
+#define MTK_STAR_OFF_PHY_CTRL0_RWDATA          16
+
+/* PHY Control Register 1 */
+#define MTK_STAR_REG_PHY_CTRL1                 0x0004
+#define MTK_STAR_BIT_PHY_CTRL1_LINK_ST         BIT(0)
+#define MTK_STAR_BIT_PHY_CTRL1_AN_EN           BIT(8)
+#define MTK_STAR_OFF_PHY_CTRL1_FORCE_SPD       9
+#define MTK_STAR_VAL_PHY_CTRL1_FORCE_SPD_10M   0x00
+#define MTK_STAR_VAL_PHY_CTRL1_FORCE_SPD_100M  0x01
+#define MTK_STAR_VAL_PHY_CTRL1_FORCE_SPD_1000M 0x02
+#define MTK_STAR_BIT_PHY_CTRL1_FORCE_DPX       BIT(11)
+#define MTK_STAR_BIT_PHY_CTRL1_FORCE_FC_RX     BIT(12)
+#define MTK_STAR_BIT_PHY_CTRL1_FORCE_FC_TX     BIT(13)
+
+/* MAC Configuration Register */
+#define MTK_STAR_REG_MAC_CFG                   0x0008
+#define MTK_STAR_OFF_MAC_CFG_IPG               10
+#define MTK_STAR_VAL_MAC_CFG_IPG_96BIT         GENMASK(4, 0)
+#define MTK_STAR_BIT_MAC_CFG_MAXLEN_1522       BIT(16)
+#define MTK_STAR_BIT_MAC_CFG_AUTO_PAD          BIT(19)
+#define MTK_STAR_BIT_MAC_CFG_CRC_STRIP         BIT(20)
+#define MTK_STAR_BIT_MAC_CFG_VLAN_STRIP                BIT(22)
+#define MTK_STAR_BIT_MAC_CFG_NIC_PD            BIT(31)
+
+/* Flow-Control Configuration Register */
+#define MTK_STAR_REG_FC_CFG                    0x000c
+#define MTK_STAR_BIT_FC_CFG_BP_EN              BIT(7)
+#define MTK_STAR_BIT_FC_CFG_UC_PAUSE_DIR       BIT(8)
+#define MTK_STAR_OFF_FC_CFG_SEND_PAUSE_TH      16
+#define MTK_STAR_MSK_FC_CFG_SEND_PAUSE_TH      GENMASK(27, 16)
+#define MTK_STAR_VAL_FC_CFG_SEND_PAUSE_TH_2K   0x800
+
+/* ARL Configuration Register */
+#define MTK_STAR_REG_ARL_CFG                   0x0010
+#define MTK_STAR_BIT_ARL_CFG_HASH_ALG          BIT(0)
+#define MTK_STAR_BIT_ARL_CFG_MISC_MODE         BIT(4)
+
+/* MAC High and Low Bytes Registers */
+#define MTK_STAR_REG_MY_MAC_H                  0x0014
+#define MTK_STAR_REG_MY_MAC_L                  0x0018
+
+/* Hash Table Control Register */
+#define MTK_STAR_REG_HASH_CTRL                 0x001c
+#define MTK_STAR_MSK_HASH_CTRL_HASH_BIT_ADDR   GENMASK(8, 0)
+#define MTK_STAR_BIT_HASH_CTRL_HASH_BIT_DATA   BIT(12)
+#define MTK_STAR_BIT_HASH_CTRL_ACC_CMD         BIT(13)
+#define MTK_STAR_BIT_HASH_CTRL_CMD_START       BIT(14)
+#define MTK_STAR_BIT_HASH_CTRL_BIST_OK         BIT(16)
+#define MTK_STAR_BIT_HASH_CTRL_BIST_DONE       BIT(17)
+#define MTK_STAR_BIT_HASH_CTRL_BIST_EN         BIT(31)
+
+/* TX DMA Control Register */
+#define MTK_STAR_REG_TX_DMA_CTRL               0x0034
+#define MTK_STAR_BIT_TX_DMA_CTRL_START         BIT(0)
+#define MTK_STAR_BIT_TX_DMA_CTRL_STOP          BIT(1)
+#define MTK_STAR_BIT_TX_DMA_CTRL_RESUME                BIT(2)
+
+/* RX DMA Control Register */
+#define MTK_STAR_REG_RX_DMA_CTRL               0x0038
+#define MTK_STAR_BIT_RX_DMA_CTRL_START         BIT(0)
+#define MTK_STAR_BIT_RX_DMA_CTRL_STOP          BIT(1)
+#define MTK_STAR_BIT_RX_DMA_CTRL_RESUME                BIT(2)
+
+/* DMA Address Registers */
+#define MTK_STAR_REG_TX_DPTR                   0x003c
+#define MTK_STAR_REG_RX_DPTR                   0x0040
+#define MTK_STAR_REG_TX_BASE_ADDR              0x0044
+#define MTK_STAR_REG_RX_BASE_ADDR              0x0048
+
+/* Interrupt Status Register */
+#define MTK_STAR_REG_INT_STS                   0x0050
+#define MTK_STAR_REG_INT_STS_PORT_STS_CHG      BIT(2)
+#define MTK_STAR_REG_INT_STS_MIB_CNT_TH                BIT(3)
+#define MTK_STAR_BIT_INT_STS_FNRC              BIT(6)
+#define MTK_STAR_BIT_INT_STS_TNTC              BIT(8)
+
+/* Interrupt Mask Register */
+#define MTK_STAR_REG_INT_MASK                  0x0054
+#define MTK_STAR_BIT_INT_MASK_FNRC             BIT(6)
+
+/* Misc. Config Register */
+#define MTK_STAR_REG_TEST1                     0x005c
+#define MTK_STAR_BIT_TEST1_RST_HASH_MBIST      BIT(31)
+
+/* Extended Configuration Register */
+#define MTK_STAR_REG_EXT_CFG                   0x0060
+#define MTK_STAR_OFF_EXT_CFG_SND_PAUSE_RLS     16
+#define MTK_STAR_MSK_EXT_CFG_SND_PAUSE_RLS     GENMASK(26, 16)
+#define MTK_STAR_VAL_EXT_CFG_SND_PAUSE_RLS_1K  0x400
+
+/* EthSys Configuration Register */
+#define MTK_STAR_REG_SYS_CONF                  0x0094
+#define MTK_STAR_BIT_MII_PAD_OUT_ENABLE                BIT(0)
+#define MTK_STAR_BIT_EXT_MDC_MODE              BIT(1)
+#define MTK_STAR_BIT_SWC_MII_MODE              BIT(2)
+
+/* MAC Clock Configuration Register */
+#define MTK_STAR_REG_MAC_CLK_CONF              0x00ac
+#define MTK_STAR_MSK_MAC_CLK_CONF              GENMASK(7, 0)
+#define MTK_STAR_BIT_CLK_DIV_10                        0x0a
+
+/* Counter registers. */
+#define MTK_STAR_REG_C_RXOKPKT                 0x0100
+#define MTK_STAR_REG_C_RXOKBYTE                        0x0104
+#define MTK_STAR_REG_C_RXRUNT                  0x0108
+#define MTK_STAR_REG_C_RXLONG                  0x010c
+#define MTK_STAR_REG_C_RXDROP                  0x0110
+#define MTK_STAR_REG_C_RXCRC                   0x0114
+#define MTK_STAR_REG_C_RXARLDROP               0x0118
+#define MTK_STAR_REG_C_RXVLANDROP              0x011c
+#define MTK_STAR_REG_C_RXCSERR                 0x0120
+#define MTK_STAR_REG_C_RXPAUSE                 0x0124
+#define MTK_STAR_REG_C_TXOKPKT                 0x0128
+#define MTK_STAR_REG_C_TXOKBYTE                        0x012c
+#define MTK_STAR_REG_C_TXPAUSECOL              0x0130
+#define MTK_STAR_REG_C_TXRTY                   0x0134
+#define MTK_STAR_REG_C_TXSKIP                  0x0138
+#define MTK_STAR_REG_C_TX_ARP                  0x013c
+#define MTK_STAR_REG_C_RX_RERR                 0x01d8
+#define MTK_STAR_REG_C_RX_UNI                  0x01dc
+#define MTK_STAR_REG_C_RX_MULTI                        0x01e0
+#define MTK_STAR_REG_C_RX_BROAD                        0x01e4
+#define MTK_STAR_REG_C_RX_ALIGNERR             0x01e8
+#define MTK_STAR_REG_C_TX_UNI                  0x01ec
+#define MTK_STAR_REG_C_TX_MULTI                        0x01f0
+#define MTK_STAR_REG_C_TX_BROAD                        0x01f4
+#define MTK_STAR_REG_C_TX_TIMEOUT              0x01f8
+#define MTK_STAR_REG_C_TX_LATECOL              0x01fc
+#define MTK_STAR_REG_C_RX_LENGTHERR            0x0214
+#define MTK_STAR_REG_C_RX_TWIST                        0x0218
+
+/* Ethernet CFG Control */
+#define MTK_PERICFG_REG_NIC_CFG_CON            0x03c4
+#define MTK_PERICFG_MSK_NIC_CFG_CON_CFG_MII    GENMASK(3, 0)
+#define MTK_PERICFG_BIT_NIC_CFG_CON_RMII       BIT(0)
+
+/* Represents the actual structure of descriptors used by the MAC. We can
+ * reuse the same structure for both TX and RX - the layout is the same, only
+ * the flags differ slightly.
+ */
+struct mtk_star_ring_desc {
+       /* Contains both the status flags as well as packet length. */
+       u32 status;
+       u32 data_ptr;
+       u32 vtag;
+       u32 reserved;
+};
+
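+/* In the status word, the low 16 bits hold the frame length, RX_CRCE and
+ * RX_OSIZE flag CRC and oversize errors, INT requests a completion
+ * interrupt, FS/LS mark the first/last segment of a frame, EOR marks the
+ * last descriptor of a ring and COWN means the descriptor is owned by the
+ * CPU.
+ */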
+#define MTK_STAR_DESC_MSK_LEN                  GENMASK(15, 0)
+#define MTK_STAR_DESC_BIT_RX_CRCE              BIT(24)
+#define MTK_STAR_DESC_BIT_RX_OSIZE             BIT(25)
+#define MTK_STAR_DESC_BIT_INT                  BIT(27)
+#define MTK_STAR_DESC_BIT_LS                   BIT(28)
+#define MTK_STAR_DESC_BIT_FS                   BIT(29)
+#define MTK_STAR_DESC_BIT_EOR                  BIT(30)
+#define MTK_STAR_DESC_BIT_COWN                 BIT(31)
+
+/* Helper structure for storing data read from/written to descriptors in order
+ * to limit reads from/writes to DMA memory.
+ */
+struct mtk_star_ring_desc_data {
+       unsigned int len;
+       unsigned int flags;
+       dma_addr_t dma_addr;
+       struct sk_buff *skb;
+};
+
+#define MTK_STAR_RING_NUM_DESCS                        128
+#define MTK_STAR_NUM_TX_DESCS                  MTK_STAR_RING_NUM_DESCS
+#define MTK_STAR_NUM_RX_DESCS                  MTK_STAR_RING_NUM_DESCS
+#define MTK_STAR_NUM_DESCS_TOTAL               (MTK_STAR_RING_NUM_DESCS * 2)
+#define MTK_STAR_DMA_SIZE \
+               (MTK_STAR_NUM_DESCS_TOTAL * sizeof(struct mtk_star_ring_desc))
+
+struct mtk_star_ring {
+       struct mtk_star_ring_desc *descs;
+       struct sk_buff *skbs[MTK_STAR_RING_NUM_DESCS];
+       dma_addr_t dma_addrs[MTK_STAR_RING_NUM_DESCS];
+       unsigned int head;
+       unsigned int tail;
+};
+
+struct mtk_star_priv {
+       struct net_device *ndev;
+
+       struct regmap *regs;
+       struct regmap *pericfg;
+
+       struct clk_bulk_data clks[MTK_STAR_NCLKS];
+
+       void *ring_base;
+       struct mtk_star_ring_desc *descs_base;
+       dma_addr_t dma_addr;
+       struct mtk_star_ring tx_ring;
+       struct mtk_star_ring rx_ring;
+
+       struct mii_bus *mii;
+       struct napi_struct napi;
+
+       struct device_node *phy_node;
+       phy_interface_t phy_intf;
+       struct phy_device *phydev;
+       unsigned int link;
+       int speed;
+       int duplex;
+       int pause;
+
+       /* Protects against concurrent descriptor access. */
+       spinlock_t lock;
+
+       struct rtnl_link_stats64 stats;
+       struct work_struct stats_work;
+};
+
+static struct device *mtk_star_get_dev(struct mtk_star_priv *priv)
+{
+       return priv->ndev->dev.parent;
+}
+
+static const struct regmap_config mtk_star_regmap_config = {
+       .reg_bits               = 32,
+       .val_bits               = 32,
+       .reg_stride             = 4,
+       .disable_locking        = true,
+};
+
+static void mtk_star_ring_init(struct mtk_star_ring *ring,
+                              struct mtk_star_ring_desc *descs)
+{
+       memset(ring, 0, sizeof(*ring));
+       ring->descs = descs;
+       ring->head = 0;
+       ring->tail = 0;
+}
+
+static int mtk_star_ring_pop_tail(struct mtk_star_ring *ring,
+                                 struct mtk_star_ring_desc_data *desc_data)
+{
+       struct mtk_star_ring_desc *desc = &ring->descs[ring->tail];
+       unsigned int status;
+
+       status = READ_ONCE(desc->status);
+       dma_rmb(); /* Make sure we read the status bits before checking them. */
+
+       if (!(status & MTK_STAR_DESC_BIT_COWN))
+               return -1;
+
+       desc_data->len = status & MTK_STAR_DESC_MSK_LEN;
+       desc_data->flags = status & ~MTK_STAR_DESC_MSK_LEN;
+       desc_data->dma_addr = ring->dma_addrs[ring->tail];
+       desc_data->skb = ring->skbs[ring->tail];
+
+       ring->dma_addrs[ring->tail] = 0;
+       ring->skbs[ring->tail] = NULL;
+
+       status &= MTK_STAR_DESC_BIT_COWN | MTK_STAR_DESC_BIT_EOR;
+
+       WRITE_ONCE(desc->data_ptr, 0);
+       WRITE_ONCE(desc->status, status);
+
+       ring->tail = (ring->tail + 1) % MTK_STAR_RING_NUM_DESCS;
+
+       return 0;
+}
+
+static void mtk_star_ring_push_head(struct mtk_star_ring *ring,
+                                   struct mtk_star_ring_desc_data *desc_data,
+                                   unsigned int flags)
+{
+       struct mtk_star_ring_desc *desc = &ring->descs[ring->head];
+       unsigned int status;
+
+       status = READ_ONCE(desc->status);
+
+       ring->skbs[ring->head] = desc_data->skb;
+       ring->dma_addrs[ring->head] = desc_data->dma_addr;
+
+       status |= desc_data->len;
+       if (flags)
+               status |= flags;
+
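+       /* Fill in the descriptor while COWN is still set, then clear COWN in
+        * a second write so the DMA engine only ever sees a complete entry.
+        */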
+       WRITE_ONCE(desc->data_ptr, desc_data->dma_addr);
+       WRITE_ONCE(desc->status, status);
+       status &= ~MTK_STAR_DESC_BIT_COWN;
+       /* Flush previous modifications before ownership change. */
+       dma_wmb();
+       WRITE_ONCE(desc->status, status);
+
+       ring->head = (ring->head + 1) % MTK_STAR_RING_NUM_DESCS;
+}
+
+static void
+mtk_star_ring_push_head_rx(struct mtk_star_ring *ring,
+                          struct mtk_star_ring_desc_data *desc_data)
+{
+       mtk_star_ring_push_head(ring, desc_data, 0);
+}
+
+static void
+mtk_star_ring_push_head_tx(struct mtk_star_ring *ring,
+                          struct mtk_star_ring_desc_data *desc_data)
+{
+       static const unsigned int flags = MTK_STAR_DESC_BIT_FS |
+                                         MTK_STAR_DESC_BIT_LS |
+                                         MTK_STAR_DESC_BIT_INT;
+
+       mtk_star_ring_push_head(ring, desc_data, flags);
+}
+
+static unsigned int mtk_star_ring_num_used_descs(struct mtk_star_ring *ring)
+{
+       return abs(ring->head - ring->tail);
+}
+
+static bool mtk_star_ring_full(struct mtk_star_ring *ring)
+{
+       return mtk_star_ring_num_used_descs(ring) == MTK_STAR_RING_NUM_DESCS;
+}
+
+static bool mtk_star_ring_descs_available(struct mtk_star_ring *ring)
+{
+       return mtk_star_ring_num_used_descs(ring) > 0;
+}
+
+static dma_addr_t mtk_star_dma_map_rx(struct mtk_star_priv *priv,
+                                     struct sk_buff *skb)
+{
+       struct device *dev = mtk_star_get_dev(priv);
+
+       /* Data pointer for the RX DMA descriptor must be aligned to 4N + 2. */
+       return dma_map_single(dev, skb_tail_pointer(skb) - 2,
+                             skb_tailroom(skb), DMA_FROM_DEVICE);
+}
+
+static void mtk_star_dma_unmap_rx(struct mtk_star_priv *priv,
+                                 struct mtk_star_ring_desc_data *desc_data)
+{
+       struct device *dev = mtk_star_get_dev(priv);
+
+       dma_unmap_single(dev, desc_data->dma_addr,
+                        skb_tailroom(desc_data->skb), DMA_FROM_DEVICE);
+}
+
+static dma_addr_t mtk_star_dma_map_tx(struct mtk_star_priv *priv,
+                                     struct sk_buff *skb)
+{
+       struct device *dev = mtk_star_get_dev(priv);
+
+       return dma_map_single(dev, skb->data, skb_headlen(skb), DMA_TO_DEVICE);
+}
+
+static void mtk_star_dma_unmap_tx(struct mtk_star_priv *priv,
+                                 struct mtk_star_ring_desc_data *desc_data)
+{
+       struct device *dev = mtk_star_get_dev(priv);
+
+       return dma_unmap_single(dev, desc_data->dma_addr,
+                               skb_headlen(desc_data->skb), DMA_TO_DEVICE);
+}
+
+static void mtk_star_nic_disable_pd(struct mtk_star_priv *priv)
+{
+       regmap_update_bits(priv->regs, MTK_STAR_REG_MAC_CFG,
+                          MTK_STAR_BIT_MAC_CFG_NIC_PD, 0);
+}
+
+/* Unmask the three interrupts we care about, mask all others. */
+static void mtk_star_intr_enable(struct mtk_star_priv *priv)
+{
+       unsigned int val = MTK_STAR_BIT_INT_STS_TNTC |
+                          MTK_STAR_BIT_INT_STS_FNRC |
+                          MTK_STAR_REG_INT_STS_MIB_CNT_TH;
+
+       regmap_write(priv->regs, MTK_STAR_REG_INT_MASK, ~val);
+}
+
+static void mtk_star_intr_disable(struct mtk_star_priv *priv)
+{
+       regmap_write(priv->regs, MTK_STAR_REG_INT_MASK, ~0);
+}
+
+static void mtk_star_intr_enable_tx(struct mtk_star_priv *priv)
+{
+       regmap_update_bits(priv->regs, MTK_STAR_REG_INT_MASK,
+                          MTK_STAR_BIT_INT_STS_TNTC, 0);
+}
+
+static void mtk_star_intr_enable_rx(struct mtk_star_priv *priv)
+{
+       regmap_update_bits(priv->regs, MTK_STAR_REG_INT_MASK,
+                          MTK_STAR_BIT_INT_STS_FNRC, 0);
+}
+
+static void mtk_star_intr_enable_stats(struct mtk_star_priv *priv)
+{
+       regmap_update_bits(priv->regs, MTK_STAR_REG_INT_MASK,
+                          MTK_STAR_REG_INT_STS_MIB_CNT_TH, 0);
+}
+
+static void mtk_star_intr_disable_tx(struct mtk_star_priv *priv)
+{
+       regmap_update_bits(priv->regs, MTK_STAR_REG_INT_MASK,
+                          MTK_STAR_BIT_INT_STS_TNTC,
+                          MTK_STAR_BIT_INT_STS_TNTC);
+}
+
+static void mtk_star_intr_disable_rx(struct mtk_star_priv *priv)
+{
+       regmap_update_bits(priv->regs, MTK_STAR_REG_INT_MASK,
+                          MTK_STAR_BIT_INT_STS_FNRC,
+                          MTK_STAR_BIT_INT_STS_FNRC);
+}
+
+static void mtk_star_intr_disable_stats(struct mtk_star_priv *priv)
+{
+       regmap_update_bits(priv->regs, MTK_STAR_REG_INT_MASK,
+                          MTK_STAR_REG_INT_STS_MIB_CNT_TH,
+                          MTK_STAR_REG_INT_STS_MIB_CNT_TH);
+}
+
+static unsigned int mtk_star_intr_read(struct mtk_star_priv *priv)
+{
+       unsigned int val;
+
+       regmap_read(priv->regs, MTK_STAR_REG_INT_STS, &val);
+
+       return val;
+}
+
+static unsigned int mtk_star_intr_ack_all(struct mtk_star_priv *priv)
+{
+       unsigned int val;
+
+       val = mtk_star_intr_read(priv);
+       regmap_write(priv->regs, MTK_STAR_REG_INT_STS, val);
+
+       return val;
+}
+
+static void mtk_star_dma_init(struct mtk_star_priv *priv)
+{
+       struct mtk_star_ring_desc *desc;
+       unsigned int val;
+       int i;
+
+       priv->descs_base = (struct mtk_star_ring_desc *)priv->ring_base;
+
+       for (i = 0; i < MTK_STAR_NUM_DESCS_TOTAL; i++) {
+               desc = &priv->descs_base[i];
+
+               memset(desc, 0, sizeof(*desc));
+               desc->status = MTK_STAR_DESC_BIT_COWN;
+               if ((i == MTK_STAR_NUM_TX_DESCS - 1) ||
+                   (i == MTK_STAR_NUM_DESCS_TOTAL - 1))
+                       desc->status |= MTK_STAR_DESC_BIT_EOR;
+       }
+
+       mtk_star_ring_init(&priv->tx_ring, priv->descs_base);
+       mtk_star_ring_init(&priv->rx_ring,
+                          priv->descs_base + MTK_STAR_NUM_TX_DESCS);
+
+       /* Set DMA pointers. */
+       val = (unsigned int)priv->dma_addr;
+       regmap_write(priv->regs, MTK_STAR_REG_TX_BASE_ADDR, val);
+       regmap_write(priv->regs, MTK_STAR_REG_TX_DPTR, val);
+
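+       /* The RX ring is laid out directly after the TX ring within the same
+        * coherent DMA buffer.
+        */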
+       val += sizeof(struct mtk_star_ring_desc) * MTK_STAR_NUM_TX_DESCS;
+       regmap_write(priv->regs, MTK_STAR_REG_RX_BASE_ADDR, val);
+       regmap_write(priv->regs, MTK_STAR_REG_RX_DPTR, val);
+}
+
+static void mtk_star_dma_start(struct mtk_star_priv *priv)
+{
+       regmap_update_bits(priv->regs, MTK_STAR_REG_TX_DMA_CTRL,
+                          MTK_STAR_BIT_TX_DMA_CTRL_START,
+                          MTK_STAR_BIT_TX_DMA_CTRL_START);
+       regmap_update_bits(priv->regs, MTK_STAR_REG_RX_DMA_CTRL,
+                          MTK_STAR_BIT_RX_DMA_CTRL_START,
+                          MTK_STAR_BIT_RX_DMA_CTRL_START);
+}
+
+static void mtk_star_dma_stop(struct mtk_star_priv *priv)
+{
+       regmap_write(priv->regs, MTK_STAR_REG_TX_DMA_CTRL,
+                    MTK_STAR_BIT_TX_DMA_CTRL_STOP);
+       regmap_write(priv->regs, MTK_STAR_REG_RX_DMA_CTRL,
+                    MTK_STAR_BIT_RX_DMA_CTRL_STOP);
+}
+
+static void mtk_star_dma_disable(struct mtk_star_priv *priv)
+{
+       int i;
+
+       mtk_star_dma_stop(priv);
+
+       /* Take back all descriptors. */
+       for (i = 0; i < MTK_STAR_NUM_DESCS_TOTAL; i++)
+               priv->descs_base[i].status |= MTK_STAR_DESC_BIT_COWN;
+}
+
+static void mtk_star_dma_resume_rx(struct mtk_star_priv *priv)
+{
+       regmap_update_bits(priv->regs, MTK_STAR_REG_RX_DMA_CTRL,
+                          MTK_STAR_BIT_RX_DMA_CTRL_RESUME,
+                          MTK_STAR_BIT_RX_DMA_CTRL_RESUME);
+}
+
+static void mtk_star_dma_resume_tx(struct mtk_star_priv *priv)
+{
+       regmap_update_bits(priv->regs, MTK_STAR_REG_TX_DMA_CTRL,
+                          MTK_STAR_BIT_TX_DMA_CTRL_RESUME,
+                          MTK_STAR_BIT_TX_DMA_CTRL_RESUME);
+}
+
+static void mtk_star_set_mac_addr(struct net_device *ndev)
+{
+       struct mtk_star_priv *priv = netdev_priv(ndev);
+       u8 *mac_addr = ndev->dev_addr;
+       unsigned int high, low;
+
+       high = mac_addr[0] << 8 | mac_addr[1] << 0;
+       low = mac_addr[2] << 24 | mac_addr[3] << 16 |
+             mac_addr[4] << 8 | mac_addr[5];
+
+       regmap_write(priv->regs, MTK_STAR_REG_MY_MAC_H, high);
+       regmap_write(priv->regs, MTK_STAR_REG_MY_MAC_L, low);
+}
+
+static void mtk_star_reset_counters(struct mtk_star_priv *priv)
+{
+       static const unsigned int counter_regs[] = {
+               MTK_STAR_REG_C_RXOKPKT,
+               MTK_STAR_REG_C_RXOKBYTE,
+               MTK_STAR_REG_C_RXRUNT,
+               MTK_STAR_REG_C_RXLONG,
+               MTK_STAR_REG_C_RXDROP,
+               MTK_STAR_REG_C_RXCRC,
+               MTK_STAR_REG_C_RXARLDROP,
+               MTK_STAR_REG_C_RXVLANDROP,
+               MTK_STAR_REG_C_RXCSERR,
+               MTK_STAR_REG_C_RXPAUSE,
+               MTK_STAR_REG_C_TXOKPKT,
+               MTK_STAR_REG_C_TXOKBYTE,
+               MTK_STAR_REG_C_TXPAUSECOL,
+               MTK_STAR_REG_C_TXRTY,
+               MTK_STAR_REG_C_TXSKIP,
+               MTK_STAR_REG_C_TX_ARP,
+               MTK_STAR_REG_C_RX_RERR,
+               MTK_STAR_REG_C_RX_UNI,
+               MTK_STAR_REG_C_RX_MULTI,
+               MTK_STAR_REG_C_RX_BROAD,
+               MTK_STAR_REG_C_RX_ALIGNERR,
+               MTK_STAR_REG_C_TX_UNI,
+               MTK_STAR_REG_C_TX_MULTI,
+               MTK_STAR_REG_C_TX_BROAD,
+               MTK_STAR_REG_C_TX_TIMEOUT,
+               MTK_STAR_REG_C_TX_LATECOL,
+               MTK_STAR_REG_C_RX_LENGTHERR,
+               MTK_STAR_REG_C_RX_TWIST,
+       };
+
+       unsigned int i, val;
+
+       for (i = 0; i < ARRAY_SIZE(counter_regs); i++)
+               regmap_read(priv->regs, counter_regs[i], &val);
+}
+
+static void mtk_star_update_stat(struct mtk_star_priv *priv,
+                                unsigned int reg, u64 *stat)
+{
+       unsigned int val;
+
+       regmap_read(priv->regs, reg, &val);
+       *stat += val;
+}
+
+/* Try to get as many stats as possible from the internal registers instead
+ * of tracking them ourselves.
+ */
+static void mtk_star_update_stats(struct mtk_star_priv *priv)
+{
+       struct rtnl_link_stats64 *stats = &priv->stats;
+
+       /* OK packets and bytes. */
+       mtk_star_update_stat(priv, MTK_STAR_REG_C_RXOKPKT, &stats->rx_packets);
+       mtk_star_update_stat(priv, MTK_STAR_REG_C_TXOKPKT, &stats->tx_packets);
+       mtk_star_update_stat(priv, MTK_STAR_REG_C_RXOKBYTE, &stats->rx_bytes);
+       mtk_star_update_stat(priv, MTK_STAR_REG_C_TXOKBYTE, &stats->tx_bytes);
+
+       /* RX & TX multicast. */
+       mtk_star_update_stat(priv, MTK_STAR_REG_C_RX_MULTI, &stats->multicast);
+       mtk_star_update_stat(priv, MTK_STAR_REG_C_TX_MULTI, &stats->multicast);
+
+       /* Collisions. */
+       mtk_star_update_stat(priv, MTK_STAR_REG_C_TXPAUSECOL,
+                            &stats->collisions);
+       mtk_star_update_stat(priv, MTK_STAR_REG_C_TX_LATECOL,
+                            &stats->collisions);
+       mtk_star_update_stat(priv, MTK_STAR_REG_C_RXRUNT, &stats->collisions);
+
+       /* RX Errors. */
+       mtk_star_update_stat(priv, MTK_STAR_REG_C_RX_LENGTHERR,
+                            &stats->rx_length_errors);
+       mtk_star_update_stat(priv, MTK_STAR_REG_C_RXLONG,
+                            &stats->rx_over_errors);
+       mtk_star_update_stat(priv, MTK_STAR_REG_C_RXCRC, &stats->rx_crc_errors);
+       mtk_star_update_stat(priv, MTK_STAR_REG_C_RX_ALIGNERR,
+                            &stats->rx_frame_errors);
+       mtk_star_update_stat(priv, MTK_STAR_REG_C_RXDROP,
+                            &stats->rx_fifo_errors);
+       /* Sum of the general RX error counter + all of the above. */
+       mtk_star_update_stat(priv, MTK_STAR_REG_C_RX_RERR, &stats->rx_errors);
+       stats->rx_errors += stats->rx_length_errors;
+       stats->rx_errors += stats->rx_over_errors;
+       stats->rx_errors += stats->rx_crc_errors;
+       stats->rx_errors += stats->rx_frame_errors;
+       stats->rx_errors += stats->rx_fifo_errors;
+}
+
+/* This runs in process context; parallel TX and RX paths executing in napi
+ * context may result in losing some stats data, but this should happen
+ * seldom enough to be acceptable.
+ */
+static void mtk_star_update_stats_work(struct work_struct *work)
+{
+       struct mtk_star_priv *priv = container_of(work, struct mtk_star_priv,
+                                                stats_work);
+
+       mtk_star_update_stats(priv);
+       mtk_star_reset_counters(priv);
+       mtk_star_intr_enable_stats(priv);
+}
+
+static struct sk_buff *mtk_star_alloc_skb(struct net_device *ndev)
+{
+       uintptr_t tail, offset;
+       struct sk_buff *skb;
+
+       skb = dev_alloc_skb(MTK_STAR_MAX_FRAME_SIZE);
+       if (!skb)
+               return NULL;
+
+       /* Align to 16 bytes. */
+       tail = (uintptr_t)skb_tail_pointer(skb);
+       if (tail & (MTK_STAR_SKB_ALIGNMENT - 1)) {
+               offset = tail & (MTK_STAR_SKB_ALIGNMENT - 1);
+               skb_reserve(skb, MTK_STAR_SKB_ALIGNMENT - offset);
+       }
+
+       /* Reserve two more bytes: eth_type_trans() will pull the 14-byte
+        * Ethernet header, leaving the IP header 16-byte aligned.
+        */
+       skb_reserve(skb, MTK_STAR_IP_ALIGN);
+
+       return skb;
+}
+
+static int mtk_star_prepare_rx_skbs(struct net_device *ndev)
+{
+       struct mtk_star_priv *priv = netdev_priv(ndev);
+       struct mtk_star_ring *ring = &priv->rx_ring;
+       struct device *dev = mtk_star_get_dev(priv);
+       struct mtk_star_ring_desc *desc;
+       struct sk_buff *skb;
+       dma_addr_t dma_addr;
+       int i;
+
+       for (i = 0; i < MTK_STAR_NUM_RX_DESCS; i++) {
+               skb = mtk_star_alloc_skb(ndev);
+               if (!skb)
+                       return -ENOMEM;
+
+               dma_addr = mtk_star_dma_map_rx(priv, skb);
+               if (dma_mapping_error(dev, dma_addr)) {
+                       dev_kfree_skb(skb);
+                       return -ENOMEM;
+               }
+
+               desc = &ring->descs[i];
+               desc->data_ptr = dma_addr;
+               desc->status |= skb_tailroom(skb) & MTK_STAR_DESC_MSK_LEN;
+               desc->status &= ~MTK_STAR_DESC_BIT_COWN;
+               ring->skbs[i] = skb;
+               ring->dma_addrs[i] = dma_addr;
+       }
+
+       return 0;
+}
+
+static void
+mtk_star_ring_free_skbs(struct mtk_star_priv *priv, struct mtk_star_ring *ring,
+                       void (*unmap_func)(struct mtk_star_priv *,
+                                          struct mtk_star_ring_desc_data *))
+{
+       struct mtk_star_ring_desc_data desc_data;
+       struct mtk_star_ring_desc *desc;
+       int i;
+
+       for (i = 0; i < MTK_STAR_RING_NUM_DESCS; i++) {
+               if (!ring->dma_addrs[i])
+                       continue;
+
+               desc = &ring->descs[i];
+
+               desc_data.dma_addr = ring->dma_addrs[i];
+               desc_data.skb = ring->skbs[i];
+
+               unmap_func(priv, &desc_data);
+               dev_kfree_skb(desc_data.skb);
+       }
+}
+
+static void mtk_star_free_rx_skbs(struct mtk_star_priv *priv)
+{
+       struct mtk_star_ring *ring = &priv->rx_ring;
+
+       mtk_star_ring_free_skbs(priv, ring, mtk_star_dma_unmap_rx);
+}
+
+static void mtk_star_free_tx_skbs(struct mtk_star_priv *priv)
+{
+       struct mtk_star_ring *ring = &priv->tx_ring;
+
+       mtk_star_ring_free_skbs(priv, ring, mtk_star_dma_unmap_tx);
+}
+
+/* All processing for TX and RX happens in the napi poll callback. */
+static irqreturn_t mtk_star_handle_irq(int irq, void *data)
+{
+       struct mtk_star_priv *priv;
+       struct net_device *ndev;
+       bool need_napi = false;
+       unsigned int status;
+
+       ndev = data;
+       priv = netdev_priv(ndev);
+
+       if (netif_running(ndev)) {
+               status = mtk_star_intr_read(priv);
+
+               if (status & MTK_STAR_BIT_INT_STS_TNTC) {
+                       mtk_star_intr_disable_tx(priv);
+                       need_napi = true;
+               }
+
+               if (status & MTK_STAR_BIT_INT_STS_FNRC) {
+                       mtk_star_intr_disable_rx(priv);
+                       need_napi = true;
+               }
+
+               if (need_napi)
+                       napi_schedule(&priv->napi);
+
+               /* One of the counters reached 0x8000000 - update stats and
+                * reset all counters.
+                */
+               if (unlikely(status & MTK_STAR_REG_INT_STS_MIB_CNT_TH)) {
+                       mtk_star_intr_disable_stats(priv);
+                       schedule_work(&priv->stats_work);
+               }
+
+               mtk_star_intr_ack_all(priv);
+       }
+
+       return IRQ_HANDLED;
+}
+
+/* Wait for the completion of any previous command - CMD_START bit must be
+ * cleared by hardware.
+ */
+static int mtk_star_hash_wait_cmd_start(struct mtk_star_priv *priv)
+{
+       unsigned int val;
+
+       return regmap_read_poll_timeout_atomic(priv->regs,
+                               MTK_STAR_REG_HASH_CTRL, val,
+                               !(val & MTK_STAR_BIT_HASH_CTRL_CMD_START),
+                               10, MTK_STAR_WAIT_TIMEOUT);
+}
+
+static int mtk_star_hash_wait_ok(struct mtk_star_priv *priv)
+{
+       unsigned int val;
+       int ret;
+
+       /* Wait for BIST_DONE bit. */
+       ret = regmap_read_poll_timeout_atomic(priv->regs,
+                                       MTK_STAR_REG_HASH_CTRL, val,
+                                       val & MTK_STAR_BIT_HASH_CTRL_BIST_DONE,
+                                       10, MTK_STAR_WAIT_TIMEOUT);
+       if (ret)
+               return ret;
+
+       /* Check the BIST_OK bit. */
+       regmap_read(priv->regs, MTK_STAR_REG_HASH_CTRL, &val);
+       if (!(val & MTK_STAR_BIT_HASH_CTRL_BIST_OK))
+               return -EIO;
+
+       return 0;
+}
+
+static int mtk_star_set_hashbit(struct mtk_star_priv *priv,
+                               unsigned int hash_addr)
+{
+       unsigned int val;
+       int ret;
+
+       ret = mtk_star_hash_wait_cmd_start(priv);
+       if (ret)
+               return ret;
+
+       val = hash_addr & MTK_STAR_MSK_HASH_CTRL_HASH_BIT_ADDR;
+       val |= MTK_STAR_BIT_HASH_CTRL_ACC_CMD;
+       val |= MTK_STAR_BIT_HASH_CTRL_CMD_START;
+       val |= MTK_STAR_BIT_HASH_CTRL_BIST_EN;
+       val |= MTK_STAR_BIT_HASH_CTRL_HASH_BIT_DATA;
+       regmap_write(priv->regs, MTK_STAR_REG_HASH_CTRL, val);
+
+       return mtk_star_hash_wait_ok(priv);
+}
+
+static int mtk_star_reset_hash_table(struct mtk_star_priv *priv)
+{
+       int ret;
+
+       ret = mtk_star_hash_wait_cmd_start(priv);
+       if (ret)
+               return ret;
+
+       regmap_update_bits(priv->regs, MTK_STAR_REG_HASH_CTRL,
+                          MTK_STAR_BIT_HASH_CTRL_BIST_EN,
+                          MTK_STAR_BIT_HASH_CTRL_BIST_EN);
+       regmap_update_bits(priv->regs, MTK_STAR_REG_TEST1,
+                          MTK_STAR_BIT_TEST1_RST_HASH_MBIST,
+                          MTK_STAR_BIT_TEST1_RST_HASH_MBIST);
+
+       return mtk_star_hash_wait_ok(priv);
+}
+
+static void mtk_star_phy_config(struct mtk_star_priv *priv)
+{
+       unsigned int val;
+
+       if (priv->speed == SPEED_1000)
+               val = MTK_STAR_VAL_PHY_CTRL1_FORCE_SPD_1000M;
+       else if (priv->speed == SPEED_100)
+               val = MTK_STAR_VAL_PHY_CTRL1_FORCE_SPD_100M;
+       else
+               val = MTK_STAR_VAL_PHY_CTRL1_FORCE_SPD_10M;
+       val <<= MTK_STAR_OFF_PHY_CTRL1_FORCE_SPD;
+
+       val |= MTK_STAR_BIT_PHY_CTRL1_AN_EN;
+       val |= MTK_STAR_BIT_PHY_CTRL1_FORCE_FC_RX;
+       val |= MTK_STAR_BIT_PHY_CTRL1_FORCE_FC_TX;
+       /* Only full-duplex supported for now. */
+       val |= MTK_STAR_BIT_PHY_CTRL1_FORCE_DPX;
+
+       regmap_write(priv->regs, MTK_STAR_REG_PHY_CTRL1, val);
+
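+       /* When pause was negotiated, program the 2K pause-frame send
+        * threshold here and the 1K release threshold below; clear both
+        * thresholds otherwise.
+        */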
+       if (priv->pause) {
+               val = MTK_STAR_VAL_FC_CFG_SEND_PAUSE_TH_2K;
+               val <<= MTK_STAR_OFF_FC_CFG_SEND_PAUSE_TH;
+               val |= MTK_STAR_BIT_FC_CFG_UC_PAUSE_DIR;
+       } else {
+               val = 0;
+       }
+
+       regmap_update_bits(priv->regs, MTK_STAR_REG_FC_CFG,
+                          MTK_STAR_MSK_FC_CFG_SEND_PAUSE_TH |
+                          MTK_STAR_BIT_FC_CFG_UC_PAUSE_DIR, val);
+
+       if (priv->pause) {
+               val = MTK_STAR_VAL_EXT_CFG_SND_PAUSE_RLS_1K;
+               val <<= MTK_STAR_OFF_EXT_CFG_SND_PAUSE_RLS;
+       } else {
+               val = 0;
+       }
+
+       regmap_update_bits(priv->regs, MTK_STAR_REG_EXT_CFG,
+                          MTK_STAR_MSK_EXT_CFG_SND_PAUSE_RLS, val);
+}
+
+static void mtk_star_adjust_link(struct net_device *ndev)
+{
+       struct mtk_star_priv *priv = netdev_priv(ndev);
+       struct phy_device *phydev = priv->phydev;
+       bool new_state = false;
+
+       if (phydev->link) {
+               if (!priv->link) {
+                       priv->link = phydev->link;
+                       new_state = true;
+               }
+
+               if (priv->speed != phydev->speed) {
+                       priv->speed = phydev->speed;
+                       new_state = true;
+               }
+
+               if (priv->pause != phydev->pause) {
+                       priv->pause = phydev->pause;
+                       new_state = true;
+               }
+       } else {
+               if (priv->link) {
+                       priv->link = phydev->link;
+                       new_state = true;
+               }
+       }
+
+       if (new_state) {
+               if (phydev->link)
+                       mtk_star_phy_config(priv);
+
+               phy_print_status(ndev->phydev);
+       }
+}
+
+static void mtk_star_init_config(struct mtk_star_priv *priv)
+{
+       unsigned int val;
+
+       val = (MTK_STAR_BIT_MII_PAD_OUT_ENABLE |
+              MTK_STAR_BIT_EXT_MDC_MODE |
+              MTK_STAR_BIT_SWC_MII_MODE);
+
+       regmap_write(priv->regs, MTK_STAR_REG_SYS_CONF, val);
+       regmap_update_bits(priv->regs, MTK_STAR_REG_MAC_CLK_CONF,
+                          MTK_STAR_MSK_MAC_CLK_CONF,
+                          MTK_STAR_BIT_CLK_DIV_10);
+}
+
+static void mtk_star_set_mode_rmii(struct mtk_star_priv *priv)
+{
+       regmap_update_bits(priv->pericfg, MTK_PERICFG_REG_NIC_CFG_CON,
+                          MTK_PERICFG_MSK_NIC_CFG_CON_CFG_MII,
+                          MTK_PERICFG_BIT_NIC_CFG_CON_RMII);
+}
+
+static int mtk_star_enable(struct net_device *ndev)
+{
+       struct mtk_star_priv *priv = netdev_priv(ndev);
+       unsigned int val;
+       int ret;
+
+       mtk_star_nic_disable_pd(priv);
+       mtk_star_intr_disable(priv);
+       mtk_star_dma_stop(priv);
+
+       mtk_star_set_mac_addr(ndev);
+
+       /* Configure the MAC */
+       val = MTK_STAR_VAL_MAC_CFG_IPG_96BIT;
+       val <<= MTK_STAR_OFF_MAC_CFG_IPG;
+       val |= MTK_STAR_BIT_MAC_CFG_MAXLEN_1522;
+       val |= MTK_STAR_BIT_MAC_CFG_AUTO_PAD;
+       val |= MTK_STAR_BIT_MAC_CFG_CRC_STRIP;
+       regmap_write(priv->regs, MTK_STAR_REG_MAC_CFG, val);
+
+       /* Enable Hash Table BIST and reset it */
+       ret = mtk_star_reset_hash_table(priv);
+       if (ret)
+               return ret;
+
+       /* Setup the hashing algorithm */
+       regmap_update_bits(priv->regs, MTK_STAR_REG_ARL_CFG,
+                          MTK_STAR_BIT_ARL_CFG_HASH_ALG |
+                          MTK_STAR_BIT_ARL_CFG_MISC_MODE, 0);
+
+       /* Don't strip VLAN tags */
+       regmap_update_bits(priv->regs, MTK_STAR_REG_MAC_CFG,
+                          MTK_STAR_BIT_MAC_CFG_VLAN_STRIP, 0);
+
+       /* Setup DMA */
+       mtk_star_dma_init(priv);
+
+       ret = mtk_star_prepare_rx_skbs(ndev);
+       if (ret)
+               goto err_out;
+
+       /* Request the interrupt */
+       ret = request_irq(ndev->irq, mtk_star_handle_irq,
+                         IRQF_TRIGGER_FALLING, ndev->name, ndev);
+       if (ret)
+               goto err_free_skbs;
+
+       napi_enable(&priv->napi);
+
+       mtk_star_intr_ack_all(priv);
+       mtk_star_intr_enable(priv);
+
+       /* Connect to and start PHY */
+       priv->phydev = of_phy_connect(ndev, priv->phy_node,
+                                     mtk_star_adjust_link, 0, priv->phy_intf);
+       if (!priv->phydev) {
+               netdev_err(ndev, "failed to connect to PHY\n");
+               goto err_free_irq;
+       }
+
+       mtk_star_dma_start(priv);
+       phy_start(priv->phydev);
+       netif_start_queue(ndev);
+
+       return 0;
+
+err_free_irq:
+       free_irq(ndev->irq, ndev);
+err_free_skbs:
+       mtk_star_free_rx_skbs(priv);
+err_out:
+       return ret;
+}
+
+static void mtk_star_disable(struct net_device *ndev)
+{
+       struct mtk_star_priv *priv = netdev_priv(ndev);
+
+       netif_stop_queue(ndev);
+       napi_disable(&priv->napi);
+       mtk_star_intr_disable(priv);
+       mtk_star_dma_disable(priv);
+       mtk_star_intr_ack_all(priv);
+       phy_stop(priv->phydev);
+       phy_disconnect(priv->phydev);
+       free_irq(ndev->irq, ndev);
+       mtk_star_free_rx_skbs(priv);
+       mtk_star_free_tx_skbs(priv);
+}
+
+static int mtk_star_netdev_open(struct net_device *ndev)
+{
+       return mtk_star_enable(ndev);
+}
+
+static int mtk_star_netdev_stop(struct net_device *ndev)
+{
+       mtk_star_disable(ndev);
+
+       return 0;
+}
+
+static int mtk_star_netdev_ioctl(struct net_device *ndev,
+                                struct ifreq *req, int cmd)
+{
+       if (!netif_running(ndev))
+               return -EINVAL;
+
+       return phy_mii_ioctl(ndev->phydev, req, cmd);
+}
+
+static int mtk_star_netdev_start_xmit(struct sk_buff *skb,
+                                     struct net_device *ndev)
+{
+       struct mtk_star_priv *priv = netdev_priv(ndev);
+       struct mtk_star_ring *ring = &priv->tx_ring;
+       struct device *dev = mtk_star_get_dev(priv);
+       struct mtk_star_ring_desc_data desc_data;
+
+       desc_data.dma_addr = mtk_star_dma_map_tx(priv, skb);
+       if (dma_mapping_error(dev, desc_data.dma_addr))
+               goto err_drop_packet;
+
+       desc_data.skb = skb;
+       desc_data.len = skb->len;
+
+       spin_lock_bh(&priv->lock);
+
+       mtk_star_ring_push_head_tx(ring, &desc_data);
+
+       netdev_sent_queue(ndev, skb->len);
+
+       if (mtk_star_ring_full(ring))
+               netif_stop_queue(ndev);
+
+       spin_unlock_bh(&priv->lock);
+
+       mtk_star_dma_resume_tx(priv);
+
+       return NETDEV_TX_OK;
+
+err_drop_packet:
+       dev_kfree_skb(skb);
+       ndev->stats.tx_dropped++;
+       /* The skb was consumed above, so don't ask the stack to retry it. */
+       return NETDEV_TX_OK;
+}
+
+/* Returns the number of bytes sent, or a negative number if the descriptor
+ * at the ring's tail is still owned by the DMA engine.
+ */
+static int mtk_star_tx_complete_one(struct mtk_star_priv *priv)
+{
+       struct mtk_star_ring *ring = &priv->tx_ring;
+       struct mtk_star_ring_desc_data desc_data;
+       int ret;
+
+       ret = mtk_star_ring_pop_tail(ring, &desc_data);
+       if (ret)
+               return ret;
+
+       mtk_star_dma_unmap_tx(priv, &desc_data);
+       ret = desc_data.skb->len;
+       dev_kfree_skb_irq(desc_data.skb);
+
+       return ret;
+}
+
+static void mtk_star_tx_complete_all(struct mtk_star_priv *priv)
+{
+       struct mtk_star_ring *ring = &priv->tx_ring;
+       struct net_device *ndev = priv->ndev;
+       int ret, pkts_compl, bytes_compl;
+       bool wake = false;
+
+       spin_lock(&priv->lock);
+
+       for (pkts_compl = 0, bytes_compl = 0;;
+            pkts_compl++, bytes_compl += ret, wake = true) {
+               if (!mtk_star_ring_descs_available(ring))
+                       break;
+
+               ret = mtk_star_tx_complete_one(priv);
+               if (ret < 0)
+                       break;
+       }
+
+       netdev_completed_queue(ndev, pkts_compl, bytes_compl);
+
+       if (wake && netif_queue_stopped(ndev))
+               netif_wake_queue(ndev);
+
+       mtk_star_intr_enable_tx(priv);
+
+       spin_unlock(&priv->lock);
+}
+
+static void mtk_star_netdev_get_stats64(struct net_device *ndev,
+                                       struct rtnl_link_stats64 *stats)
+{
+       struct mtk_star_priv *priv = netdev_priv(ndev);
+
+       mtk_star_update_stats(priv);
+
+       memcpy(stats, &priv->stats, sizeof(*stats));
+}
+
+static void mtk_star_set_rx_mode(struct net_device *ndev)
+{
+       struct mtk_star_priv *priv = netdev_priv(ndev);
+       struct netdev_hw_addr *hw_addr;
+       unsigned int hash_addr, i;
+       int ret;
+
+       if (ndev->flags & IFF_PROMISC) {
+               regmap_update_bits(priv->regs, MTK_STAR_REG_ARL_CFG,
+                                  MTK_STAR_BIT_ARL_CFG_MISC_MODE,
+                                  MTK_STAR_BIT_ARL_CFG_MISC_MODE);
+       } else if (netdev_mc_count(ndev) > MTK_STAR_HASHTABLE_MC_LIMIT ||
+                  ndev->flags & IFF_ALLMULTI) {
+               for (i = 0; i < MTK_STAR_HASHTABLE_SIZE_MAX; i++) {
+                       ret = mtk_star_set_hashbit(priv, i);
+                       if (ret)
+                               goto hash_fail;
+               }
+       } else {
+               /* Clear previous settings. */
+               ret = mtk_star_reset_hash_table(priv);
+               if (ret)
+                       goto hash_fail;
+
+               netdev_for_each_mc_addr(hw_addr, ndev) {
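+                       /* The 9-bit hash index is built from the multicast
+                        * bit of the first address octet and the last octet
+                        * of the address.
+                        */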
+                       hash_addr = (hw_addr->addr[0] & 0x01) << 8;
+                       hash_addr += hw_addr->addr[5];
+                       ret = mtk_star_set_hashbit(priv, hash_addr);
+                       if (ret)
+                               goto hash_fail;
+               }
+       }
+
+       return;
+
+hash_fail:
+       if (ret == -ETIMEDOUT)
+               netdev_err(ndev, "setting hash bit timed out\n");
+       else
+               /* Should be -EIO */
+               netdev_err(ndev, "unable to set hash bit\n");
+}
+
+static const struct net_device_ops mtk_star_netdev_ops = {
+       .ndo_open               = mtk_star_netdev_open,
+       .ndo_stop               = mtk_star_netdev_stop,
+       .ndo_start_xmit         = mtk_star_netdev_start_xmit,
+       .ndo_get_stats64        = mtk_star_netdev_get_stats64,
+       .ndo_set_rx_mode        = mtk_star_set_rx_mode,
+       .ndo_do_ioctl           = mtk_star_netdev_ioctl,
+       .ndo_set_mac_address    = eth_mac_addr,
+       .ndo_validate_addr      = eth_validate_addr,
+};
+
+static void mtk_star_get_drvinfo(struct net_device *dev,
+                                struct ethtool_drvinfo *info)
+{
+       strlcpy(info->driver, MTK_STAR_DRVNAME, sizeof(info->driver));
+}
+
+/* TODO Add ethtool stats. */
+static const struct ethtool_ops mtk_star_ethtool_ops = {
+       .get_drvinfo            = mtk_star_get_drvinfo,
+       .get_link               = ethtool_op_get_link,
+       .get_link_ksettings     = phy_ethtool_get_link_ksettings,
+       .set_link_ksettings     = phy_ethtool_set_link_ksettings,
+};
+
+static int mtk_star_receive_packet(struct mtk_star_priv *priv)
+{
+       struct mtk_star_ring *ring = &priv->rx_ring;
+       struct device *dev = mtk_star_get_dev(priv);
+       struct mtk_star_ring_desc_data desc_data;
+       struct net_device *ndev = priv->ndev;
+       struct sk_buff *curr_skb, *new_skb;
+       dma_addr_t new_dma_addr;
+       int ret;
+
+       spin_lock(&priv->lock);
+       ret = mtk_star_ring_pop_tail(ring, &desc_data);
+       spin_unlock(&priv->lock);
+       if (ret)
+               return -1;
+
+       curr_skb = desc_data.skb;
+
+       if ((desc_data.flags & MTK_STAR_DESC_BIT_RX_CRCE) ||
+           (desc_data.flags & MTK_STAR_DESC_BIT_RX_OSIZE)) {
+               /* Error packet -> drop it and reuse the skb together with
+                * its existing DMA mapping.
+                */
+               new_dma_addr = desc_data.dma_addr;
+               new_skb = curr_skb;
+               goto push_new_skb;
+       }
+
+       /* Prepare new skb before receiving the current one. Reuse the current
+        * skb if we fail at any point.
+        */
+       new_skb = mtk_star_alloc_skb(ndev);
+       if (!new_skb) {
+               ndev->stats.rx_dropped++;
+               new_dma_addr = desc_data.dma_addr;
+               new_skb = curr_skb;
+               goto push_new_skb;
+       }
+
+       new_dma_addr = mtk_star_dma_map_rx(priv, new_skb);
+       if (dma_mapping_error(dev, new_dma_addr)) {
+               ndev->stats.rx_dropped++;
+               dev_kfree_skb(new_skb);
+               new_dma_addr = desc_data.dma_addr;
+               new_skb = curr_skb;
+               netdev_err(ndev, "DMA mapping error of RX descriptor\n");
+               goto push_new_skb;
+       }
+
+       /* We can't fail anymore at this point: it's safe to unmap the skb. */
+       mtk_star_dma_unmap_rx(priv, &desc_data);
+
+       skb_put(desc_data.skb, desc_data.len);
+       desc_data.skb->ip_summed = CHECKSUM_NONE;
+       desc_data.skb->protocol = eth_type_trans(desc_data.skb, ndev);
+       desc_data.skb->dev = ndev;
+       netif_receive_skb(desc_data.skb);
+
+push_new_skb:
+       desc_data.dma_addr = new_dma_addr;
+       desc_data.len = skb_tailroom(new_skb);
+       desc_data.skb = new_skb;
+
+       spin_lock(&priv->lock);
+       mtk_star_ring_push_head_rx(ring, &desc_data);
+       spin_unlock(&priv->lock);
+
+       return 0;
+}
+
+static int mtk_star_process_rx(struct mtk_star_priv *priv, int budget)
+{
+       int received, ret;
+
+       for (received = 0, ret = 0; received < budget && ret == 0; received++)
+               ret = mtk_star_receive_packet(priv);
+
+       mtk_star_dma_resume_rx(priv);
+
+       return received;
+}
+
+static int mtk_star_poll(struct napi_struct *napi, int budget)
+{
+       struct mtk_star_priv *priv;
+       int received = 0;
+
+       priv = container_of(napi, struct mtk_star_priv, napi);
+
+       /* Clean-up all TX descriptors. */
+       mtk_star_tx_complete_all(priv);
+       /* Receive up to $budget packets. */
+       received = mtk_star_process_rx(priv, budget);
+
+       if (received < budget) {
+               napi_complete_done(napi, received);
+               mtk_star_intr_enable_rx(priv);
+       }
+
+       return received;
+}
+
+static void mtk_star_mdio_rwok_clear(struct mtk_star_priv *priv)
+{
+       regmap_write(priv->regs, MTK_STAR_REG_PHY_CTRL0,
+                    MTK_STAR_BIT_PHY_CTRL0_RWOK);
+}
+
+static int mtk_star_mdio_rwok_wait(struct mtk_star_priv *priv)
+{
+       unsigned int val;
+
+       return regmap_read_poll_timeout(priv->regs, MTK_STAR_REG_PHY_CTRL0,
+                                       val, val & MTK_STAR_BIT_PHY_CTRL0_RWOK,
+                                       10, MTK_STAR_WAIT_TIMEOUT);
+}
+
+static int mtk_star_mdio_read(struct mii_bus *mii, int phy_id, int regnum)
+{
+       struct mtk_star_priv *priv = mii->priv;
+       unsigned int val, data;
+       int ret;
+
+       if (regnum & MII_ADDR_C45)
+               return -EOPNOTSUPP;
+
+       mtk_star_mdio_rwok_clear(priv);
+
+       val = (regnum << MTK_STAR_OFF_PHY_CTRL0_PREG);
+       val &= MTK_STAR_MSK_PHY_CTRL0_PREG;
+       val |= MTK_STAR_BIT_PHY_CTRL0_RDCMD;
+
+       regmap_write(priv->regs, MTK_STAR_REG_PHY_CTRL0, val);
+
+       ret = mtk_star_mdio_rwok_wait(priv);
+       if (ret)
+               return ret;
+
+       regmap_read(priv->regs, MTK_STAR_REG_PHY_CTRL0, &data);
+
+       data &= MTK_STAR_MSK_PHY_CTRL0_RWDATA;
+       data >>= MTK_STAR_OFF_PHY_CTRL0_RWDATA;
+
+       return data;
+}
+
+static int mtk_star_mdio_write(struct mii_bus *mii, int phy_id,
+                              int regnum, u16 data)
+{
+       struct mtk_star_priv *priv = mii->priv;
+       unsigned int val;
+
+       if (regnum & MII_ADDR_C45)
+               return -EOPNOTSUPP;
+
+       mtk_star_mdio_rwok_clear(priv);
+
+       val = data;
+       val <<= MTK_STAR_OFF_PHY_CTRL0_RWDATA;
+       val &= MTK_STAR_MSK_PHY_CTRL0_RWDATA;
+       regnum <<= MTK_STAR_OFF_PHY_CTRL0_PREG;
+       regnum &= MTK_STAR_MSK_PHY_CTRL0_PREG;
+       val |= regnum;
+       val |= MTK_STAR_BIT_PHY_CTRL0_WTCMD;
+
+       regmap_write(priv->regs, MTK_STAR_REG_PHY_CTRL0, val);
+
+       return mtk_star_mdio_rwok_wait(priv);
+}
+
+static int mtk_star_mdio_init(struct net_device *ndev)
+{
+       struct mtk_star_priv *priv = netdev_priv(ndev);
+       struct device *dev = mtk_star_get_dev(priv);
+       struct device_node *of_node, *mdio_node;
+       int ret;
+
+       of_node = dev->of_node;
+
+       mdio_node = of_get_child_by_name(of_node, "mdio");
+       if (!mdio_node)
+               return -ENODEV;
+
+       if (!of_device_is_available(mdio_node)) {
+               ret = -ENODEV;
+               goto out_put_node;
+       }
+
+       priv->mii = devm_mdiobus_alloc(dev);
+       if (!priv->mii) {
+               ret = -ENOMEM;
+               goto out_put_node;
+       }
+
+       snprintf(priv->mii->id, MII_BUS_ID_SIZE, "%s", dev_name(dev));
+       priv->mii->name = "mtk-mac-mdio";
+       priv->mii->parent = dev;
+       priv->mii->read = mtk_star_mdio_read;
+       priv->mii->write = mtk_star_mdio_write;
+       priv->mii->priv = priv;
+
+       ret = of_mdiobus_register(priv->mii, mdio_node);
+
+out_put_node:
+       of_node_put(mdio_node);
+       return ret;
+}
+
+static int mtk_star_suspend(struct device *dev)
+{
+       struct mtk_star_priv *priv;
+       struct net_device *ndev;
+
+       ndev = dev_get_drvdata(dev);
+       priv = netdev_priv(ndev);
+
+       if (netif_running(ndev))
+               mtk_star_disable(ndev);
+
+       clk_bulk_disable_unprepare(MTK_STAR_NCLKS, priv->clks);
+
+       return 0;
+}
+
+static int mtk_star_resume(struct device *dev)
+{
+       struct mtk_star_priv *priv;
+       struct net_device *ndev;
+       int ret;
+
+       ndev = dev_get_drvdata(dev);
+       priv = netdev_priv(ndev);
+
+       ret = clk_bulk_prepare_enable(MTK_STAR_NCLKS, priv->clks);
+       if (ret)
+               return ret;
+
+       if (netif_running(ndev)) {
+               ret = mtk_star_enable(ndev);
+               if (ret)
+                       clk_bulk_disable_unprepare(MTK_STAR_NCLKS, priv->clks);
+       }
+
+       return ret;
+}
+
+static void mtk_star_clk_disable_unprepare(void *data)
+{
+       struct mtk_star_priv *priv = data;
+
+       clk_bulk_disable_unprepare(MTK_STAR_NCLKS, priv->clks);
+}
+
+static void mtk_star_mdiobus_unregister(void *data)
+{
+       struct mtk_star_priv *priv = data;
+
+       mdiobus_unregister(priv->mii);
+}
+
+static void mtk_star_unregister_netdev(void *data)
+{
+       struct net_device *ndev = data;
+
+       unregister_netdev(ndev);
+}
+
+static int mtk_star_probe(struct platform_device *pdev)
+{
+       struct device_node *of_node;
+       struct mtk_star_priv *priv;
+       struct net_device *ndev;
+       struct device *dev;
+       void __iomem *base;
+       int ret, i;
+
+       dev = &pdev->dev;
+       of_node = dev->of_node;
+
+       ndev = devm_alloc_etherdev(dev, sizeof(*priv));
+       if (!ndev)
+               return -ENOMEM;
+
+       priv = netdev_priv(ndev);
+       priv->ndev = ndev;
+       SET_NETDEV_DEV(ndev, dev);
+       platform_set_drvdata(pdev, ndev);
+
+       ndev->min_mtu = ETH_ZLEN;
+       ndev->max_mtu = MTK_STAR_MAX_FRAME_SIZE;
+
+       spin_lock_init(&priv->lock);
+       INIT_WORK(&priv->stats_work, mtk_star_update_stats_work);
+
+       base = devm_platform_ioremap_resource(pdev, 0);
+       if (IS_ERR(base))
+               return PTR_ERR(base);
+
+       /* We won't be checking the return values of regmap read & write
+        * functions. They can only fail for mmio if there's a clock attached
+        * to the regmap, which is not the case here.
+        */
+       priv->regs = devm_regmap_init_mmio(dev, base,
+                                          &mtk_star_regmap_config);
+       if (IS_ERR(priv->regs))
+               return PTR_ERR(priv->regs);
+
+       priv->pericfg = syscon_regmap_lookup_by_phandle(of_node,
+                                                       "mediatek,pericfg");
+       if (IS_ERR(priv->pericfg)) {
+               dev_err(dev, "Failed to lookup the PERICFG syscon\n");
+               return PTR_ERR(priv->pericfg);
+       }
+
+       ndev->irq = platform_get_irq(pdev, 0);
+       if (ndev->irq < 0)
+               return ndev->irq;
+
+       for (i = 0; i < MTK_STAR_NCLKS; i++)
+               priv->clks[i].id = mtk_star_clk_names[i];
+       ret = devm_clk_bulk_get(dev, MTK_STAR_NCLKS, priv->clks);
+       if (ret)
+               return ret;
+
+       ret = clk_bulk_prepare_enable(MTK_STAR_NCLKS, priv->clks);
+       if (ret)
+               return ret;
+
+       ret = devm_add_action_or_reset(dev,
+                                      mtk_star_clk_disable_unprepare, priv);
+       if (ret)
+               return ret;
+
+       ret = of_get_phy_mode(of_node, &priv->phy_intf);
+       if (ret) {
+               return ret;
+       } else if (priv->phy_intf != PHY_INTERFACE_MODE_RMII) {
+               dev_err(dev, "unsupported phy mode: %s\n",
+                       phy_modes(priv->phy_intf));
+               return -EINVAL;
+       }
+
+       priv->phy_node = of_parse_phandle(of_node, "phy-handle", 0);
+       if (!priv->phy_node) {
+               dev_err(dev, "failed to retrieve the phy handle from device tree\n");
+               return -ENODEV;
+       }
+
+       mtk_star_set_mode_rmii(priv);
+
+       ret = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(32));
+       if (ret) {
+               dev_err(dev, "unsupported DMA mask\n");
+               return ret;
+       }
+
+       priv->ring_base = dmam_alloc_coherent(dev, MTK_STAR_DMA_SIZE,
+                                             &priv->dma_addr,
+                                             GFP_KERNEL | GFP_DMA);
+       if (!priv->ring_base)
+               return -ENOMEM;
+
+       mtk_star_nic_disable_pd(priv);
+       mtk_star_init_config(priv);
+
+       ret = mtk_star_mdio_init(ndev);
+       if (ret)
+               return ret;
+
+       ret = devm_add_action_or_reset(dev, mtk_star_mdiobus_unregister, priv);
+       if (ret)
+               return ret;
+
+       ret = eth_platform_get_mac_address(dev, ndev->dev_addr);
+       if (ret || !is_valid_ether_addr(ndev->dev_addr))
+               eth_hw_addr_random(ndev);
+
+       ndev->netdev_ops = &mtk_star_netdev_ops;
+       ndev->ethtool_ops = &mtk_star_ethtool_ops;
+
+       netif_napi_add(ndev, &priv->napi, mtk_star_poll, MTK_STAR_NAPI_WEIGHT);
+
+       ret = register_netdev(ndev);
+       if (ret)
+               return ret;
+
+       ret = devm_add_action_or_reset(dev, mtk_star_unregister_netdev, ndev);
+       if (ret)
+               return ret;
+
+       return 0;
+}
+
+static const struct of_device_id mtk_star_of_match[] = {
+       { .compatible = "mediatek,mt8516-eth", },
+       { .compatible = "mediatek,mt8518-eth", },
+       { .compatible = "mediatek,mt8175-eth", },
+       { }
+};
+MODULE_DEVICE_TABLE(of, mtk_star_of_match);
+
+static SIMPLE_DEV_PM_OPS(mtk_star_pm_ops,
+                        mtk_star_suspend, mtk_star_resume);
+
+static struct platform_driver mtk_star_driver = {
+       .driver = {
+               .name = MTK_STAR_DRVNAME,
+               .pm = &mtk_star_pm_ops,
+               .of_match_table = of_match_ptr(mtk_star_of_match),
+       },
+       .probe = mtk_star_probe,
+};
+module_platform_driver(mtk_star_driver);
+
+MODULE_AUTHOR("Bartosz Golaszewski <bgolaszewski@baylibre.com>");
+MODULE_DESCRIPTION("Mediatek STAR Ethernet MAC Driver");
+MODULE_LICENSE("GPL");
index 7d69a3061f1789d0804cff1f0c7f612ff02ed465..4256d59eca2b3b44f2b616652bbdf26851a35bdf 100644 (file)
@@ -78,9 +78,24 @@ config MLX5_ESWITCH
                Legacy SRIOV mode (L2 mac vlan steering based).
                Switchdev mode (eswitch offloads).
 
+config MLX5_CLS_ACT
+       bool "MLX5 TC classifier action support"
+       depends on MLX5_ESWITCH && NET_CLS_ACT
+       default y
+       help
+         mlx5 ConnectX offload support for the TC classifier action
+         (NET_CLS_ACT); works in both native NIC mode and Switchdev SRIOV mode.
+         Actions get attached to hardware-offloaded classifiers and are
+         invoked after a successful classification. Actions are used to
+         overwrite the classification result, instantly drop or redirect and/or
+         reformat packets at wire speed without involving the host CPU.
+
+         If set to N, TC offloads in both NIC and switchdev modes will be disabled.
+         If unsure, set to Y.
+
 config MLX5_TC_CT
        bool "MLX5 TC connection tracking offload support"
-       depends on MLX5_CORE_EN && NET_SWITCHDEV && NF_FLOW_TABLE && NET_ACT_CT && NET_TC_SKB_EXT
+       depends on MLX5_CLS_ACT && NF_FLOW_TABLE && NET_ACT_CT && NET_TC_SKB_EXT
        default y
        help
          Say Y here if you want to support offloading connection tracking rules
index d3c7dbd7f1d561acead7f33b149f3a05684611cc..e5ee9103fefbb1d4e266e7a9527a10780d85e2fd 100644 (file)
@@ -33,17 +33,19 @@ mlx5_core-$(CONFIG_MLX5_CORE_EN) += en_main.o en_common.o en_fs.o en_ethtool.o \
 mlx5_core-$(CONFIG_MLX5_EN_ARFS)     += en_arfs.o
 mlx5_core-$(CONFIG_MLX5_EN_RXNFC)    += en_fs_ethtool.o
 mlx5_core-$(CONFIG_MLX5_CORE_EN_DCB) += en_dcbnl.o en/port_buffer.o
-mlx5_core-$(CONFIG_MLX5_ESWITCH)     += en_rep.o en_tc.o en/tc_tun.o lib/port_tun.o lag_mp.o \
-                                       lib/geneve.o en/mapping.o en/tc_tun_vxlan.o en/tc_tun_gre.o \
-                                       en/tc_tun_geneve.o diag/en_tc_tracepoint.o
 mlx5_core-$(CONFIG_PCI_HYPERV_INTERFACE) += en/hv_vhca_stats.o
+mlx5_core-$(CONFIG_MLX5_ESWITCH)     += en_rep.o lib/geneve.o lib/port_tun.o lag_mp.o
+mlx5_core-$(CONFIG_MLX5_CLS_ACT)     += en_tc.o en/rep/tc.o en/rep/neigh.o \
+                                       en/mapping.o esw/chains.o en/tc_tun.o \
+                                       en/tc_tun_vxlan.o en/tc_tun_gre.o en/tc_tun_geneve.o \
+                                       en/tc_tun_mplsoudp.o diag/en_tc_tracepoint.o
 mlx5_core-$(CONFIG_MLX5_TC_CT)      += en/tc_ct.o
 
 #
 # Core extra
 #
 mlx5_core-$(CONFIG_MLX5_ESWITCH)   += eswitch.o eswitch_offloads.o eswitch_offloads_termtbl.o \
-                                     ecpf.o rdma.o esw/chains.o
+                                     ecpf.o rdma.o
 mlx5_core-$(CONFIG_MLX5_MPFS)      += lib/mpfs.o
 mlx5_core-$(CONFIG_VXLAN)          += lib/vxlan.o
 mlx5_core-$(CONFIG_PTP_1588_CLOCK) += lib/clock.o
index 81fd53569463fc25dcdea83e28ad984621523add..4906aee6798d207fd22a6319be39f35859f81bc4 100644 (file)
@@ -365,10 +365,7 @@ struct mlx5e_dma_info {
        dma_addr_t addr;
        union {
                struct page *page;
-               struct {
-                       u64 handle;
-                       void *data;
-               } xsk;
+               struct xdp_buff *xsk;
        };
 };
 
@@ -581,7 +578,6 @@ struct mlx5e_rq {
                } mpwqe;
        };
        struct {
-               u16            umem_headroom;
                u16            headroom;
                u32            frame0_sz;
                u8             map_dir;   /* dma map direction */
@@ -614,7 +610,6 @@ struct mlx5e_rq {
        struct page_pool      *page_pool;
 
        /* AF_XDP zero-copy */
-       struct zero_copy_allocator zca;
        struct xdp_umem       *umem;
 
        struct work_struct     recover_work;
index eb2e1f2138e458eab9edb29b88932b2b84e9dad5..38e4f19d69f86d108d1f21a4aabed405fd4f2722 100644 (file)
@@ -12,15 +12,16 @@ static inline bool mlx5e_rx_is_xdp(struct mlx5e_params *params,
 u16 mlx5e_get_linear_rq_headroom(struct mlx5e_params *params,
                                 struct mlx5e_xsk_param *xsk)
 {
-       u16 headroom = NET_IP_ALIGN;
+       u16 headroom;
 
-       if (mlx5e_rx_is_xdp(params, xsk)) {
+       if (xsk)
+               return xsk->headroom;
+
+       headroom = NET_IP_ALIGN;
+       if (mlx5e_rx_is_xdp(params, xsk))
                headroom += XDP_PACKET_HEADROOM;
-               if (xsk)
-                       headroom += xsk->headroom;
-       } else {
+       else
                headroom += MLX5_RX_HEADROOM;
-       }
 
        return headroom;
 }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/rep/neigh.c b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/neigh.c
new file mode 100644 (file)
index 0000000..baa1624
--- /dev/null
@@ -0,0 +1,368 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/* Copyright (c) 2020 Mellanox Technologies. */
+
+#include <linux/refcount.h>
+#include <linux/list.h>
+#include <linux/rculist.h>
+#include <linux/rtnetlink.h>
+#include <linux/workqueue.h>
+#include <linux/rwlock.h>
+#include <linux/spinlock.h>
+#include <linux/notifier.h>
+#include <net/netevent.h>
+#include "neigh.h"
+#include "tc.h"
+#include "en_rep.h"
+#include "fs_core.h"
+#include "diag/en_rep_tracepoint.h"
+
+static unsigned long mlx5e_rep_ipv6_interval(void)
+{
+       if (IS_ENABLED(CONFIG_IPV6) && ipv6_stub->nd_tbl)
+               return NEIGH_VAR(&ipv6_stub->nd_tbl->parms, DELAY_PROBE_TIME);
+
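+       /* Without IPv6, return the maximum so the IPv4 interval is used. */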
+       return ~0UL;
+}
+
+static void mlx5e_rep_neigh_update_init_interval(struct mlx5e_rep_priv *rpriv)
+{
+       unsigned long ipv4_interval = NEIGH_VAR(&arp_tbl.parms, DELAY_PROBE_TIME);
+       unsigned long ipv6_interval = mlx5e_rep_ipv6_interval();
+       struct net_device *netdev = rpriv->netdev;
+       struct mlx5e_priv *priv = netdev_priv(netdev);
+
+       rpriv->neigh_update.min_interval = min_t(unsigned long, ipv6_interval, ipv4_interval);
+       mlx5_fc_update_sampling_interval(priv->mdev, rpriv->neigh_update.min_interval);
+}
+
+void mlx5e_rep_queue_neigh_stats_work(struct mlx5e_priv *priv)
+{
+       struct mlx5e_rep_priv *rpriv = priv->ppriv;
+       struct mlx5e_neigh_update_table *neigh_update = &rpriv->neigh_update;
+
+       mlx5_fc_queue_stats_work(priv->mdev,
+                                &neigh_update->neigh_stats_work,
+                                neigh_update->min_interval);
+}
+
+static bool mlx5e_rep_neigh_entry_hold(struct mlx5e_neigh_hash_entry *nhe)
+{
+       return refcount_inc_not_zero(&nhe->refcnt);
+}
+
+static void mlx5e_rep_neigh_entry_remove(struct mlx5e_neigh_hash_entry *nhe);
+
+void mlx5e_rep_neigh_entry_release(struct mlx5e_neigh_hash_entry *nhe)
+{
+       if (refcount_dec_and_test(&nhe->refcnt)) {
+               mlx5e_rep_neigh_entry_remove(nhe);
+               kfree_rcu(nhe, rcu);
+       }
+}
+
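+/* Walk the representor's neigh list under RCU: return the next entry we
+ * could take a reference on, releasing the reference on the entry we were
+ * given.
+ */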
+static struct mlx5e_neigh_hash_entry *
+mlx5e_get_next_nhe(struct mlx5e_rep_priv *rpriv,
+                  struct mlx5e_neigh_hash_entry *nhe)
+{
+       struct mlx5e_neigh_hash_entry *next = NULL;
+
+       rcu_read_lock();
+
+       for (next = nhe ?
+                    list_next_or_null_rcu(&rpriv->neigh_update.neigh_list,
+                                          &nhe->neigh_list,
+                                          struct mlx5e_neigh_hash_entry,
+                                          neigh_list) :
+                    list_first_or_null_rcu(&rpriv->neigh_update.neigh_list,
+                                           struct mlx5e_neigh_hash_entry,
+                                           neigh_list);
+            next;
+            next = list_next_or_null_rcu(&rpriv->neigh_update.neigh_list,
+                                         &next->neigh_list,
+                                         struct mlx5e_neigh_hash_entry,
+                                         neigh_list))
+               if (mlx5e_rep_neigh_entry_hold(next))
+                       break;
+
+       rcu_read_unlock();
+
+       if (nhe)
+               mlx5e_rep_neigh_entry_release(nhe);
+
+       return next;
+}
+
+static void mlx5e_rep_neigh_stats_work(struct work_struct *work)
+{
+       struct mlx5e_rep_priv *rpriv = container_of(work, struct mlx5e_rep_priv,
+                                                   neigh_update.neigh_stats_work.work);
+       struct net_device *netdev = rpriv->netdev;
+       struct mlx5e_priv *priv = netdev_priv(netdev);
+       struct mlx5e_neigh_hash_entry *nhe = NULL;
+
+       rtnl_lock();
+       if (!list_empty(&rpriv->neigh_update.neigh_list))
+               mlx5e_rep_queue_neigh_stats_work(priv);
+
+       while ((nhe = mlx5e_get_next_nhe(rpriv, nhe)) != NULL)
+               mlx5e_tc_update_neigh_used_value(nhe);
+
+       rtnl_unlock();
+}
+
+static void mlx5e_rep_neigh_update(struct work_struct *work)
+{
+       struct mlx5e_neigh_hash_entry *nhe =
+               container_of(work, struct mlx5e_neigh_hash_entry, neigh_update_work);
+       struct neighbour *n = nhe->n;
+       struct mlx5e_encap_entry *e;
+       unsigned char ha[ETH_ALEN];
+       struct mlx5e_priv *priv;
+       bool neigh_connected;
+       u8 nud_state, dead;
+
+       rtnl_lock();
+
+       /* If these parameters are changed after we release the lock,
+        * we'll receive another event letting us know about it.
+        * We use this lock to avoid inconsistency between the neigh validity
+        * and its hw address.
+        */
+       read_lock_bh(&n->lock);
+       memcpy(ha, n->ha, ETH_ALEN);
+       nud_state = n->nud_state;
+       dead = n->dead;
+       read_unlock_bh(&n->lock);
+
+       neigh_connected = (nud_state & NUD_VALID) && !dead;
+
+       trace_mlx5e_rep_neigh_update(nhe, ha, neigh_connected);
+
+       list_for_each_entry(e, &nhe->encap_list, encap_list) {
+               if (!mlx5e_encap_take(e))
+                       continue;
+
+               priv = netdev_priv(e->out_dev);
+               mlx5e_rep_update_flows(priv, e, neigh_connected, ha);
+               mlx5e_encap_put(priv, e);
+       }
+       mlx5e_rep_neigh_entry_release(nhe);
+       rtnl_unlock();
+       neigh_release(n);
+}
+
+static void mlx5e_rep_queue_neigh_update_work(struct mlx5e_priv *priv,
+                                             struct mlx5e_neigh_hash_entry *nhe,
+                                             struct neighbour *n)
+{
+       /* Take a reference to ensure the neighbour and mlx5 encap
+        * entry won't be destructed until we drop the reference in
+        * delayed work.
+        */
+       neigh_hold(n);
+
+       /* This assignment is valid as long as the neigh reference
+        * is taken
+        */
+       nhe->n = n;
+
+       if (!queue_work(priv->wq, &nhe->neigh_update_work)) {
+               mlx5e_rep_neigh_entry_release(nhe);
+               neigh_release(n);
+       }
+}
+
+static int mlx5e_rep_netevent_event(struct notifier_block *nb,
+                                   unsigned long event, void *ptr)
+{
+       struct mlx5e_rep_priv *rpriv = container_of(nb, struct mlx5e_rep_priv,
+                                                   neigh_update.netevent_nb);
+       struct mlx5e_neigh_update_table *neigh_update = &rpriv->neigh_update;
+       struct net_device *netdev = rpriv->netdev;
+       struct mlx5e_priv *priv = netdev_priv(netdev);
+       struct mlx5e_neigh_hash_entry *nhe = NULL;
+       struct mlx5e_neigh m_neigh = {};
+       struct neigh_parms *p;
+       struct neighbour *n;
+       bool found = false;
+
+       switch (event) {
+       case NETEVENT_NEIGH_UPDATE:
+               n = ptr;
+#if IS_ENABLED(CONFIG_IPV6)
+               if (n->tbl != ipv6_stub->nd_tbl && n->tbl != &arp_tbl)
+#else
+               if (n->tbl != &arp_tbl)
+#endif
+                       return NOTIFY_DONE;
+
+               m_neigh.dev = n->dev;
+               m_neigh.family = n->ops->family;
+               memcpy(&m_neigh.dst_ip, n->primary_key, n->tbl->key_len);
+
+               rcu_read_lock();
+               nhe = mlx5e_rep_neigh_entry_lookup(priv, &m_neigh);
+               rcu_read_unlock();
+               if (!nhe)
+                       return NOTIFY_DONE;
+
+               mlx5e_rep_queue_neigh_update_work(priv, nhe, n);
+               break;
+
+       case NETEVENT_DELAY_PROBE_TIME_UPDATE:
+               p = ptr;
+
+               /* We check that the device is present since we don't care
+                * about changes in the default table; we only care about
+                * changes to the per-device delay probe time parameter.
+                */
+#if IS_ENABLED(CONFIG_IPV6)
+               if (!p->dev || (p->tbl != ipv6_stub->nd_tbl && p->tbl != &arp_tbl))
+#else
+               if (!p->dev || p->tbl != &arp_tbl)
+#endif
+                       return NOTIFY_DONE;
+
+               rcu_read_lock();
+               list_for_each_entry_rcu(nhe, &neigh_update->neigh_list,
+                                       neigh_list) {
+                       if (p->dev == nhe->m_neigh.dev) {
+                               found = true;
+                               break;
+                       }
+               }
+               rcu_read_unlock();
+               if (!found)
+                       return NOTIFY_DONE;
+
+               neigh_update->min_interval = min_t(unsigned long,
+                                                  NEIGH_VAR(p, DELAY_PROBE_TIME),
+                                                  neigh_update->min_interval);
+               mlx5_fc_update_sampling_interval(priv->mdev,
+                                                neigh_update->min_interval);
+               break;
+       }
+       return NOTIFY_DONE;
+}
+
+static const struct rhashtable_params mlx5e_neigh_ht_params = {
+       .head_offset = offsetof(struct mlx5e_neigh_hash_entry, rhash_node),
+       .key_offset = offsetof(struct mlx5e_neigh_hash_entry, m_neigh),
+       .key_len = sizeof(struct mlx5e_neigh),
+       .automatic_shrinking = true,
+};
+
+int mlx5e_rep_neigh_init(struct mlx5e_rep_priv *rpriv)
+{
+       struct mlx5e_neigh_update_table *neigh_update = &rpriv->neigh_update;
+       int err;
+
+       err = rhashtable_init(&neigh_update->neigh_ht, &mlx5e_neigh_ht_params);
+       if (err)
+               return err;
+
+       INIT_LIST_HEAD(&neigh_update->neigh_list);
+       mutex_init(&neigh_update->encap_lock);
+       INIT_DELAYED_WORK(&neigh_update->neigh_stats_work,
+                         mlx5e_rep_neigh_stats_work);
+       mlx5e_rep_neigh_update_init_interval(rpriv);
+
+       rpriv->neigh_update.netevent_nb.notifier_call = mlx5e_rep_netevent_event;
+       err = register_netevent_notifier(&rpriv->neigh_update.netevent_nb);
+       if (err)
+               goto out_err;
+       return 0;
+
+out_err:
+       rhashtable_destroy(&neigh_update->neigh_ht);
+       return err;
+}
+
+void mlx5e_rep_neigh_cleanup(struct mlx5e_rep_priv *rpriv)
+{
+       struct mlx5e_neigh_update_table *neigh_update = &rpriv->neigh_update;
+       struct mlx5e_priv *priv = netdev_priv(rpriv->netdev);
+
+       unregister_netevent_notifier(&neigh_update->netevent_nb);
+
+       flush_workqueue(priv->wq); /* flush neigh update works */
+
+       cancel_delayed_work_sync(&rpriv->neigh_update.neigh_stats_work);
+
+       mutex_destroy(&neigh_update->encap_lock);
+       rhashtable_destroy(&neigh_update->neigh_ht);
+}
+
+static int mlx5e_rep_neigh_entry_insert(struct mlx5e_priv *priv,
+                                       struct mlx5e_neigh_hash_entry *nhe)
+{
+       struct mlx5e_rep_priv *rpriv = priv->ppriv;
+       int err;
+
+       err = rhashtable_insert_fast(&rpriv->neigh_update.neigh_ht,
+                                    &nhe->rhash_node,
+                                    mlx5e_neigh_ht_params);
+       if (err)
+               return err;
+
+       list_add_rcu(&nhe->neigh_list, &rpriv->neigh_update.neigh_list);
+
+       return err;
+}
+
+static void mlx5e_rep_neigh_entry_remove(struct mlx5e_neigh_hash_entry *nhe)
+{
+       struct mlx5e_rep_priv *rpriv = nhe->priv->ppriv;
+
+       mutex_lock(&rpriv->neigh_update.encap_lock);
+
+       list_del_rcu(&nhe->neigh_list);
+
+       rhashtable_remove_fast(&rpriv->neigh_update.neigh_ht,
+                              &nhe->rhash_node,
+                              mlx5e_neigh_ht_params);
+       mutex_unlock(&rpriv->neigh_update.encap_lock);
+}
+
+/* This function must only be called under the representor's encap_lock or
+ * inside rcu read lock section.
+ */
+struct mlx5e_neigh_hash_entry *
+mlx5e_rep_neigh_entry_lookup(struct mlx5e_priv *priv,
+                            struct mlx5e_neigh *m_neigh)
+{
+       struct mlx5e_rep_priv *rpriv = priv->ppriv;
+       struct mlx5e_neigh_update_table *neigh_update = &rpriv->neigh_update;
+       struct mlx5e_neigh_hash_entry *nhe;
+
+       nhe = rhashtable_lookup_fast(&neigh_update->neigh_ht, m_neigh,
+                                    mlx5e_neigh_ht_params);
+       return nhe && mlx5e_rep_neigh_entry_hold(nhe) ? nhe : NULL;
+}
+
+int mlx5e_rep_neigh_entry_create(struct mlx5e_priv *priv,
+                                struct mlx5e_encap_entry *e,
+                                struct mlx5e_neigh_hash_entry **nhe)
+{
+       int err;
+
+       *nhe = kzalloc(sizeof(**nhe), GFP_KERNEL);
+       if (!*nhe)
+               return -ENOMEM;
+
+       (*nhe)->priv = priv;
+       memcpy(&(*nhe)->m_neigh, &e->m_neigh, sizeof(e->m_neigh));
+       INIT_WORK(&(*nhe)->neigh_update_work, mlx5e_rep_neigh_update);
+       spin_lock_init(&(*nhe)->encap_list_lock);
+       INIT_LIST_HEAD(&(*nhe)->encap_list);
+       refcount_set(&(*nhe)->refcnt, 1);
+
+       err = mlx5e_rep_neigh_entry_insert(priv, *nhe);
+       if (err)
+               goto out_free;
+       return 0;
+
+out_free:
+       kfree(*nhe);
+       return err;
+}
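The neigh hash entries in this file are kept on an RCU list and walked with a take-reference-then-drop-RCU pattern, so that the stats work can sleep between entries while each entry stays alive. A minimal, self-contained sketch of that pattern follows; the struct and helper names (item, item_put, next_item, items_lock) are hypothetical and not part of this patch.

#include <linux/list.h>
#include <linux/rculist.h>
#include <linux/refcount.h>
#include <linux/spinlock.h>
#include <linux/slab.h>

struct item {
        struct list_head node;
        refcount_t refcnt;
        struct rcu_head rcu;
};

static DEFINE_SPINLOCK(items_lock);     /* protects list mutation */

static void item_put(struct item *it)
{
        if (refcount_dec_and_test(&it->refcnt)) {
                spin_lock(&items_lock);
                list_del_rcu(&it->node);
                spin_unlock(&items_lock);
                kfree_rcu(it, rcu);
        }
}

/* Return the next live element after @cur (or the first one when @cur is
 * NULL) with an elevated refcount, and drop the reference held on @cur.
 * No RCU read lock is held on return, so the caller may sleep.
 */
static struct item *next_item(struct list_head *head, struct item *cur)
{
        struct item *next;

        rcu_read_lock();
        next = cur ? list_next_or_null_rcu(head, &cur->node, struct item, node) :
                     list_first_or_null_rcu(head, struct item, node);
        while (next && !refcount_inc_not_zero(&next->refcnt))
                next = list_next_or_null_rcu(head, &next->node, struct item, node);
        rcu_read_unlock();

        if (cur)
                item_put(cur);

        return next;
}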
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/rep/neigh.h b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/neigh.h
new file mode 100644 (file)
index 0000000..32b2391
--- /dev/null
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/* Copyright (c) 2020 Mellanox Technologies. */
+
+#ifndef __MLX5_EN_REP_NEIGH__
+#define __MLX5_EN_REP_NEIGH__
+
+#include "en.h"
+#include "en_rep.h"
+
+#if IS_ENABLED(CONFIG_MLX5_CLS_ACT)
+
+int mlx5e_rep_neigh_init(struct mlx5e_rep_priv *rpriv);
+void mlx5e_rep_neigh_cleanup(struct mlx5e_rep_priv *rpriv);
+
+struct mlx5e_neigh_hash_entry *
+mlx5e_rep_neigh_entry_lookup(struct mlx5e_priv *priv,
+                            struct mlx5e_neigh *m_neigh);
+int mlx5e_rep_neigh_entry_create(struct mlx5e_priv *priv,
+                                struct mlx5e_encap_entry *e,
+                                struct mlx5e_neigh_hash_entry **nhe);
+void mlx5e_rep_neigh_entry_release(struct mlx5e_neigh_hash_entry *nhe);
+
+void mlx5e_rep_queue_neigh_stats_work(struct mlx5e_priv *priv);
+
+#else /* CONFIG_MLX5_CLS_ACT */
+
+static inline int
+mlx5e_rep_neigh_init(struct mlx5e_rep_priv *rpriv) { return 0; }
+static inline void
+mlx5e_rep_neigh_cleanup(struct mlx5e_rep_priv *rpriv) {}
+
+#endif /* CONFIG_MLX5_CLS_ACT */
+
+#endif /* __MLX5_EN_REP_NEIGH__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.c
new file mode 100644 (file)
index 0000000..c609a5e
--- /dev/null
@@ -0,0 +1,711 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/* Copyright (c) 2020 Mellanox Technologies. */
+
+#include <net/dst_metadata.h>
+#include <linux/netdevice.h>
+#include <linux/list.h>
+#include <linux/rculist.h>
+#include <linux/rtnetlink.h>
+#include <linux/workqueue.h>
+#include <linux/spinlock.h>
+#include "tc.h"
+#include "neigh.h"
+#include "en_rep.h"
+#include "eswitch.h"
+#include "esw/chains.h"
+#include "en/tc_ct.h"
+#include "en/mapping.h"
+#include "en/tc_tun.h"
+#include "lib/port_tun.h"
+
+struct mlx5e_rep_indr_block_priv {
+       struct net_device *netdev;
+       struct mlx5e_rep_priv *rpriv;
+
+       struct list_head list;
+};
+
+int mlx5e_rep_encap_entry_attach(struct mlx5e_priv *priv,
+                                struct mlx5e_encap_entry *e)
+{
+       struct mlx5e_rep_priv *rpriv = priv->ppriv;
+       struct mlx5_rep_uplink_priv *uplink_priv = &rpriv->uplink_priv;
+       struct mlx5_tun_entropy *tun_entropy = &uplink_priv->tun_entropy;
+       struct mlx5e_neigh_hash_entry *nhe;
+       int err;
+
+       err = mlx5_tun_entropy_refcount_inc(tun_entropy, e->reformat_type);
+       if (err)
+               return err;
+
+       mutex_lock(&rpriv->neigh_update.encap_lock);
+       nhe = mlx5e_rep_neigh_entry_lookup(priv, &e->m_neigh);
+       if (!nhe) {
+               err = mlx5e_rep_neigh_entry_create(priv, e, &nhe);
+               if (err) {
+                       mutex_unlock(&rpriv->neigh_update.encap_lock);
+                       mlx5_tun_entropy_refcount_dec(tun_entropy,
+                                                     e->reformat_type);
+                       return err;
+               }
+       }
+
+       e->nhe = nhe;
+       spin_lock(&nhe->encap_list_lock);
+       list_add_rcu(&e->encap_list, &nhe->encap_list);
+       spin_unlock(&nhe->encap_list_lock);
+
+       mutex_unlock(&rpriv->neigh_update.encap_lock);
+
+       return 0;
+}
+
+void mlx5e_rep_encap_entry_detach(struct mlx5e_priv *priv,
+                                 struct mlx5e_encap_entry *e)
+{
+       struct mlx5e_rep_priv *rpriv = priv->ppriv;
+       struct mlx5_rep_uplink_priv *uplink_priv = &rpriv->uplink_priv;
+       struct mlx5_tun_entropy *tun_entropy = &uplink_priv->tun_entropy;
+
+       if (!e->nhe)
+               return;
+
+       spin_lock(&e->nhe->encap_list_lock);
+       list_del_rcu(&e->encap_list);
+       spin_unlock(&e->nhe->encap_list_lock);
+
+       mlx5e_rep_neigh_entry_release(e->nhe);
+       e->nhe = NULL;
+       mlx5_tun_entropy_refcount_dec(tun_entropy, e->reformat_type);
+}
+
+void mlx5e_rep_update_flows(struct mlx5e_priv *priv,
+                           struct mlx5e_encap_entry *e,
+                           bool neigh_connected,
+                           unsigned char ha[ETH_ALEN])
+{
+       struct ethhdr *eth = (struct ethhdr *)e->encap_header;
+       struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
+       bool encap_connected;
+       LIST_HEAD(flow_list);
+
+       ASSERT_RTNL();
+
+       /* wait for encap to be fully initialized */
+       wait_for_completion(&e->res_ready);
+
+       mutex_lock(&esw->offloads.encap_tbl_lock);
+       encap_connected = !!(e->flags & MLX5_ENCAP_ENTRY_VALID);
+       if (e->compl_result < 0 || (encap_connected == neigh_connected &&
+                                   ether_addr_equal(e->h_dest, ha)))
+               goto unlock;
+
+       mlx5e_take_all_encap_flows(e, &flow_list);
+
+       if ((e->flags & MLX5_ENCAP_ENTRY_VALID) &&
+           (!neigh_connected || !ether_addr_equal(e->h_dest, ha)))
+               mlx5e_tc_encap_flows_del(priv, e, &flow_list);
+
+       if (neigh_connected && !(e->flags & MLX5_ENCAP_ENTRY_VALID)) {
+               ether_addr_copy(e->h_dest, ha);
+               ether_addr_copy(eth->h_dest, ha);
+               /* Update the encap source mac, in case the flows were
+                * deleted because the encap source mac changed.
+                */
+               ether_addr_copy(eth->h_source, e->route_dev->dev_addr);
+
+               mlx5e_tc_encap_flows_add(priv, e, &flow_list);
+       }
+unlock:
+       mutex_unlock(&esw->offloads.encap_tbl_lock);
+       mlx5e_put_encap_flow_list(priv, &flow_list);
+}
+
+static int
+mlx5e_rep_setup_tc_cls_flower(struct mlx5e_priv *priv,
+                             struct flow_cls_offload *cls_flower, int flags)
+{
+       switch (cls_flower->command) {
+       case FLOW_CLS_REPLACE:
+               return mlx5e_configure_flower(priv->netdev, priv, cls_flower,
+                                             flags);
+       case FLOW_CLS_DESTROY:
+               return mlx5e_delete_flower(priv->netdev, priv, cls_flower,
+                                          flags);
+       case FLOW_CLS_STATS:
+               return mlx5e_stats_flower(priv->netdev, priv, cls_flower,
+                                         flags);
+       default:
+               return -EOPNOTSUPP;
+       }
+}
+
+static
+int mlx5e_rep_setup_tc_cls_matchall(struct mlx5e_priv *priv,
+                                   struct tc_cls_matchall_offload *ma)
+{
+       switch (ma->command) {
+       case TC_CLSMATCHALL_REPLACE:
+               return mlx5e_tc_configure_matchall(priv, ma);
+       case TC_CLSMATCHALL_DESTROY:
+               return mlx5e_tc_delete_matchall(priv, ma);
+       case TC_CLSMATCHALL_STATS:
+               mlx5e_tc_stats_matchall(priv, ma);
+               return 0;
+       default:
+               return -EOPNOTSUPP;
+       }
+}
+
+static int mlx5e_rep_setup_tc_cb(enum tc_setup_type type, void *type_data,
+                                void *cb_priv)
+{
+       unsigned long flags = MLX5_TC_FLAG(INGRESS) | MLX5_TC_FLAG(ESW_OFFLOAD);
+       struct mlx5e_priv *priv = cb_priv;
+
+       switch (type) {
+       case TC_SETUP_CLSFLOWER:
+               return mlx5e_rep_setup_tc_cls_flower(priv, type_data, flags);
+       case TC_SETUP_CLSMATCHALL:
+               return mlx5e_rep_setup_tc_cls_matchall(priv, type_data);
+       default:
+               return -EOPNOTSUPP;
+       }
+}
+
+static int mlx5e_rep_setup_ft_cb(enum tc_setup_type type, void *type_data,
+                                void *cb_priv)
+{
+       struct flow_cls_offload tmp, *f = type_data;
+       struct mlx5e_priv *priv = cb_priv;
+       struct mlx5_eswitch *esw;
+       unsigned long flags;
+       int err;
+
+       flags = MLX5_TC_FLAG(INGRESS) |
+               MLX5_TC_FLAG(ESW_OFFLOAD) |
+               MLX5_TC_FLAG(FT_OFFLOAD);
+       esw = priv->mdev->priv.eswitch;
+
+       switch (type) {
+       case TC_SETUP_CLSFLOWER:
+               memcpy(&tmp, f, sizeof(*f));
+
+               if (!mlx5_esw_chains_prios_supported(esw))
+                       return -EOPNOTSUPP;
+
+               /* Re-use tc offload path by moving the ft flow to the
+                * reserved ft chain.
+                *
+                * FT offload can use prio range [0, INT_MAX], so we normalize
+                * it to range [1, mlx5_esw_chains_get_prio_range(esw)]
+                * as with tc, where prio 0 isn't supported.
+                *
+                * We only support chain 0 of FT offload.
+                */
+               if (tmp.common.prio >= mlx5_esw_chains_get_prio_range(esw))
+                       return -EOPNOTSUPP;
+               if (tmp.common.chain_index != 0)
+                       return -EOPNOTSUPP;
+
+               tmp.common.chain_index = mlx5_esw_chains_get_ft_chain(esw);
+               tmp.common.prio++;
+               err = mlx5e_rep_setup_tc_cls_flower(priv, &tmp, flags);
+               memcpy(&f->stats, &tmp.stats, sizeof(f->stats));
+               return err;
+       default:
+               return -EOPNOTSUPP;
+       }
+}
+
+static LIST_HEAD(mlx5e_rep_block_tc_cb_list);
+static LIST_HEAD(mlx5e_rep_block_ft_cb_list);
+int mlx5e_rep_setup_tc(struct net_device *dev, enum tc_setup_type type,
+                      void *type_data)
+{
+       struct mlx5e_priv *priv = netdev_priv(dev);
+       struct flow_block_offload *f = type_data;
+
+       f->unlocked_driver_cb = true;
+
+       switch (type) {
+       case TC_SETUP_BLOCK:
+               return flow_block_cb_setup_simple(type_data,
+                                                 &mlx5e_rep_block_tc_cb_list,
+                                                 mlx5e_rep_setup_tc_cb,
+                                                 priv, priv, true);
+       case TC_SETUP_FT:
+               return flow_block_cb_setup_simple(type_data,
+                                                 &mlx5e_rep_block_ft_cb_list,
+                                                 mlx5e_rep_setup_ft_cb,
+                                                 priv, priv, true);
+       default:
+               return -EOPNOTSUPP;
+       }
+}
+
+int mlx5e_rep_tc_init(struct mlx5e_rep_priv *rpriv)
+{
+       struct mlx5_rep_uplink_priv *uplink_priv = &rpriv->uplink_priv;
+       int err;
+
+       mutex_init(&uplink_priv->unready_flows_lock);
+       INIT_LIST_HEAD(&uplink_priv->unready_flows);
+
+       /* init shared tc flow table */
+       err = mlx5e_tc_esw_init(&uplink_priv->tc_ht);
+       return err;
+}
+
+void mlx5e_rep_tc_cleanup(struct mlx5e_rep_priv *rpriv)
+{
+       /* delete shared tc flow table */
+       mlx5e_tc_esw_cleanup(&rpriv->uplink_priv.tc_ht);
+       mutex_destroy(&rpriv->uplink_priv.unready_flows_lock);
+}
+
+void mlx5e_rep_tc_enable(struct mlx5e_priv *priv)
+{
+       struct mlx5e_rep_priv *rpriv = priv->ppriv;
+
+       INIT_WORK(&rpriv->uplink_priv.reoffload_flows_work,
+                 mlx5e_tc_reoffload_flows_work);
+}
+
+void mlx5e_rep_tc_disable(struct mlx5e_priv *priv)
+{
+       struct mlx5e_rep_priv *rpriv = priv->ppriv;
+
+       cancel_work_sync(&rpriv->uplink_priv.reoffload_flows_work);
+}
+
+int mlx5e_rep_tc_event_port_affinity(struct mlx5e_priv *priv)
+{
+       struct mlx5e_rep_priv *rpriv = priv->ppriv;
+
+       queue_work(priv->wq, &rpriv->uplink_priv.reoffload_flows_work);
+
+       return NOTIFY_OK;
+}
+
+static struct mlx5e_rep_indr_block_priv *
+mlx5e_rep_indr_block_priv_lookup(struct mlx5e_rep_priv *rpriv,
+                                struct net_device *netdev)
+{
+       struct mlx5e_rep_indr_block_priv *cb_priv;
+
+       /* All callback list access should be protected by RTNL. */
+       ASSERT_RTNL();
+
+       list_for_each_entry(cb_priv,
+                           &rpriv->uplink_priv.tc_indr_block_priv_list,
+                           list)
+               if (cb_priv->netdev == netdev)
+                       return cb_priv;
+
+       return NULL;
+}
+
+static void mlx5e_rep_indr_unregister_block(struct mlx5e_rep_priv *rpriv,
+                                           struct net_device *netdev);
+
+void mlx5e_rep_indr_clean_block_privs(struct mlx5e_rep_priv *rpriv)
+{
+       struct mlx5e_rep_indr_block_priv *cb_priv, *temp;
+       struct list_head *head = &rpriv->uplink_priv.tc_indr_block_priv_list;
+
+       list_for_each_entry_safe(cb_priv, temp, head, list) {
+               mlx5e_rep_indr_unregister_block(rpriv, cb_priv->netdev);
+               kfree(cb_priv);
+       }
+}
+
+static int
+mlx5e_rep_indr_offload(struct net_device *netdev,
+                      struct flow_cls_offload *flower,
+                      struct mlx5e_rep_indr_block_priv *indr_priv,
+                      unsigned long flags)
+{
+       struct mlx5e_priv *priv = netdev_priv(indr_priv->rpriv->netdev);
+       int err = 0;
+
+       switch (flower->command) {
+       case FLOW_CLS_REPLACE:
+               err = mlx5e_configure_flower(netdev, priv, flower, flags);
+               break;
+       case FLOW_CLS_DESTROY:
+               err = mlx5e_delete_flower(netdev, priv, flower, flags);
+               break;
+       case FLOW_CLS_STATS:
+               err = mlx5e_stats_flower(netdev, priv, flower, flags);
+               break;
+       default:
+               err = -EOPNOTSUPP;
+       }
+
+       return err;
+}
+
+static int mlx5e_rep_indr_setup_tc_cb(enum tc_setup_type type,
+                                     void *type_data, void *indr_priv)
+{
+       unsigned long flags = MLX5_TC_FLAG(EGRESS) | MLX5_TC_FLAG(ESW_OFFLOAD);
+       struct mlx5e_rep_indr_block_priv *priv = indr_priv;
+
+       switch (type) {
+       case TC_SETUP_CLSFLOWER:
+               return mlx5e_rep_indr_offload(priv->netdev, type_data, priv,
+                                             flags);
+       default:
+               return -EOPNOTSUPP;
+       }
+}
+
+static int mlx5e_rep_indr_setup_ft_cb(enum tc_setup_type type,
+                                     void *type_data, void *indr_priv)
+{
+       struct mlx5e_rep_indr_block_priv *priv = indr_priv;
+       struct flow_cls_offload *f = type_data;
+       struct flow_cls_offload tmp;
+       struct mlx5e_priv *mpriv;
+       struct mlx5_eswitch *esw;
+       unsigned long flags;
+       int err;
+
+       mpriv = netdev_priv(priv->rpriv->netdev);
+       esw = mpriv->mdev->priv.eswitch;
+
+       flags = MLX5_TC_FLAG(EGRESS) |
+               MLX5_TC_FLAG(ESW_OFFLOAD) |
+               MLX5_TC_FLAG(FT_OFFLOAD);
+
+       switch (type) {
+       case TC_SETUP_CLSFLOWER:
+               memcpy(&tmp, f, sizeof(*f));
+
+               /* Re-use tc offload path by moving the ft flow to the
+                * reserved ft chain.
+                *
+                * FT offload can use prio range [0, INT_MAX], so we normalize
+                * it to range [1, mlx5_esw_chains_get_prio_range(esw)]
+                * as with tc, where prio 0 isn't supported.
+                *
+                * We only support chain 0 of FT offload.
+                */
+               if (!mlx5_esw_chains_prios_supported(esw) ||
+                   tmp.common.prio >= mlx5_esw_chains_get_prio_range(esw) ||
+                   tmp.common.chain_index)
+                       return -EOPNOTSUPP;
+
+               tmp.common.chain_index = mlx5_esw_chains_get_ft_chain(esw);
+               tmp.common.prio++;
+               err = mlx5e_rep_indr_offload(priv->netdev, &tmp, priv, flags);
+               memcpy(&f->stats, &tmp.stats, sizeof(f->stats));
+               return err;
+       default:
+               return -EOPNOTSUPP;
+       }
+}
+
+static void mlx5e_rep_indr_block_unbind(void *cb_priv)
+{
+       struct mlx5e_rep_indr_block_priv *indr_priv = cb_priv;
+
+       list_del(&indr_priv->list);
+       kfree(indr_priv);
+}
+
+static LIST_HEAD(mlx5e_block_cb_list);
+
+static int
+mlx5e_rep_indr_setup_block(struct net_device *netdev,
+                          struct mlx5e_rep_priv *rpriv,
+                          struct flow_block_offload *f,
+                          flow_setup_cb_t *setup_cb)
+{
+       struct mlx5e_rep_indr_block_priv *indr_priv;
+       struct flow_block_cb *block_cb;
+
+       if (f->binder_type != FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS)
+               return -EOPNOTSUPP;
+
+       f->unlocked_driver_cb = true;
+       f->driver_block_list = &mlx5e_block_cb_list;
+
+       switch (f->command) {
+       case FLOW_BLOCK_BIND:
+               indr_priv = mlx5e_rep_indr_block_priv_lookup(rpriv, netdev);
+               if (indr_priv)
+                       return -EEXIST;
+
+               indr_priv = kmalloc(sizeof(*indr_priv), GFP_KERNEL);
+               if (!indr_priv)
+                       return -ENOMEM;
+
+               indr_priv->netdev = netdev;
+               indr_priv->rpriv = rpriv;
+               list_add(&indr_priv->list,
+                        &rpriv->uplink_priv.tc_indr_block_priv_list);
+
+               block_cb = flow_block_cb_alloc(setup_cb, indr_priv, indr_priv,
+                                              mlx5e_rep_indr_block_unbind);
+               if (IS_ERR(block_cb)) {
+                       list_del(&indr_priv->list);
+                       kfree(indr_priv);
+                       return PTR_ERR(block_cb);
+               }
+               flow_block_cb_add(block_cb, f);
+               list_add_tail(&block_cb->driver_list, &mlx5e_block_cb_list);
+
+               return 0;
+       case FLOW_BLOCK_UNBIND:
+               indr_priv = mlx5e_rep_indr_block_priv_lookup(rpriv, netdev);
+               if (!indr_priv)
+                       return -ENOENT;
+
+               block_cb = flow_block_cb_lookup(f->block, setup_cb, indr_priv);
+               if (!block_cb)
+                       return -ENOENT;
+
+               flow_block_cb_remove(block_cb, f);
+               list_del(&block_cb->driver_list);
+               return 0;
+       default:
+               return -EOPNOTSUPP;
+       }
+       return 0;
+}
+
+static
+int mlx5e_rep_indr_setup_cb(struct net_device *netdev, void *cb_priv,
+                           enum tc_setup_type type, void *type_data)
+{
+       switch (type) {
+       case TC_SETUP_BLOCK:
+               return mlx5e_rep_indr_setup_block(netdev, cb_priv, type_data,
+                                                 mlx5e_rep_indr_setup_tc_cb);
+       case TC_SETUP_FT:
+               return mlx5e_rep_indr_setup_block(netdev, cb_priv, type_data,
+                                                 mlx5e_rep_indr_setup_ft_cb);
+       default:
+               return -EOPNOTSUPP;
+       }
+}
+
+static int mlx5e_rep_indr_register_block(struct mlx5e_rep_priv *rpriv,
+                                        struct net_device *netdev)
+{
+       int err;
+
+       err = __flow_indr_block_cb_register(netdev, rpriv,
+                                           mlx5e_rep_indr_setup_cb,
+                                           rpriv);
+       if (err) {
+               struct mlx5e_priv *priv = netdev_priv(rpriv->netdev);
+
+               mlx5_core_err(priv->mdev, "Failed to register remote block notifier for %s err=%d\n",
+                             netdev_name(netdev), err);
+       }
+       return err;
+}
+
+static void mlx5e_rep_indr_unregister_block(struct mlx5e_rep_priv *rpriv,
+                                           struct net_device *netdev)
+{
+       __flow_indr_block_cb_unregister(netdev, mlx5e_rep_indr_setup_cb,
+                                       rpriv);
+}
+
+static int mlx5e_nic_rep_netdevice_event(struct notifier_block *nb,
+                                        unsigned long event, void *ptr)
+{
+       struct mlx5e_rep_priv *rpriv = container_of(nb, struct mlx5e_rep_priv,
+                                                    uplink_priv.netdevice_nb);
+       struct mlx5e_priv *priv = netdev_priv(rpriv->netdev);
+       struct net_device *netdev = netdev_notifier_info_to_dev(ptr);
+
+       if (!mlx5e_tc_tun_device_to_offload(priv, netdev) &&
+           !(is_vlan_dev(netdev) && vlan_dev_real_dev(netdev) == rpriv->netdev))
+               return NOTIFY_OK;
+
+       switch (event) {
+       case NETDEV_REGISTER:
+               mlx5e_rep_indr_register_block(rpriv, netdev);
+               break;
+       case NETDEV_UNREGISTER:
+               mlx5e_rep_indr_unregister_block(rpriv, netdev);
+               break;
+       }
+       return NOTIFY_OK;
+}
+
+int mlx5e_rep_tc_netdevice_event_register(struct mlx5e_rep_priv *rpriv)
+{
+       struct mlx5_rep_uplink_priv *uplink_priv = &rpriv->uplink_priv;
+       int err;
+
+       /* init indirect block notifications */
+       INIT_LIST_HEAD(&uplink_priv->tc_indr_block_priv_list);
+
+       uplink_priv->netdevice_nb.notifier_call = mlx5e_nic_rep_netdevice_event;
+       err = register_netdevice_notifier_dev_net(rpriv->netdev,
+                                                 &uplink_priv->netdevice_nb,
+                                                 &uplink_priv->netdevice_nn);
+       return err;
+}
+
+void mlx5e_rep_tc_netdevice_event_unregister(struct mlx5e_rep_priv *rpriv)
+{
+       struct mlx5_rep_uplink_priv *uplink_priv = &rpriv->uplink_priv;
+
+       /* clean indirect TC block notifications */
+       unregister_netdevice_notifier_dev_net(rpriv->netdev,
+                                             &uplink_priv->netdevice_nb,
+                                             &uplink_priv->netdevice_nn);
+}
+
+#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
+static bool mlx5e_restore_tunnel(struct mlx5e_priv *priv, struct sk_buff *skb,
+                                struct mlx5e_tc_update_priv *tc_priv,
+                                u32 tunnel_id)
+{
+       struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
+       struct tunnel_match_enc_opts enc_opts = {};
+       struct mlx5_rep_uplink_priv *uplink_priv;
+       struct mlx5e_rep_priv *uplink_rpriv;
+       struct metadata_dst *tun_dst;
+       struct tunnel_match_key key;
+       u32 tun_id, enc_opts_id;
+       struct net_device *dev;
+       int err;
+
+       enc_opts_id = tunnel_id & ENC_OPTS_BITS_MASK;
+       tun_id = tunnel_id >> ENC_OPTS_BITS;
+
+       if (!tun_id)
+               return true;
+
+       uplink_rpriv = mlx5_eswitch_get_uplink_priv(esw, REP_ETH);
+       uplink_priv = &uplink_rpriv->uplink_priv;
+
+       err = mapping_find(uplink_priv->tunnel_mapping, tun_id, &key);
+       if (err) {
+               WARN_ON_ONCE(true);
+               netdev_dbg(priv->netdev,
+                          "Couldn't find tunnel for tun_id: %d, err: %d\n",
+                          tun_id, err);
+               return false;
+       }
+
+       if (enc_opts_id) {
+               err = mapping_find(uplink_priv->tunnel_enc_opts_mapping,
+                                  enc_opts_id, &enc_opts);
+               if (err) {
+                       netdev_dbg(priv->netdev,
+                                  "Couldn't find tunnel (opts) for tun_id: %d, err: %d\n",
+                                  enc_opts_id, err);
+                       return false;
+               }
+       }
+
+       tun_dst = tun_rx_dst(enc_opts.key.len);
+       if (!tun_dst) {
+               WARN_ON_ONCE(true);
+               return false;
+       }
+
+       ip_tunnel_key_init(&tun_dst->u.tun_info.key,
+                          key.enc_ipv4.src, key.enc_ipv4.dst,
+                          key.enc_ip.tos, key.enc_ip.ttl,
+                          0, /* label */
+                          key.enc_tp.src, key.enc_tp.dst,
+                          key32_to_tunnel_id(key.enc_key_id.keyid),
+                          TUNNEL_KEY);
+
+       if (enc_opts.key.len)
+               ip_tunnel_info_opts_set(&tun_dst->u.tun_info,
+                                       enc_opts.key.data,
+                                       enc_opts.key.len,
+                                       enc_opts.key.dst_opt_type);
+
+       skb_dst_set(skb, (struct dst_entry *)tun_dst);
+       dev = dev_get_by_index(&init_net, key.filter_ifindex);
+       if (!dev) {
+               netdev_dbg(priv->netdev,
+                          "Couldn't find tunnel device with ifindex: %d\n",
+                          key.filter_ifindex);
+               return false;
+       }
+
+       /* Set tun_dev so we do dev_put() after datapath */
+       tc_priv->tun_dev = dev;
+
+       skb->dev = dev;
+
+       return true;
+}
+#endif /* CONFIG_NET_TC_SKB_EXT */
+
+bool mlx5e_rep_tc_update_skb(struct mlx5_cqe64 *cqe,
+                            struct sk_buff *skb,
+                            struct mlx5e_tc_update_priv *tc_priv)
+{
+#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
+       u32 chain = 0, reg_c0, reg_c1, tunnel_id, tuple_id;
+       struct mlx5_rep_uplink_priv *uplink_priv;
+       struct mlx5e_rep_priv *uplink_rpriv;
+       struct tc_skb_ext *tc_skb_ext;
+       struct mlx5_eswitch *esw;
+       struct mlx5e_priv *priv;
+       int tunnel_moffset;
+       int err;
+
+       reg_c0 = (be32_to_cpu(cqe->sop_drop_qpn) & MLX5E_TC_FLOW_ID_MASK);
+       if (reg_c0 == MLX5_FS_DEFAULT_FLOW_TAG)
+               reg_c0 = 0;
+       reg_c1 = be32_to_cpu(cqe->ft_metadata);
+
+       if (!reg_c0)
+               return true;
+
+       priv = netdev_priv(skb->dev);
+       esw = priv->mdev->priv.eswitch;
+
+       err = mlx5_eswitch_get_chain_for_tag(esw, reg_c0, &chain);
+       if (err) {
+               netdev_dbg(priv->netdev,
+                          "Couldn't find chain for chain tag: %d, err: %d\n",
+                          reg_c0, err);
+               return false;
+       }
+
+       if (chain) {
+               tc_skb_ext = skb_ext_add(skb, TC_SKB_EXT);
+               if (!tc_skb_ext) {
+                       WARN_ON(1);
+                       return false;
+               }
+
+               tc_skb_ext->chain = chain;
+
+               tuple_id = reg_c1 & TUPLE_ID_MAX;
+
+               uplink_rpriv = mlx5_eswitch_get_uplink_priv(esw, REP_ETH);
+               uplink_priv = &uplink_rpriv->uplink_priv;
+               if (!mlx5e_tc_ct_restore_flow(uplink_priv, skb, tuple_id))
+                       return false;
+       }
+
+       tunnel_moffset = mlx5e_tc_attr_to_reg_mappings[TUNNEL_TO_REG].moffset;
+       tunnel_id = reg_c1 >> (8 * tunnel_moffset);
+       return mlx5e_restore_tunnel(priv, skb, tc_priv, tunnel_id);
+#endif /* CONFIG_NET_TC_SKB_EXT */
+
+       return true;
+}
+
+void mlx5_rep_tc_post_napi_receive(struct mlx5e_tc_update_priv *tc_priv)
+{
+       if (tc_priv->tun_dev)
+               dev_put(tc_priv->tun_dev);
+}
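The two receive-path hooks at the end of this file are meant to be used together: mlx5e_rep_tc_update_skb() restores the tc chain and tunnel metadata encoded in the CQE before the skb is passed up the stack, and mlx5_rep_tc_post_napi_receive() drops the tunnel-device reference taken by mlx5e_restore_tunnel(). A hedged usage sketch follows; the surrounding function and its napi handling are illustrative, not taken from this patch.

static void example_rep_rx_cqe(struct mlx5_cqe64 *cqe, struct sk_buff *skb,
                               struct napi_struct *napi)
{
        struct mlx5e_tc_update_priv tc_priv = {};

        if (!mlx5e_rep_tc_update_skb(cqe, skb, &tc_priv)) {
                dev_kfree_skb_any(skb);         /* restore failed, drop the packet */
                return;
        }

        napi_gro_receive(napi, skb);

        /* Release the tunnel netdev reference once the skb has left the driver. */
        mlx5_rep_tc_post_napi_receive(&tc_priv);
}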
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.h b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.h
new file mode 100644 (file)
index 0000000..86f92ab
--- /dev/null
@@ -0,0 +1,81 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/* Copyright (c) 2020 Mellanox Technologies. */
+
+#ifndef __MLX5_EN_REP_TC_H__
+#define __MLX5_EN_REP_TC_H__
+
+#include <linux/skbuff.h>
+#include "en_tc.h"
+#include "en_rep.h"
+
+#if IS_ENABLED(CONFIG_MLX5_CLS_ACT)
+
+int mlx5e_rep_tc_init(struct mlx5e_rep_priv *rpriv);
+void mlx5e_rep_tc_cleanup(struct mlx5e_rep_priv *rpriv);
+
+int mlx5e_rep_tc_netdevice_event_register(struct mlx5e_rep_priv *rpriv);
+void mlx5e_rep_tc_netdevice_event_unregister(struct mlx5e_rep_priv *rpriv);
+
+void mlx5e_rep_tc_enable(struct mlx5e_priv *priv);
+void mlx5e_rep_tc_disable(struct mlx5e_priv *priv);
+
+int mlx5e_rep_tc_event_port_affinity(struct mlx5e_priv *priv);
+
+void mlx5e_rep_update_flows(struct mlx5e_priv *priv,
+                           struct mlx5e_encap_entry *e,
+                           bool neigh_connected,
+                           unsigned char ha[ETH_ALEN]);
+
+int mlx5e_rep_encap_entry_attach(struct mlx5e_priv *priv,
+                                struct mlx5e_encap_entry *e);
+void mlx5e_rep_encap_entry_detach(struct mlx5e_priv *priv,
+                                 struct mlx5e_encap_entry *e);
+
+int mlx5e_rep_setup_tc(struct net_device *dev, enum tc_setup_type type,
+                      void *type_data);
+void mlx5e_rep_indr_clean_block_privs(struct mlx5e_rep_priv *rpriv);
+
+bool mlx5e_rep_tc_update_skb(struct mlx5_cqe64 *cqe,
+                            struct sk_buff *skb,
+                            struct mlx5e_tc_update_priv *tc_priv);
+void mlx5_rep_tc_post_napi_receive(struct mlx5e_tc_update_priv *tc_priv);
+
+#else /* CONFIG_MLX5_CLS_ACT */
+
+struct mlx5e_rep_priv;
+static inline int
+mlx5e_rep_tc_init(struct mlx5e_rep_priv *rpriv) { return 0; }
+static inline void
+mlx5e_rep_tc_cleanup(struct mlx5e_rep_priv *rpriv) {}
+
+static inline int
+mlx5e_rep_tc_netdevice_event_register(struct mlx5e_rep_priv *rpriv) { return 0; }
+static inline void
+mlx5e_rep_tc_netdevice_event_unregister(struct mlx5e_rep_priv *rpriv) {}
+
+static inline void
+mlx5e_rep_tc_enable(struct mlx5e_priv *priv) {}
+static inline void
+mlx5e_rep_tc_disable(struct mlx5e_priv *priv) {}
+
+static inline int
+mlx5e_rep_tc_event_port_affinity(struct mlx5e_priv *priv) { return NOTIFY_DONE; }
+
+static inline int
+mlx5e_rep_setup_tc(struct net_device *dev, enum tc_setup_type type,
+                  void *type_data) { return -EOPNOTSUPP; }
+
+static inline void
+mlx5e_rep_indr_clean_block_privs(struct mlx5e_rep_priv *rpriv) {}
+
+struct mlx5e_tc_update_priv;
+static inline bool
+mlx5e_rep_tc_update_skb(struct mlx5_cqe64 *cqe,
+                       struct sk_buff *skb,
+                       struct mlx5e_tc_update_priv *tc_priv) { return true; }
+static inline void
+mlx5_rep_tc_post_napi_receive(struct mlx5e_tc_update_priv *tc_priv) {}
+
+#endif /* CONFIG_MLX5_CLS_ACT */
+
+#endif /* __MLX5_EN_REP_TC_H__ */
index b45c3f46570bdc5b38f4d177afadd17c1b1c92d2..e99382f588077741197d2dfd258db61553d5b108 100644 (file)
@@ -4,8 +4,11 @@
 #include <net/vxlan.h>
 #include <net/gre.h>
 #include <net/geneve.h>
+#include <net/bareudp.h>
 #include "en/tc_tun.h"
 #include "en_tc.h"
+#include "rep/tc.h"
+#include "rep/neigh.h"
 
 struct mlx5e_tc_tunnel *mlx5e_get_tc_tun(struct net_device *tunnel_dev)
 {
@@ -16,6 +19,8 @@ struct mlx5e_tc_tunnel *mlx5e_get_tc_tun(struct net_device *tunnel_dev)
        else if (netif_is_gretap(tunnel_dev) ||
                 netif_is_ip6gretap(tunnel_dev))
                return &gre_tunnel;
+       else if (netif_is_bareudp(tunnel_dev))
+               return &mplsoudp_tunnel;
        else
                return NULL;
 }
@@ -96,9 +101,8 @@ static int mlx5e_route_lookup_ipv4(struct mlx5e_priv *priv,
        }
 
        rt = ip_route_output_key(dev_net(mirred_dev), fl4);
-       ret = PTR_ERR_OR_ZERO(rt);
-       if (ret)
-               return ret;
+       if (IS_ERR(rt))
+               return PTR_ERR(rt);
 
        if (mlx5_lag_is_multipath(mdev) && rt->rt_gw_family != AF_INET) {
                ip_rt_put(rt);
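The error-handling change in the hunk above leans on the kernel's error-pointer convention: ip_route_output_key() returns either a valid rtable or an ERR_PTR()-encoded errno, never NULL, so the explicit IS_ERR()/PTR_ERR() pair reads more directly than PTR_ERR_OR_ZERO(). A short generic sketch, with a hypothetical caller name:

#include <linux/err.h>
#include <net/route.h>

static int example_route_lookup(struct net *net, struct flowi4 *fl4,
                                struct rtable **out_rt)
{
        struct rtable *rt = ip_route_output_key(net, fl4);

        if (IS_ERR(rt))
                return PTR_ERR(rt);     /* negative errno encoded in the pointer */

        *out_rt = rt;                   /* caller must ip_rt_put(rt) when done */
        return 0;
}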
index 1630f0ec3ad7762efc68eb8ec22872488bac96c8..704359df60951e3a174dfedc14fa356ec639426f 100644 (file)
@@ -16,6 +16,7 @@ enum {
        MLX5E_TC_TUNNEL_TYPE_VXLAN,
        MLX5E_TC_TUNNEL_TYPE_GENEVE,
        MLX5E_TC_TUNNEL_TYPE_GRETAP,
+       MLX5E_TC_TUNNEL_TYPE_MPLSOUDP,
 };
 
 struct mlx5e_tc_tunnel {
@@ -46,6 +47,7 @@ struct mlx5e_tc_tunnel {
 extern struct mlx5e_tc_tunnel vxlan_tunnel;
 extern struct mlx5e_tc_tunnel geneve_tunnel;
 extern struct mlx5e_tc_tunnel gre_tunnel;
+extern struct mlx5e_tc_tunnel mplsoudp_tunnel;
 
 struct mlx5e_tc_tunnel *mlx5e_get_tc_tun(struct net_device *tunnel_dev);
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_mplsoudp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_mplsoudp.c
new file mode 100644 (file)
index 0000000..98ee62e
--- /dev/null
@@ -0,0 +1,137 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/* Copyright (c) 2018 Mellanox Technologies. */
+
+#include <net/bareudp.h>
+#include <net/mpls.h>
+#include "en/tc_tun.h"
+
+static bool can_offload(struct mlx5e_priv *priv)
+{
+       return MLX5_CAP_ESW_FLOWTABLE_FDB(priv->mdev, reformat_l3_tunnel_to_l2);
+}
+
+static int calc_hlen(struct mlx5e_encap_entry *e)
+{
+       return sizeof(struct udphdr) + MPLS_HLEN;
+}
+
+static int init_encap_attr(struct net_device *tunnel_dev,
+                          struct mlx5e_priv *priv,
+                          struct mlx5e_encap_entry *e,
+                          struct netlink_ext_ack *extack)
+{
+       e->tunnel = &mplsoudp_tunnel;
+       e->reformat_type = MLX5_REFORMAT_TYPE_L2_TO_L3_TUNNEL;
+       return 0;
+}
+
+static inline __be32 mpls_label_id_field(__be32 label, u8 tos, u8 ttl)
+{
+       u32 res;
+
+        * mpls label is 32 bits long and constructed as follows:
+        * 20 bits label
+        * 3 bits tos
+        * 1 bit bottom of stack. Since we support only one label, this bit is
+        *       always set.
+        * 8 bits TTL
+        */
+       res = be32_to_cpu(label) << 12 | 1 << 8 | (tos & 7) <<  9 | ttl;
+       return cpu_to_be32(res);
+}
+
+static int generate_ip_tun_hdr(char buf[],
+                              __u8 *ip_proto,
+                              struct mlx5e_encap_entry *r)
+{
+       const struct ip_tunnel_key *tun_key = &r->tun_info->key;
+       __be32 tun_id = tunnel_id_to_key32(tun_key->tun_id);
+       struct udphdr *udp = (struct udphdr *)(buf);
+       struct mpls_shim_hdr *mpls;
+
+       mpls = (struct mpls_shim_hdr *)(udp + 1);
+       *ip_proto = IPPROTO_UDP;
+
+       udp->dest = tun_key->tp_dst;
+       mpls->label_stack_entry = mpls_label_id_field(tun_id, tun_key->tos, tun_key->ttl);
+
+       return 0;
+}
+
+static int parse_udp_ports(struct mlx5e_priv *priv,
+                          struct mlx5_flow_spec *spec,
+                          struct flow_cls_offload *f,
+                          void *headers_c,
+                          void *headers_v)
+{
+       return mlx5e_tc_tun_parse_udp_ports(priv, spec, f, headers_c, headers_v);
+}
+
+static int parse_tunnel(struct mlx5e_priv *priv,
+                       struct mlx5_flow_spec *spec,
+                       struct flow_cls_offload *f,
+                       void *headers_c,
+                       void *headers_v)
+{
+       struct flow_rule *rule = flow_cls_offload_flow_rule(f);
+       struct flow_match_enc_keyid enc_keyid;
+       struct flow_match_mpls match;
+       void *misc2_c;
+       void *misc2_v;
+
+       misc2_c = MLX5_ADDR_OF(fte_match_param, spec->match_criteria,
+                              misc_parameters_2);
+       misc2_v = MLX5_ADDR_OF(fte_match_param, spec->match_value,
+                              misc_parameters_2);
+
+       if (!flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_MPLS))
+               return 0;
+
+       if (!flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_ENC_KEYID))
+               return 0;
+
+       flow_rule_match_enc_keyid(rule, &enc_keyid);
+
+       if (!enc_keyid.mask->keyid)
+               return 0;
+
+       if (!(MLX5_CAP_GEN(priv->mdev, flex_parser_protocols) &
+             MLX5_FLEX_PROTO_CW_MPLS_UDP))
+               return -EOPNOTSUPP;
+
+       flow_rule_match_mpls(rule, &match);
+
+       MLX5_SET(fte_match_set_misc2, misc2_c,
+                outer_first_mpls_over_udp.mpls_label, match.mask->mpls_label);
+       MLX5_SET(fte_match_set_misc2, misc2_v,
+                outer_first_mpls_over_udp.mpls_label, match.key->mpls_label);
+
+       MLX5_SET(fte_match_set_misc2, misc2_c,
+                outer_first_mpls_over_udp.mpls_exp, match.mask->mpls_tc);
+       MLX5_SET(fte_match_set_misc2, misc2_v,
+                outer_first_mpls_over_udp.mpls_exp, match.key->mpls_tc);
+
+       MLX5_SET(fte_match_set_misc2, misc2_c,
+                outer_first_mpls_over_udp.mpls_s_bos, match.mask->mpls_bos);
+       MLX5_SET(fte_match_set_misc2, misc2_v,
+                outer_first_mpls_over_udp.mpls_s_bos, match.key->mpls_bos);
+
+       MLX5_SET(fte_match_set_misc2, misc2_c,
+                outer_first_mpls_over_udp.mpls_ttl, match.mask->mpls_ttl);
+       MLX5_SET(fte_match_set_misc2, misc2_v,
+                outer_first_mpls_over_udp.mpls_ttl, match.key->mpls_ttl);
+       spec->match_criteria_enable |= MLX5_MATCH_MISC_PARAMETERS_2;
+
+       return 0;
+}
+
+struct mlx5e_tc_tunnel mplsoudp_tunnel = {
+       .tunnel_type          = MLX5E_TC_TUNNEL_TYPE_MPLSOUDP,
+       .match_level          = MLX5_MATCH_L4,
+       .can_offload          = can_offload,
+       .calc_hlen            = calc_hlen,
+       .init_encap_attr      = init_encap_attr,
+       .generate_ip_tun_hdr  = generate_ip_tun_hdr,
+       .parse_udp_ports      = parse_udp_ports,
+       .parse_tunnel         = parse_tunnel,
+};
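To make the bit layout produced by mpls_label_id_field() explicit: the 32-bit MPLS shim word carries the label in bits 31:12, the traffic class in bits 11:9, the bottom-of-stack flag in bit 8 and the TTL in bits 7:0. A small decoding sketch, for illustration only; the helper name is hypothetical and not part of the patch.

static void example_mpls_shim_decode(__be32 shim, u32 *label, u8 *tc,
                                     bool *bos, u8 *ttl)
{
        u32 v = be32_to_cpu(shim);

        *label = v >> 12;               /* 20-bit label */
        *tc    = (v >> 9) & 0x7;        /* 3-bit traffic class (the "tos" above) */
        *bos   = (v >> 8) & 0x1;        /* bottom of stack, always set here */
        *ttl   = v & 0xff;              /* 8-bit TTL */
}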
index 42202d19245cfdced3640ac3039b04b89f3be220..3bea1d4be53b7f832824b80ff2c2edc3078deef6 100644 (file)
@@ -31,7 +31,7 @@
  */
 
 #include <linux/bpf_trace.h>
-#include <net/xdp_sock.h>
+#include <net/xdp_sock_drv.h>
 #include "en/xdp.h"
 #include "en/params.h"
 
@@ -71,7 +71,7 @@ mlx5e_xmit_xdp_buff(struct mlx5e_xdpsq *sq, struct mlx5e_rq *rq,
        xdptxd.data = xdpf->data;
        xdptxd.len  = xdpf->len;
 
-       if (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY) {
+       if (xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL) {
                /* The xdp_buff was in the UMEM and was copied into a newly
                 * allocated page. The UMEM page was returned via the ZCA, and
                 * this new page has to be mapped at this point and has to be
@@ -119,50 +119,33 @@ mlx5e_xmit_xdp_buff(struct mlx5e_xdpsq *sq, struct mlx5e_rq *rq,
 
 /* returns true if packet was consumed by xdp */
 bool mlx5e_xdp_handle(struct mlx5e_rq *rq, struct mlx5e_dma_info *di,
-                     void *va, u16 *rx_headroom, u32 *len, bool xsk)
+                     u32 *len, struct xdp_buff *xdp)
 {
        struct bpf_prog *prog = READ_ONCE(rq->xdp_prog);
-       struct xdp_umem *umem = rq->umem;
-       struct xdp_buff xdp;
        u32 act;
        int err;
 
        if (!prog)
                return false;
 
-       xdp.data = va + *rx_headroom;
-       xdp_set_data_meta_invalid(&xdp);
-       xdp.data_end = xdp.data + *len;
-       xdp.data_hard_start = va;
-       if (xsk)
-               xdp.handle = di->xsk.handle;
-       xdp.rxq = &rq->xdp_rxq;
-       xdp.frame_sz = rq->buff.frame0_sz;
-
-       act = bpf_prog_run_xdp(prog, &xdp);
-       if (xsk) {
-               u64 off = xdp.data - xdp.data_hard_start;
-
-               xdp.handle = xsk_umem_adjust_offset(umem, xdp.handle, off);
-       }
+       act = bpf_prog_run_xdp(prog, xdp);
        switch (act) {
        case XDP_PASS:
-               *rx_headroom = xdp.data - xdp.data_hard_start;
-               *len = xdp.data_end - xdp.data;
+               *len = xdp->data_end - xdp->data;
                return false;
        case XDP_TX:
-               if (unlikely(!mlx5e_xmit_xdp_buff(rq->xdpsq, rq, di, &xdp)))
+               if (unlikely(!mlx5e_xmit_xdp_buff(rq->xdpsq, rq, di, xdp)))
                        goto xdp_abort;
                __set_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags); /* non-atomic */
                return true;
        case XDP_REDIRECT:
                /* When XDP enabled then page-refcnt==1 here */
-               err = xdp_do_redirect(rq->netdev, &xdp, prog);
+               err = xdp_do_redirect(rq->netdev, xdp, prog);
                if (unlikely(err))
                        goto xdp_abort;
                __set_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags);
                __set_bit(MLX5E_RQ_FLAG_XDP_REDIRECT, rq->flags);
-               if (!xsk)
+               if (xdp->rxq->mem.type != MEM_TYPE_XSK_BUFF_POOL)
                        mlx5e_page_dma_unmap(rq, di);
                rq->stats->xdp_redirect++;
                return true;
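With the new mlx5e_xdp_handle() signature, the caller builds the struct xdp_buff before the XDP program runs. A minimal sketch of the non-XSK fill step under that assumption; the helper name is hypothetical, and fields are set one by one since this kernel does not yet have the later xdp_init_buff()/xdp_prepare_buff() helpers.

static void example_fill_xdp_buff(struct mlx5e_rq *rq, void *va, u16 headroom,
                                  u32 len, struct xdp_buff *xdp)
{
        xdp->data_hard_start = va;
        xdp->data = va + headroom;
        xdp->data_end = xdp->data + len;
        xdp_set_data_meta_invalid(xdp);
        xdp->rxq = &rq->xdp_rxq;
        xdp->frame_sz = rq->buff.frame0_sz;     /* needed for xdp_frame conversion */
}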
index be64eb68f4e5c1e39343f09dc6a20cd88e0ebdae..ca48c293151be53824e582ffa840aaa3035b5d83 100644 (file)
@@ -61,7 +61,7 @@
 struct mlx5e_xsk_param;
 int mlx5e_xdp_max_mtu(struct mlx5e_params *params, struct mlx5e_xsk_param *xsk);
 bool mlx5e_xdp_handle(struct mlx5e_rq *rq, struct mlx5e_dma_info *di,
-                     void *va, u16 *rx_headroom, u32 *len, bool xsk);
+                     u32 *len, struct xdp_buff *xdp);
 void mlx5e_xdp_mpwqe_complete(struct mlx5e_xdpsq *sq);
 bool mlx5e_poll_xdpsq_cq(struct mlx5e_cq *cq);
 void mlx5e_free_xdpsq_descs(struct mlx5e_xdpsq *sq);
index 62fc8a128a8dfd4576037a9862f892794d9da3ec..a33a1f762c70db7d2722984b159e436410fa3e45 100644 (file)
@@ -3,71 +3,10 @@
 
 #include "rx.h"
 #include "en/xdp.h"
-#include <net/xdp_sock.h>
+#include <net/xdp_sock_drv.h>
 
 /* RX data path */
 
-bool mlx5e_xsk_pages_enough_umem(struct mlx5e_rq *rq, int count)
-{
-       /* Check in advance that we have enough frames, instead of allocating
-        * one-by-one, failing and moving frames to the Reuse Ring.
-        */
-       return xsk_umem_has_addrs_rq(rq->umem, count);
-}
-
-int mlx5e_xsk_page_alloc_umem(struct mlx5e_rq *rq,
-                             struct mlx5e_dma_info *dma_info)
-{
-       struct xdp_umem *umem = rq->umem;
-       u64 handle;
-
-       if (!xsk_umem_peek_addr_rq(umem, &handle))
-               return -ENOMEM;
-
-       dma_info->xsk.handle = xsk_umem_adjust_offset(umem, handle,
-                                                     rq->buff.umem_headroom);
-       dma_info->xsk.data = xdp_umem_get_data(umem, dma_info->xsk.handle);
-
-       /* No need to add headroom to the DMA address. In striding RQ case, we
-        * just provide pages for UMR, and headroom is counted at the setup
-        * stage when creating a WQE. In non-striding RQ case, headroom is
-        * accounted in mlx5e_alloc_rx_wqe.
-        */
-       dma_info->addr = xdp_umem_get_dma(umem, handle);
-
-       xsk_umem_release_addr_rq(umem);
-
-       dma_sync_single_for_device(rq->pdev, dma_info->addr, PAGE_SIZE,
-                                  DMA_BIDIRECTIONAL);
-
-       return 0;
-}
-
-static inline void mlx5e_xsk_recycle_frame(struct mlx5e_rq *rq, u64 handle)
-{
-       xsk_umem_fq_reuse(rq->umem, handle & rq->umem->chunk_mask);
-}
-
-/* XSKRQ uses pages from UMEM, they must not be released. They are returned to
- * the userspace if possible, and if not, this function is called to reuse them
- * in the driver.
- */
-void mlx5e_xsk_page_release(struct mlx5e_rq *rq,
-                           struct mlx5e_dma_info *dma_info)
-{
-       mlx5e_xsk_recycle_frame(rq, dma_info->xsk.handle);
-}
-
-/* Return a frame back to the hardware to fill in again. It is used by XDP when
- * the XDP program returns XDP_TX or XDP_REDIRECT not to an XSKMAP.
- */
-void mlx5e_xsk_zca_free(struct zero_copy_allocator *zca, unsigned long handle)
-{
-       struct mlx5e_rq *rq = container_of(zca, struct mlx5e_rq, zca);
-
-       mlx5e_xsk_recycle_frame(rq, handle);
-}
-
 static struct sk_buff *mlx5e_xsk_construct_skb(struct mlx5e_rq *rq, void *data,
                                               u32 cqe_bcnt)
 {
@@ -90,11 +29,8 @@ struct sk_buff *mlx5e_xsk_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq,
                                                    u32 head_offset,
                                                    u32 page_idx)
 {
-       struct mlx5e_dma_info *di = &wi->umr.dma_info[page_idx];
-       u16 rx_headroom = rq->buff.headroom - rq->buff.umem_headroom;
+       struct xdp_buff *xdp = wi->umr.dma_info[page_idx].xsk;
        u32 cqe_bcnt32 = cqe_bcnt;
-       void *va, *data;
-       u32 frag_size;
        bool consumed;
 
        /* Check packet size. Note LRO doesn't use linear SKB */
@@ -103,22 +39,20 @@ struct sk_buff *mlx5e_xsk_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq,
                return NULL;
        }
 
-       /* head_offset is not used in this function, because di->xsk.data and
-        * di->addr point directly to the necessary place. Furthermore, in the
-        * current implementation, UMR pages are mapped to XSK frames, so
+       /* head_offset is not used in this function, because xdp->data and the
+        * DMA address point directly to the necessary place. Furthermore, in
+        * the current implementation, UMR pages are mapped to XSK frames, so
         * head_offset should always be 0.
         */
        WARN_ON_ONCE(head_offset);
 
-       va             = di->xsk.data;
-       data           = va + rx_headroom;
-       frag_size      = rq->buff.headroom + cqe_bcnt32;
-
-       dma_sync_single_for_cpu(rq->pdev, di->addr, frag_size, DMA_BIDIRECTIONAL);
-       prefetch(data);
+       xdp->data_end = xdp->data + cqe_bcnt32;
+       xdp_set_data_meta_invalid(xdp);
+       xsk_buff_dma_sync_for_cpu(xdp);
+       prefetch(xdp->data);
 
        rcu_read_lock();
-       consumed = mlx5e_xdp_handle(rq, di, va, &rx_headroom, &cqe_bcnt32, true);
+       consumed = mlx5e_xdp_handle(rq, NULL, &cqe_bcnt32, xdp);
        rcu_read_unlock();
 
        /* Possible flows:
@@ -145,7 +79,7 @@ struct sk_buff *mlx5e_xsk_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq,
        /* XDP_PASS: copy the data from the UMEM to a new SKB and reuse the
         * frame. On SKB allocation failure, NULL is returned.
         */
-       return mlx5e_xsk_construct_skb(rq, data, cqe_bcnt32);
+       return mlx5e_xsk_construct_skb(rq, xdp->data, cqe_bcnt32);
 }
 
 struct sk_buff *mlx5e_xsk_skb_from_cqe_linear(struct mlx5e_rq *rq,
@@ -153,25 +87,20 @@ struct sk_buff *mlx5e_xsk_skb_from_cqe_linear(struct mlx5e_rq *rq,
                                              struct mlx5e_wqe_frag_info *wi,
                                              u32 cqe_bcnt)
 {
-       struct mlx5e_dma_info *di = wi->di;
-       u16 rx_headroom = rq->buff.headroom - rq->buff.umem_headroom;
-       void *va, *data;
+       struct xdp_buff *xdp = wi->di->xsk;
        bool consumed;
-       u32 frag_size;
 
-       /* wi->offset is not used in this function, because di->xsk.data and
-        * di->addr point directly to the necessary place. Furthermore, in the
-        * current implementation, one page = one packet = one frame, so
+       /* wi->offset is not used in this function, because xdp->data and the
+        * DMA address point directly to the necessary place. Furthermore, the
+        * XSK allocator allocates frames per packet, instead of pages, so
         * wi->offset should always be 0.
         */
        WARN_ON_ONCE(wi->offset);
 
-       va             = di->xsk.data;
-       data           = va + rx_headroom;
-       frag_size      = rq->buff.headroom + cqe_bcnt;
-
-       dma_sync_single_for_cpu(rq->pdev, di->addr, frag_size, DMA_BIDIRECTIONAL);
-       prefetch(data);
+       xdp->data_end = xdp->data + cqe_bcnt;
+       xdp_set_data_meta_invalid(xdp);
+       xsk_buff_dma_sync_for_cpu(xdp);
+       prefetch(xdp->data);
 
        if (unlikely(get_cqe_opcode(cqe) != MLX5_CQE_RESP_SEND)) {
                rq->stats->wqe_err++;
@@ -179,7 +108,7 @@ struct sk_buff *mlx5e_xsk_skb_from_cqe_linear(struct mlx5e_rq *rq,
        }
 
        rcu_read_lock();
-       consumed = mlx5e_xdp_handle(rq, di, va, &rx_headroom, &cqe_bcnt, true);
+       consumed = mlx5e_xdp_handle(rq, NULL, &cqe_bcnt, xdp);
        rcu_read_unlock();
 
        if (likely(consumed))
@@ -189,5 +118,5 @@ struct sk_buff *mlx5e_xsk_skb_from_cqe_linear(struct mlx5e_rq *rq,
         * will be handled by mlx5e_put_rx_frag.
         * On SKB allocation failure, NULL is returned.
         */
-       return mlx5e_xsk_construct_skb(rq, data, cqe_bcnt);
+       return mlx5e_xsk_construct_skb(rq, xdp->data, cqe_bcnt);
 }
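With the conversion above, the per-completion work in both XSK receive handlers reduces to setting xdp->data_end and calling the per-frame sync helper, since the buffer pool already placed xdp->data at the configured headroom when the frame was allocated. The following stand-alone sketch (user-space C, illustrative field names, not the driver's structs) contrasts the two shapes:

#include <stdint.h>
#include <stdio.h>

struct frame {
	uint8_t *hard_start;  /* buffer base                     */
	uint8_t *data;        /* packet start (headroom applied) */
	uint8_t *data_end;    /* packet end                      */
};

/* Old shape: the RX handler applied the headroom itself and synced
 * headroom + byte count from the buffer base on every completion.
 */
static void rx_complete_old(struct frame *f, uint8_t *va,
			    uint32_t headroom, uint32_t bcnt)
{
	f->hard_start = va;
	f->data       = va + headroom;
	f->data_end   = f->data + bcnt;
	/* a dma_sync of (headroom + bcnt) bytes from va would go here */
}

/* New shape: the allocator already set hard_start/data when the frame
 * was handed out, so only the packet length is per-completion state.
 */
static void rx_complete_new(struct frame *f, uint32_t bcnt)
{
	f->data_end = f->data + bcnt;
	/* a per-frame sync helper would go here */
}

int main(void)
{
	uint8_t buf[2048];
	struct frame f = { buf, buf + 256, NULL };

	rx_complete_new(&f, 1500);
	printf("len=%td headroom=%td\n", f.data_end - f.data,
	       f.data - f.hard_start);

	rx_complete_old(&f, buf, 256, 64);
	printf("len=%td headroom=%td\n", f.data_end - f.data,
	       f.data - f.hard_start);
	return 0;
}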
index cab0e93497ae6d2cf53197e86f5ec3ee3df8607d..d147b2f13b54b8ea69fc1450df4d51bea2f697f8 100644 (file)
@@ -5,16 +5,10 @@
 #define __MLX5_EN_XSK_RX_H__
 
 #include "en.h"
-#include <net/xdp_sock.h>
+#include <net/xdp_sock_drv.h>
 
 /* RX data path */
 
-bool mlx5e_xsk_pages_enough_umem(struct mlx5e_rq *rq, int count);
-int mlx5e_xsk_page_alloc_umem(struct mlx5e_rq *rq,
-                             struct mlx5e_dma_info *dma_info);
-void mlx5e_xsk_page_release(struct mlx5e_rq *rq,
-                           struct mlx5e_dma_info *dma_info);
-void mlx5e_xsk_zca_free(struct zero_copy_allocator *zca, unsigned long handle);
 struct sk_buff *mlx5e_xsk_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq,
                                                    struct mlx5e_mpw_info *wi,
                                                    u16 cqe_bcnt,
@@ -25,6 +19,23 @@ struct sk_buff *mlx5e_xsk_skb_from_cqe_linear(struct mlx5e_rq *rq,
                                              struct mlx5e_wqe_frag_info *wi,
                                              u32 cqe_bcnt);
 
+static inline int mlx5e_xsk_page_alloc_umem(struct mlx5e_rq *rq,
+                                           struct mlx5e_dma_info *dma_info)
+{
+       dma_info->xsk = xsk_buff_alloc(rq->umem);
+       if (!dma_info->xsk)
+               return -ENOMEM;
+
+       /* Store the DMA address without headroom. In striding RQ case, we just
+        * provide pages for UMR, and headroom is counted at the setup stage
+        * when creating a WQE. In non-striding RQ case, headroom is accounted
+        * in mlx5e_alloc_rx_wqe.
+        */
+       dma_info->addr = xsk_buff_xdp_get_frame_dma(dma_info->xsk);
+
+       return 0;
+}
+
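The new inline helper above fails the refill with -ENOMEM when the pool has no frame to hand out, and records the frame-level DMA address so headroom can be accounted later at WQE setup. A toy, user-space model of that allocate-or-fail shape (pool size and names are made up):

#include <errno.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define POOL_FRAMES 4
#define FRAME_SIZE  2048

/* Toy frame pool standing in for the real XSK buffer pool. */
static uint8_t storage[POOL_FRAMES][FRAME_SIZE];
static int next_free;

static void *pool_alloc(void)
{
	if (next_free >= POOL_FRAMES)
		return NULL;                    /* pool exhausted */
	return storage[next_free++];
}

/* Mirrors the shape of the helper above: allocate one frame, remember
 * where it starts (the frame address, headroom not included), and
 * report -ENOMEM if the pool is empty.
 */
struct rx_slot { void *frame; uintptr_t addr; };

static int slot_alloc(struct rx_slot *slot)
{
	slot->frame = pool_alloc();
	if (!slot->frame)
		return -ENOMEM;
	slot->addr = (uintptr_t)slot->frame;
	return 0;
}

int main(void)
{
	struct rx_slot s;
	int i;

	for (i = 0; i < POOL_FRAMES + 1; i++)
		printf("alloc %d -> %d\n", i, slot_alloc(&s));
	return 0;
}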
 static inline bool mlx5e_xsk_update_rx_wakeup(struct mlx5e_rq *rq, bool alloc_err)
 {
        if (!xsk_umem_uses_need_wakeup(rq->umem))
index 3bcdb5b2fc2034f7819c2a0a86cdbe89b09d5564..83dce9cdb8c2fc1aa02ae207bab741840554ec90 100644 (file)
@@ -5,7 +5,7 @@
 #include "umem.h"
 #include "en/xdp.h"
 #include "en/params.h"
-#include <net/xdp_sock.h>
+#include <net/xdp_sock_drv.h>
 
 int mlx5e_xsk_wakeup(struct net_device *dev, u32 qid, u32 flags)
 {
@@ -92,12 +92,11 @@ bool mlx5e_xsk_tx(struct mlx5e_xdpsq *sq, unsigned int budget)
                        break;
                }
 
-               xdptxd.dma_addr = xdp_umem_get_dma(umem, desc.addr);
-               xdptxd.data = xdp_umem_get_data(umem, desc.addr);
+               xdptxd.dma_addr = xsk_buff_raw_get_dma(umem, desc.addr);
+               xdptxd.data = xsk_buff_raw_get_data(umem, desc.addr);
                xdptxd.len = desc.len;
 
-               dma_sync_single_for_device(sq->pdev, xdptxd.dma_addr,
-                                          xdptxd.len, DMA_BIDIRECTIONAL);
+               xsk_buff_raw_dma_sync_for_device(umem, xdptxd.dma_addr, xdptxd.len);
 
                if (unlikely(!sq->xmit_xdp_frame(sq, &xdptxd, &xdpi, check_result))) {
                        if (sq->mpwqe.wqe)
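On the transmit side, the descriptor's addr field is an offset into the UMEM, and the two raw accessors translate it into a CPU pointer and a DMA address before the per-descriptor sync. A self-contained model of that base-plus-offset translation (the dma_base value below is purely illustrative):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Toy UMEM: one contiguous region; descriptor addresses are offsets
 * into it. The "dma" base is just a second origin to show the
 * parallel translation; real code uses the mapped DMA region.
 */
struct toy_umem {
	uint8_t  *va_base;
	uint64_t  dma_base;
	uint64_t  size;
};

struct toy_desc { uint64_t addr; uint32_t len; };

static void *umem_get_data(struct toy_umem *u, uint64_t addr)
{
	return u->va_base + addr;
}

static uint64_t umem_get_dma(struct toy_umem *u, uint64_t addr)
{
	return u->dma_base + addr;
}

int main(void)
{
	static uint8_t region[1 << 16];
	struct toy_umem umem = { region, 0x10000000ULL, sizeof(region) };
	struct toy_desc desc = { .addr = 4096, .len = 64 };

	memset(umem_get_data(&umem, desc.addr), 0xab, desc.len);
	printf("data %p dma 0x%llx len %u\n",
	       umem_get_data(&umem, desc.addr),
	       (unsigned long long)umem_get_dma(&umem, desc.addr), desc.len);
	return 0;
}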
index 79b487d897570d1c7194d3b4e9b02de29078d246..39fa0a70585690a2599c8bbe61efc06a9a21059e 100644 (file)
@@ -5,7 +5,7 @@
 #define __MLX5_EN_XSK_TX_H__
 
 #include "en.h"
-#include <net/xdp_sock.h>
+#include <net/xdp_sock_drv.h>
 
 /* TX data path */
 
index 4baaa5788320ac47e848ea6ab8249e79301bedbb..7b17fcd0a56d7d9aa7784967324c895d672241c3 100644 (file)
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
 /* Copyright (c) 2019 Mellanox Technologies. */
 
-#include <net/xdp_sock.h>
+#include <net/xdp_sock_drv.h>
 #include "umem.h"
 #include "setup.h"
 #include "en/params.h"
@@ -10,40 +10,14 @@ static int mlx5e_xsk_map_umem(struct mlx5e_priv *priv,
                              struct xdp_umem *umem)
 {
        struct device *dev = priv->mdev->device;
-       u32 i;
 
-       for (i = 0; i < umem->npgs; i++) {
-               dma_addr_t dma = dma_map_page(dev, umem->pgs[i], 0, PAGE_SIZE,
-                                             DMA_BIDIRECTIONAL);
-
-               if (unlikely(dma_mapping_error(dev, dma)))
-                       goto err_unmap;
-               umem->pages[i].dma = dma;
-       }
-
-       return 0;
-
-err_unmap:
-       while (i--) {
-               dma_unmap_page(dev, umem->pages[i].dma, PAGE_SIZE,
-                              DMA_BIDIRECTIONAL);
-               umem->pages[i].dma = 0;
-       }
-
-       return -ENOMEM;
+       return xsk_buff_dma_map(umem, dev, 0);
 }
 
 static void mlx5e_xsk_unmap_umem(struct mlx5e_priv *priv,
                                 struct xdp_umem *umem)
 {
-       struct device *dev = priv->mdev->device;
-       u32 i;
-
-       for (i = 0; i < umem->npgs; i++) {
-               dma_unmap_page(dev, umem->pages[i].dma, PAGE_SIZE,
-                              DMA_BIDIRECTIONAL);
-               umem->pages[i].dma = 0;
-       }
+       return xsk_buff_dma_unmap(umem, 0);
 }
 
 static int mlx5e_xsk_get_umems(struct mlx5e_xsk *xsk)
@@ -90,13 +64,14 @@ static void mlx5e_xsk_remove_umem(struct mlx5e_xsk *xsk, u16 ix)
 
 static bool mlx5e_xsk_is_umem_sane(struct xdp_umem *umem)
 {
-       return umem->headroom <= 0xffff && umem->chunk_size_nohr <= 0xffff;
+       return xsk_umem_get_headroom(umem) <= 0xffff &&
+               xsk_umem_get_chunk_size(umem) <= 0xffff;
 }
 
 void mlx5e_build_xsk_param(struct xdp_umem *umem, struct mlx5e_xsk_param *xsk)
 {
-       xsk->headroom = umem->headroom;
-       xsk->chunk_size = umem->chunk_size_nohr + umem->headroom;
+       xsk->headroom = xsk_umem_get_headroom(umem);
+       xsk->chunk_size = xsk_umem_get_chunk_size(umem);
 }
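The sanity check above bounds both the headroom and the full chunk size to 0xffff, presumably because the driver-side mlx5e_xsk_param keeps them in 16-bit fields, and mlx5e_build_xsk_param no longer adds the headroom itself now that the accessor returns the complete chunk size. A small model of that validate-then-narrow step (assuming 16-bit storage):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Driver-side copy of the UMEM parameters; assuming 16-bit fields,
 * which is what the <= 0xffff checks above guard against overflowing.
 */
struct xsk_param { uint16_t headroom; uint16_t chunk_size; };

static bool umem_is_sane(uint32_t headroom, uint32_t chunk_size)
{
	return headroom <= 0xffff && chunk_size <= 0xffff;
}

static int build_xsk_param(uint32_t headroom, uint32_t chunk_size,
			   struct xsk_param *p)
{
	if (!umem_is_sane(headroom, chunk_size))
		return -1;
	p->headroom   = (uint16_t)headroom;
	p->chunk_size = (uint16_t)chunk_size;
	return 0;
}

int main(void)
{
	struct xsk_param p;

	printf("2048/256  -> %d\n", build_xsk_param(256, 2048, &p));
	printf("oversized -> %d\n", build_xsk_param(256, 1 << 20, &p));
	return 0;
}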
 
 static int mlx5e_xsk_enable_locked(struct mlx5e_priv *priv,
@@ -241,18 +216,6 @@ int mlx5e_xsk_setup_umem(struct net_device *dev, struct xdp_umem *umem, u16 qid)
                      mlx5e_xsk_disable_umem(priv, ix);
 }
 
-int mlx5e_xsk_resize_reuseq(struct xdp_umem *umem, u32 nentries)
-{
-       struct xdp_umem_fq_reuse *reuseq;
-
-       reuseq = xsk_reuseq_prepare(nentries);
-       if (unlikely(!reuseq))
-               return -ENOMEM;
-       xsk_reuseq_free(xsk_reuseq_swap(umem, reuseq));
-
-       return 0;
-}
-
 u16 mlx5e_xsk_first_unused_channel(struct mlx5e_params *params, struct mlx5e_xsk *xsk)
 {
        u16 res = xsk->refcnt ? params->num_channels : 0;
index 07823abe555755dca69c18d54be103439b5cacc3..8ff9cbe6943db6844aa357ed9ab7108869f931e0 100644 (file)
@@ -38,7 +38,7 @@
 #include <linux/bpf.h>
 #include <linux/if_bridge.h>
 #include <net/page_pool.h>
-#include <net/xdp_sock.h>
+#include <net/xdp_sock_drv.h>
 #include "eswitch.h"
 #include "en.h"
 #include "en/txrx.h"
@@ -373,7 +373,6 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
        struct mlx5_core_dev *mdev = c->mdev;
        void *rqc = rqp->rqc;
        void *rqc_wq = MLX5_ADDR_OF(rqc, rqc, wq);
-       u32 num_xsk_frames = 0;
        u32 rq_xdp_ix;
        u32 pool_size;
        int wq_sz;
@@ -413,7 +412,6 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 
        rq->buff.map_dir = rq->xdp_prog ? DMA_BIDIRECTIONAL : DMA_FROM_DEVICE;
        rq->buff.headroom = mlx5e_get_rq_headroom(mdev, params, xsk);
-       rq->buff.umem_headroom = xsk ? xsk->headroom : 0;
        pool_size = 1 << params->log_rq_mtu_frames;
 
        switch (rq->wq_type) {
@@ -427,10 +425,6 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 
                wq_sz = mlx5_wq_ll_get_size(&rq->mpwqe.wq);
 
-               if (xsk)
-                       num_xsk_frames = wq_sz <<
-                               mlx5e_mpwqe_get_log_num_strides(mdev, params, xsk);
-
                pool_size = MLX5_MPWRQ_PAGES_PER_WQE <<
                        mlx5e_mpwqe_get_log_rq_size(params, xsk);
 
@@ -482,9 +476,6 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 
                wq_sz = mlx5_wq_cyc_get_size(&rq->wqe.wq);
 
-               if (xsk)
-                       num_xsk_frames = wq_sz << rq->wqe.info.log_num_frags;
-
                rq->wqe.info = rqp->frags_info;
                rq->buff.frame0_sz = rq->wqe.info.arr[0].frag_stride;
 
@@ -525,19 +516,9 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
        }
 
        if (xsk) {
-               rq->buff.frame0_sz = xsk_umem_xdp_frame_sz(umem);
-
-               err = mlx5e_xsk_resize_reuseq(umem, num_xsk_frames);
-               if (unlikely(err)) {
-                       mlx5_core_err(mdev, "Unable to allocate the Reuse Ring for %u frames\n",
-                                     num_xsk_frames);
-                       goto err_free;
-               }
-
-               rq->zca.free = mlx5e_xsk_zca_free;
                err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq,
-                                                MEM_TYPE_ZERO_COPY,
-                                                &rq->zca);
+                                                MEM_TYPE_XSK_BUFF_POOL, NULL);
+               xsk_buff_set_rxq_info(rq->umem, &rq->xdp_rxq);
        } else {
                /* Create a page_pool and register it with rxq */
                pp_params.order     = 0;
@@ -3539,41 +3520,6 @@ out:
        return err;
 }
 
-#ifdef CONFIG_MLX5_ESWITCH
-static int mlx5e_setup_tc_cls_flower(struct mlx5e_priv *priv,
-                                    struct flow_cls_offload *cls_flower,
-                                    unsigned long flags)
-{
-       switch (cls_flower->command) {
-       case FLOW_CLS_REPLACE:
-               return mlx5e_configure_flower(priv->netdev, priv, cls_flower,
-                                             flags);
-       case FLOW_CLS_DESTROY:
-               return mlx5e_delete_flower(priv->netdev, priv, cls_flower,
-                                          flags);
-       case FLOW_CLS_STATS:
-               return mlx5e_stats_flower(priv->netdev, priv, cls_flower,
-                                         flags);
-       default:
-               return -EOPNOTSUPP;
-       }
-}
-
-static int mlx5e_setup_tc_block_cb(enum tc_setup_type type, void *type_data,
-                                  void *cb_priv)
-{
-       unsigned long flags = MLX5_TC_FLAG(INGRESS) | MLX5_TC_FLAG(NIC_OFFLOAD);
-       struct mlx5e_priv *priv = cb_priv;
-
-       switch (type) {
-       case TC_SETUP_CLSFLOWER:
-               return mlx5e_setup_tc_cls_flower(priv, type_data, flags);
-       default:
-               return -EOPNOTSUPP;
-       }
-}
-#endif
-
 static LIST_HEAD(mlx5e_block_cb_list);
 
 static int mlx5e_setup_tc(struct net_device *dev, enum tc_setup_type type,
@@ -3582,7 +3528,6 @@ static int mlx5e_setup_tc(struct net_device *dev, enum tc_setup_type type,
        struct mlx5e_priv *priv = netdev_priv(dev);
 
        switch (type) {
-#ifdef CONFIG_MLX5_ESWITCH
        case TC_SETUP_BLOCK: {
                struct flow_block_offload *f = type_data;
 
@@ -3592,7 +3537,6 @@ static int mlx5e_setup_tc(struct net_device *dev, enum tc_setup_type type,
                                                  mlx5e_setup_tc_block_cb,
                                                  priv, priv, true);
        }
-#endif
        case TC_SETUP_QDISC_MQPRIO:
                return mlx5e_setup_tc_mqprio(priv, type_data);
        default:
@@ -3765,7 +3709,7 @@ static int set_feature_cvlan_filter(struct net_device *netdev, bool enable)
        return 0;
 }
 
-#ifdef CONFIG_MLX5_ESWITCH
+#if IS_ENABLED(CONFIG_MLX5_CLS_ACT)
 static int set_feature_tc_num_filters(struct net_device *netdev, bool enable)
 {
        struct mlx5e_priv *priv = netdev_priv(netdev);
@@ -3876,7 +3820,7 @@ int mlx5e_set_features(struct net_device *netdev, netdev_features_t features)
        err |= MLX5E_HANDLE_FEATURE(NETIF_F_LRO, set_feature_lro);
        err |= MLX5E_HANDLE_FEATURE(NETIF_F_HW_VLAN_CTAG_FILTER,
                                    set_feature_cvlan_filter);
-#ifdef CONFIG_MLX5_ESWITCH
+#if IS_ENABLED(CONFIG_MLX5_CLS_ACT)
        err |= MLX5E_HANDLE_FEATURE(NETIF_F_HW_TC, set_feature_tc_num_filters);
 #endif
        err |= MLX5E_HANDLE_FEATURE(NETIF_F_RXALL, set_feature_rx_all);
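The feature hunk above keeps the usual pattern of running every feature handler and OR-ing the results, so one failing toggle does not prevent the remaining features from being applied; only the config guard around the TC-filter handler changes. A simplified user-space model of that accumulate-errors loop (handlers and return values are invented for illustration):

#include <stdbool.h>
#include <stdio.h>

typedef int (*feature_handler)(bool enable);

static int set_lro(bool on)         { printf("lro %d\n", on);  return 0; }
static int set_tc_filters(bool on)  { printf("tc %d\n", on);   return on ? -1 : 0; }
static int set_vlan_filter(bool on) { printf("vlan %d\n", on); return 0; }

int main(void)
{
	struct { feature_handler fn; bool enable; } table[] = {
		{ set_lro, true },
		{ set_tc_filters, true },   /* fails, but later handlers still run */
		{ set_vlan_filter, true },
	};
	int err = 0;
	unsigned int i;

	/* Every handler runs; any failure is remembered in err. */
	for (i = 0; i < sizeof(table) / sizeof(table[0]); i++)
		err |= table[i].fn(table[i].enable);

	printf("any failure: %d\n", err ? 1 : 0);
	return 0;
}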
index 52351c1056277e801c9b38f5571f1251b35cf30a..a46405c6d5600e6767f827cbbff36207249d293a 100644 (file)
@@ -35,7 +35,6 @@
 #include <net/switchdev.h>
 #include <net/pkt_cls.h>
 #include <net/act_api.h>
-#include <net/netevent.h>
 #include <net/arp.h>
 #include <net/devlink.h>
 #include <net/ipv6_stubs.h>
@@ -45,9 +44,9 @@
 #include "en.h"
 #include "en_rep.h"
 #include "en_tc.h"
-#include "en/tc_tun.h"
+#include "en/rep/tc.h"
+#include "en/rep/neigh.h"
 #include "fs_core.h"
-#include "lib/port_tun.h"
 #include "lib/mlx5.h"
 #define CREATE_TRACE_POINTS
 #include "diag/en_rep_tracepoint.h"
 
 static const char mlx5e_rep_driver_name[] = "mlx5e_rep";
 
-struct mlx5e_rep_indr_block_priv {
-       struct net_device *netdev;
-       struct mlx5e_rep_priv *rpriv;
-
-       struct list_head list;
-};
-
-static void mlx5e_rep_indr_unregister_block(struct mlx5e_rep_priv *rpriv,
-                                           struct net_device *netdev);
-
 static void mlx5e_rep_get_drvinfo(struct net_device *dev,
                                  struct ethtool_drvinfo *drvinfo)
 {
@@ -485,706 +474,6 @@ void mlx5e_remove_sqs_fwd_rules(struct mlx5e_priv *priv)
        mlx5e_sqs2vport_stop(esw, rep);
 }
 
-static unsigned long mlx5e_rep_ipv6_interval(void)
-{
-       if (IS_ENABLED(CONFIG_IPV6) && ipv6_stub->nd_tbl)
-               return NEIGH_VAR(&ipv6_stub->nd_tbl->parms, DELAY_PROBE_TIME);
-
-       return ~0UL;
-}
-
-static void mlx5e_rep_neigh_update_init_interval(struct mlx5e_rep_priv *rpriv)
-{
-       unsigned long ipv4_interval = NEIGH_VAR(&arp_tbl.parms, DELAY_PROBE_TIME);
-       unsigned long ipv6_interval = mlx5e_rep_ipv6_interval();
-       struct net_device *netdev = rpriv->netdev;
-       struct mlx5e_priv *priv = netdev_priv(netdev);
-
-       rpriv->neigh_update.min_interval = min_t(unsigned long, ipv6_interval, ipv4_interval);
-       mlx5_fc_update_sampling_interval(priv->mdev, rpriv->neigh_update.min_interval);
-}
-
-void mlx5e_rep_queue_neigh_stats_work(struct mlx5e_priv *priv)
-{
-       struct mlx5e_rep_priv *rpriv = priv->ppriv;
-       struct mlx5e_neigh_update_table *neigh_update = &rpriv->neigh_update;
-
-       mlx5_fc_queue_stats_work(priv->mdev,
-                                &neigh_update->neigh_stats_work,
-                                neigh_update->min_interval);
-}
-
-static bool mlx5e_rep_neigh_entry_hold(struct mlx5e_neigh_hash_entry *nhe)
-{
-       return refcount_inc_not_zero(&nhe->refcnt);
-}
-
-static void mlx5e_rep_neigh_entry_remove(struct mlx5e_neigh_hash_entry *nhe);
-
-static void mlx5e_rep_neigh_entry_release(struct mlx5e_neigh_hash_entry *nhe)
-{
-       if (refcount_dec_and_test(&nhe->refcnt)) {
-               mlx5e_rep_neigh_entry_remove(nhe);
-               kfree_rcu(nhe, rcu);
-       }
-}
-
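The removed hold/release helpers above implement the usual lookup-safe refcounting: a lookup may only take a reference while the count is still non-zero, and the final release unhashes the entry and frees it via RCU. A single-threaded sketch of that discipline (plain integers stand in for refcount_t, and the RCU free is only a comment):

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct entry {
	int refcnt;
	bool removed;
};

static bool entry_hold(struct entry *e)
{
	if (e->refcnt == 0)
		return false;      /* already being torn down */
	e->refcnt++;
	return true;
}

static void entry_release(struct entry *e)
{
	if (--e->refcnt == 0) {
		e->removed = true; /* would unhash and kfree_rcu() here */
		printf("entry removed\n");
	}
}

int main(void)
{
	struct entry *e = calloc(1, sizeof(*e));

	e->refcnt = 1;                       /* creation reference          */
	printf("hold: %d\n", entry_hold(e)); /* lookup succeeds             */
	entry_release(e);                    /* lookup's reference          */
	entry_release(e);                    /* creation ref: last one frees */
	printf("hold after removal: %d\n", entry_hold(e));
	free(e);
	return 0;
}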
-static struct mlx5e_neigh_hash_entry *
-mlx5e_get_next_nhe(struct mlx5e_rep_priv *rpriv,
-                  struct mlx5e_neigh_hash_entry *nhe)
-{
-       struct mlx5e_neigh_hash_entry *next = NULL;
-
-       rcu_read_lock();
-
-       for (next = nhe ?
-                    list_next_or_null_rcu(&rpriv->neigh_update.neigh_list,
-                                          &nhe->neigh_list,
-                                          struct mlx5e_neigh_hash_entry,
-                                          neigh_list) :
-                    list_first_or_null_rcu(&rpriv->neigh_update.neigh_list,
-                                           struct mlx5e_neigh_hash_entry,
-                                           neigh_list);
-            next;
-            next = list_next_or_null_rcu(&rpriv->neigh_update.neigh_list,
-                                         &next->neigh_list,
-                                         struct mlx5e_neigh_hash_entry,
-                                         neigh_list))
-               if (mlx5e_rep_neigh_entry_hold(next))
-                       break;
-
-       rcu_read_unlock();
-
-       if (nhe)
-               mlx5e_rep_neigh_entry_release(nhe);
-
-       return next;
-}
-
-static void mlx5e_rep_neigh_stats_work(struct work_struct *work)
-{
-       struct mlx5e_rep_priv *rpriv = container_of(work, struct mlx5e_rep_priv,
-                                                   neigh_update.neigh_stats_work.work);
-       struct net_device *netdev = rpriv->netdev;
-       struct mlx5e_priv *priv = netdev_priv(netdev);
-       struct mlx5e_neigh_hash_entry *nhe = NULL;
-
-       rtnl_lock();
-       if (!list_empty(&rpriv->neigh_update.neigh_list))
-               mlx5e_rep_queue_neigh_stats_work(priv);
-
-       while ((nhe = mlx5e_get_next_nhe(rpriv, nhe)) != NULL)
-               mlx5e_tc_update_neigh_used_value(nhe);
-
-       rtnl_unlock();
-}
-
-static void mlx5e_rep_update_flows(struct mlx5e_priv *priv,
-                                  struct mlx5e_encap_entry *e,
-                                  bool neigh_connected,
-                                  unsigned char ha[ETH_ALEN])
-{
-       struct ethhdr *eth = (struct ethhdr *)e->encap_header;
-       struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
-       bool encap_connected;
-       LIST_HEAD(flow_list);
-
-       ASSERT_RTNL();
-
-       /* wait for encap to be fully initialized */
-       wait_for_completion(&e->res_ready);
-
-       mutex_lock(&esw->offloads.encap_tbl_lock);
-       encap_connected = !!(e->flags & MLX5_ENCAP_ENTRY_VALID);
-       if (e->compl_result < 0 || (encap_connected == neigh_connected &&
-                                   ether_addr_equal(e->h_dest, ha)))
-               goto unlock;
-
-       mlx5e_take_all_encap_flows(e, &flow_list);
-
-       if ((e->flags & MLX5_ENCAP_ENTRY_VALID) &&
-           (!neigh_connected || !ether_addr_equal(e->h_dest, ha)))
-               mlx5e_tc_encap_flows_del(priv, e, &flow_list);
-
-       if (neigh_connected && !(e->flags & MLX5_ENCAP_ENTRY_VALID)) {
-               ether_addr_copy(e->h_dest, ha);
-               ether_addr_copy(eth->h_dest, ha);
-               /* Update the encap source mac, in case that we delete
-                * the flows when encap source mac changed.
-                */
-               ether_addr_copy(eth->h_source, e->route_dev->dev_addr);
-
-               mlx5e_tc_encap_flows_add(priv, e, &flow_list);
-       }
-unlock:
-       mutex_unlock(&esw->offloads.encap_tbl_lock);
-       mlx5e_put_encap_flow_list(priv, &flow_list);
-}
-
-static void mlx5e_rep_neigh_update(struct work_struct *work)
-{
-       struct mlx5e_neigh_hash_entry *nhe =
-               container_of(work, struct mlx5e_neigh_hash_entry, neigh_update_work);
-       struct neighbour *n = nhe->n;
-       struct mlx5e_encap_entry *e;
-       unsigned char ha[ETH_ALEN];
-       struct mlx5e_priv *priv;
-       bool neigh_connected;
-       u8 nud_state, dead;
-
-       rtnl_lock();
-
-       /* If these parameters are changed after we release the lock,
-        * we'll receive another event letting us know about it.
-        * We use this lock to avoid inconsistency between the neigh validity
-        * and it's hw address.
-        */
-       read_lock_bh(&n->lock);
-       memcpy(ha, n->ha, ETH_ALEN);
-       nud_state = n->nud_state;
-       dead = n->dead;
-       read_unlock_bh(&n->lock);
-
-       neigh_connected = (nud_state & NUD_VALID) && !dead;
-
-       trace_mlx5e_rep_neigh_update(nhe, ha, neigh_connected);
-
-       list_for_each_entry(e, &nhe->encap_list, encap_list) {
-               if (!mlx5e_encap_take(e))
-                       continue;
-
-               priv = netdev_priv(e->out_dev);
-               mlx5e_rep_update_flows(priv, e, neigh_connected, ha);
-               mlx5e_encap_put(priv, e);
-       }
-       mlx5e_rep_neigh_entry_release(nhe);
-       rtnl_unlock();
-       neigh_release(n);
-}
-
-static struct mlx5e_rep_indr_block_priv *
-mlx5e_rep_indr_block_priv_lookup(struct mlx5e_rep_priv *rpriv,
-                                struct net_device *netdev)
-{
-       struct mlx5e_rep_indr_block_priv *cb_priv;
-
-       /* All callback list access should be protected by RTNL. */
-       ASSERT_RTNL();
-
-       list_for_each_entry(cb_priv,
-                           &rpriv->uplink_priv.tc_indr_block_priv_list,
-                           list)
-               if (cb_priv->netdev == netdev)
-                       return cb_priv;
-
-       return NULL;
-}
-
-static void mlx5e_rep_indr_clean_block_privs(struct mlx5e_rep_priv *rpriv)
-{
-       struct mlx5e_rep_indr_block_priv *cb_priv, *temp;
-       struct list_head *head = &rpriv->uplink_priv.tc_indr_block_priv_list;
-
-       list_for_each_entry_safe(cb_priv, temp, head, list) {
-               mlx5e_rep_indr_unregister_block(rpriv, cb_priv->netdev);
-               kfree(cb_priv);
-       }
-}
-
-static int
-mlx5e_rep_indr_offload(struct net_device *netdev,
-                      struct flow_cls_offload *flower,
-                      struct mlx5e_rep_indr_block_priv *indr_priv,
-                      unsigned long flags)
-{
-       struct mlx5e_priv *priv = netdev_priv(indr_priv->rpriv->netdev);
-       int err = 0;
-
-       switch (flower->command) {
-       case FLOW_CLS_REPLACE:
-               err = mlx5e_configure_flower(netdev, priv, flower, flags);
-               break;
-       case FLOW_CLS_DESTROY:
-               err = mlx5e_delete_flower(netdev, priv, flower, flags);
-               break;
-       case FLOW_CLS_STATS:
-               err = mlx5e_stats_flower(netdev, priv, flower, flags);
-               break;
-       default:
-               err = -EOPNOTSUPP;
-       }
-
-       return err;
-}
-
-static int mlx5e_rep_indr_setup_tc_cb(enum tc_setup_type type,
-                                     void *type_data, void *indr_priv)
-{
-       unsigned long flags = MLX5_TC_FLAG(EGRESS) | MLX5_TC_FLAG(ESW_OFFLOAD);
-       struct mlx5e_rep_indr_block_priv *priv = indr_priv;
-
-       switch (type) {
-       case TC_SETUP_CLSFLOWER:
-               return mlx5e_rep_indr_offload(priv->netdev, type_data, priv,
-                                             flags);
-       default:
-               return -EOPNOTSUPP;
-       }
-}
-
-static int mlx5e_rep_indr_setup_ft_cb(enum tc_setup_type type,
-                                     void *type_data, void *indr_priv)
-{
-       struct mlx5e_rep_indr_block_priv *priv = indr_priv;
-       struct flow_cls_offload *f = type_data;
-       struct flow_cls_offload tmp;
-       struct mlx5e_priv *mpriv;
-       struct mlx5_eswitch *esw;
-       unsigned long flags;
-       int err;
-
-       mpriv = netdev_priv(priv->rpriv->netdev);
-       esw = mpriv->mdev->priv.eswitch;
-
-       flags = MLX5_TC_FLAG(EGRESS) |
-               MLX5_TC_FLAG(ESW_OFFLOAD) |
-               MLX5_TC_FLAG(FT_OFFLOAD);
-
-       switch (type) {
-       case TC_SETUP_CLSFLOWER:
-               memcpy(&tmp, f, sizeof(*f));
-
-               /* Re-use tc offload path by moving the ft flow to the
-                * reserved ft chain.
-                *
-                * FT offload can use prio range [0, INT_MAX], so we normalize
-                * it to range [1, mlx5_esw_chains_get_prio_range(esw)]
-                * as with tc, where prio 0 isn't supported.
-                *
-                * We only support chain 0 of FT offload.
-                */
-               if (!mlx5_esw_chains_prios_supported(esw) ||
-                   tmp.common.prio >= mlx5_esw_chains_get_prio_range(esw) ||
-                   tmp.common.chain_index)
-                       return -EOPNOTSUPP;
-
-               tmp.common.chain_index = mlx5_esw_chains_get_ft_chain(esw);
-               tmp.common.prio++;
-               err = mlx5e_rep_indr_offload(priv->netdev, &tmp, priv, flags);
-               memcpy(&f->stats, &tmp.stats, sizeof(f->stats));
-               return err;
-       default:
-               return -EOPNOTSUPP;
-       }
-}
-
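The comment in the removed callback above describes how an FT rule is folded onto the reserved ft chain: only chain 0 is accepted, the priority must fit below the supported range, and it is shifted up by one because tc-style priorities start at 1. A minimal model of that normalization (the chain id and range below are made-up numbers):

#include <stdio.h>

#define FT_CHAIN   4   /* hypothetical reserved chain id */
#define PRIO_RANGE 8   /* hypothetical supported range   */

static int normalize_ft_rule(unsigned int *chain, unsigned int *prio)
{
	if (*chain != 0)            /* only chain 0 of FT offload supported */
		return -1;
	if (*prio >= PRIO_RANGE)    /* would fall outside [1, PRIO_RANGE]   */
		return -1;

	*chain = FT_CHAIN;          /* move onto the reserved ft chain      */
	*prio += 1;                 /* prio 0 is not supported by tc        */
	return 0;
}

int main(void)
{
	unsigned int chain = 0, prio = 3;

	if (!normalize_ft_rule(&chain, &prio))
		printf("offload on chain %u prio %u\n", chain, prio);

	chain = 0;
	prio = PRIO_RANGE;          /* too high: rejected */
	printf("reject: %d\n", normalize_ft_rule(&chain, &prio));
	return 0;
}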
-static void mlx5e_rep_indr_block_unbind(void *cb_priv)
-{
-       struct mlx5e_rep_indr_block_priv *indr_priv = cb_priv;
-
-       list_del(&indr_priv->list);
-       kfree(indr_priv);
-}
-
-static LIST_HEAD(mlx5e_block_cb_list);
-
-static int
-mlx5e_rep_indr_setup_block(struct net_device *netdev,
-                          struct mlx5e_rep_priv *rpriv,
-                          struct flow_block_offload *f,
-                          flow_setup_cb_t *setup_cb)
-{
-       struct mlx5e_rep_indr_block_priv *indr_priv;
-       struct flow_block_cb *block_cb;
-
-       if (f->binder_type != FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS)
-               return -EOPNOTSUPP;
-
-       f->unlocked_driver_cb = true;
-       f->driver_block_list = &mlx5e_block_cb_list;
-
-       switch (f->command) {
-       case FLOW_BLOCK_BIND:
-               indr_priv = mlx5e_rep_indr_block_priv_lookup(rpriv, netdev);
-               if (indr_priv)
-                       return -EEXIST;
-
-               indr_priv = kmalloc(sizeof(*indr_priv), GFP_KERNEL);
-               if (!indr_priv)
-                       return -ENOMEM;
-
-               indr_priv->netdev = netdev;
-               indr_priv->rpriv = rpriv;
-               list_add(&indr_priv->list,
-                        &rpriv->uplink_priv.tc_indr_block_priv_list);
-
-               block_cb = flow_block_cb_alloc(setup_cb, indr_priv, indr_priv,
-                                              mlx5e_rep_indr_block_unbind);
-               if (IS_ERR(block_cb)) {
-                       list_del(&indr_priv->list);
-                       kfree(indr_priv);
-                       return PTR_ERR(block_cb);
-               }
-               flow_block_cb_add(block_cb, f);
-               list_add_tail(&block_cb->driver_list, &mlx5e_block_cb_list);
-
-               return 0;
-       case FLOW_BLOCK_UNBIND:
-               indr_priv = mlx5e_rep_indr_block_priv_lookup(rpriv, netdev);
-               if (!indr_priv)
-                       return -ENOENT;
-
-               block_cb = flow_block_cb_lookup(f->block, setup_cb, indr_priv);
-               if (!block_cb)
-                       return -ENOENT;
-
-               flow_block_cb_remove(block_cb, f);
-               list_del(&block_cb->driver_list);
-               return 0;
-       default:
-               return -EOPNOTSUPP;
-       }
-       return 0;
-}
-
-static
-int mlx5e_rep_indr_setup_cb(struct net_device *netdev, void *cb_priv,
-                           enum tc_setup_type type, void *type_data)
-{
-       switch (type) {
-       case TC_SETUP_BLOCK:
-               return mlx5e_rep_indr_setup_block(netdev, cb_priv, type_data,
-                                                 mlx5e_rep_indr_setup_tc_cb);
-       case TC_SETUP_FT:
-               return mlx5e_rep_indr_setup_block(netdev, cb_priv, type_data,
-                                                 mlx5e_rep_indr_setup_ft_cb);
-       default:
-               return -EOPNOTSUPP;
-       }
-}
-
-static int mlx5e_rep_indr_register_block(struct mlx5e_rep_priv *rpriv,
-                                        struct net_device *netdev)
-{
-       int err;
-
-       err = __flow_indr_block_cb_register(netdev, rpriv,
-                                           mlx5e_rep_indr_setup_cb,
-                                           rpriv);
-       if (err) {
-               struct mlx5e_priv *priv = netdev_priv(rpriv->netdev);
-
-               mlx5_core_err(priv->mdev, "Failed to register remote block notifier for %s err=%d\n",
-                             netdev_name(netdev), err);
-       }
-       return err;
-}
-
-static void mlx5e_rep_indr_unregister_block(struct mlx5e_rep_priv *rpriv,
-                                           struct net_device *netdev)
-{
-       __flow_indr_block_cb_unregister(netdev, mlx5e_rep_indr_setup_cb,
-                                       rpriv);
-}
-
-static int mlx5e_nic_rep_netdevice_event(struct notifier_block *nb,
-                                        unsigned long event, void *ptr)
-{
-       struct mlx5e_rep_priv *rpriv = container_of(nb, struct mlx5e_rep_priv,
-                                                    uplink_priv.netdevice_nb);
-       struct mlx5e_priv *priv = netdev_priv(rpriv->netdev);
-       struct net_device *netdev = netdev_notifier_info_to_dev(ptr);
-
-       if (!mlx5e_tc_tun_device_to_offload(priv, netdev) &&
-           !(is_vlan_dev(netdev) && vlan_dev_real_dev(netdev) == rpriv->netdev))
-               return NOTIFY_OK;
-
-       switch (event) {
-       case NETDEV_REGISTER:
-               mlx5e_rep_indr_register_block(rpriv, netdev);
-               break;
-       case NETDEV_UNREGISTER:
-               mlx5e_rep_indr_unregister_block(rpriv, netdev);
-               break;
-       }
-       return NOTIFY_OK;
-}
-
-static void
-mlx5e_rep_queue_neigh_update_work(struct mlx5e_priv *priv,
-                                 struct mlx5e_neigh_hash_entry *nhe,
-                                 struct neighbour *n)
-{
-       /* Take a reference to ensure the neighbour and mlx5 encap
-        * entry won't be destructed until we drop the reference in
-        * delayed work.
-        */
-       neigh_hold(n);
-
-       /* This assignment is valid as long as the the neigh reference
-        * is taken
-        */
-       nhe->n = n;
-
-       if (!queue_work(priv->wq, &nhe->neigh_update_work)) {
-               mlx5e_rep_neigh_entry_release(nhe);
-               neigh_release(n);
-       }
-}
-
-static struct mlx5e_neigh_hash_entry *
-mlx5e_rep_neigh_entry_lookup(struct mlx5e_priv *priv,
-                            struct mlx5e_neigh *m_neigh);
-
-static int mlx5e_rep_netevent_event(struct notifier_block *nb,
-                                   unsigned long event, void *ptr)
-{
-       struct mlx5e_rep_priv *rpriv = container_of(nb, struct mlx5e_rep_priv,
-                                                   neigh_update.netevent_nb);
-       struct mlx5e_neigh_update_table *neigh_update = &rpriv->neigh_update;
-       struct net_device *netdev = rpriv->netdev;
-       struct mlx5e_priv *priv = netdev_priv(netdev);
-       struct mlx5e_neigh_hash_entry *nhe = NULL;
-       struct mlx5e_neigh m_neigh = {};
-       struct neigh_parms *p;
-       struct neighbour *n;
-       bool found = false;
-
-       switch (event) {
-       case NETEVENT_NEIGH_UPDATE:
-               n = ptr;
-#if IS_ENABLED(CONFIG_IPV6)
-               if (n->tbl != ipv6_stub->nd_tbl && n->tbl != &arp_tbl)
-#else
-               if (n->tbl != &arp_tbl)
-#endif
-                       return NOTIFY_DONE;
-
-               m_neigh.dev = n->dev;
-               m_neigh.family = n->ops->family;
-               memcpy(&m_neigh.dst_ip, n->primary_key, n->tbl->key_len);
-
-               rcu_read_lock();
-               nhe = mlx5e_rep_neigh_entry_lookup(priv, &m_neigh);
-               rcu_read_unlock();
-               if (!nhe)
-                       return NOTIFY_DONE;
-
-               mlx5e_rep_queue_neigh_update_work(priv, nhe, n);
-               break;
-
-       case NETEVENT_DELAY_PROBE_TIME_UPDATE:
-               p = ptr;
-
-               /* We check the device is present since we don't care about
-                * changes in the default table, we only care about changes
-                * done per device delay prob time parameter.
-                */
-#if IS_ENABLED(CONFIG_IPV6)
-               if (!p->dev || (p->tbl != ipv6_stub->nd_tbl && p->tbl != &arp_tbl))
-#else
-               if (!p->dev || p->tbl != &arp_tbl)
-#endif
-                       return NOTIFY_DONE;
-
-               rcu_read_lock();
-               list_for_each_entry_rcu(nhe, &neigh_update->neigh_list,
-                                       neigh_list) {
-                       if (p->dev == nhe->m_neigh.dev) {
-                               found = true;
-                               break;
-                       }
-               }
-               rcu_read_unlock();
-               if (!found)
-                       return NOTIFY_DONE;
-
-               neigh_update->min_interval = min_t(unsigned long,
-                                                  NEIGH_VAR(p, DELAY_PROBE_TIME),
-                                                  neigh_update->min_interval);
-               mlx5_fc_update_sampling_interval(priv->mdev,
-                                                neigh_update->min_interval);
-               break;
-       }
-       return NOTIFY_DONE;
-}
-
-static const struct rhashtable_params mlx5e_neigh_ht_params = {
-       .head_offset = offsetof(struct mlx5e_neigh_hash_entry, rhash_node),
-       .key_offset = offsetof(struct mlx5e_neigh_hash_entry, m_neigh),
-       .key_len = sizeof(struct mlx5e_neigh),
-       .automatic_shrinking = true,
-};
-
-static int mlx5e_rep_neigh_init(struct mlx5e_rep_priv *rpriv)
-{
-       struct mlx5e_neigh_update_table *neigh_update = &rpriv->neigh_update;
-       int err;
-
-       err = rhashtable_init(&neigh_update->neigh_ht, &mlx5e_neigh_ht_params);
-       if (err)
-               return err;
-
-       INIT_LIST_HEAD(&neigh_update->neigh_list);
-       mutex_init(&neigh_update->encap_lock);
-       INIT_DELAYED_WORK(&neigh_update->neigh_stats_work,
-                         mlx5e_rep_neigh_stats_work);
-       mlx5e_rep_neigh_update_init_interval(rpriv);
-
-       rpriv->neigh_update.netevent_nb.notifier_call = mlx5e_rep_netevent_event;
-       err = register_netevent_notifier(&rpriv->neigh_update.netevent_nb);
-       if (err)
-               goto out_err;
-       return 0;
-
-out_err:
-       rhashtable_destroy(&neigh_update->neigh_ht);
-       return err;
-}
-
-static void mlx5e_rep_neigh_cleanup(struct mlx5e_rep_priv *rpriv)
-{
-       struct mlx5e_neigh_update_table *neigh_update = &rpriv->neigh_update;
-       struct mlx5e_priv *priv = netdev_priv(rpriv->netdev);
-
-       unregister_netevent_notifier(&neigh_update->netevent_nb);
-
-       flush_workqueue(priv->wq); /* flush neigh update works */
-
-       cancel_delayed_work_sync(&rpriv->neigh_update.neigh_stats_work);
-
-       mutex_destroy(&neigh_update->encap_lock);
-       rhashtable_destroy(&neigh_update->neigh_ht);
-}
-
-static int mlx5e_rep_neigh_entry_insert(struct mlx5e_priv *priv,
-                                       struct mlx5e_neigh_hash_entry *nhe)
-{
-       struct mlx5e_rep_priv *rpriv = priv->ppriv;
-       int err;
-
-       err = rhashtable_insert_fast(&rpriv->neigh_update.neigh_ht,
-                                    &nhe->rhash_node,
-                                    mlx5e_neigh_ht_params);
-       if (err)
-               return err;
-
-       list_add_rcu(&nhe->neigh_list, &rpriv->neigh_update.neigh_list);
-
-       return err;
-}
-
-static void mlx5e_rep_neigh_entry_remove(struct mlx5e_neigh_hash_entry *nhe)
-{
-       struct mlx5e_rep_priv *rpriv = nhe->priv->ppriv;
-
-       mutex_lock(&rpriv->neigh_update.encap_lock);
-
-       list_del_rcu(&nhe->neigh_list);
-
-       rhashtable_remove_fast(&rpriv->neigh_update.neigh_ht,
-                              &nhe->rhash_node,
-                              mlx5e_neigh_ht_params);
-       mutex_unlock(&rpriv->neigh_update.encap_lock);
-}
-
-/* This function must only be called under the representor's encap_lock or
- * inside rcu read lock section.
- */
-static struct mlx5e_neigh_hash_entry *
-mlx5e_rep_neigh_entry_lookup(struct mlx5e_priv *priv,
-                            struct mlx5e_neigh *m_neigh)
-{
-       struct mlx5e_rep_priv *rpriv = priv->ppriv;
-       struct mlx5e_neigh_update_table *neigh_update = &rpriv->neigh_update;
-       struct mlx5e_neigh_hash_entry *nhe;
-
-       nhe = rhashtable_lookup_fast(&neigh_update->neigh_ht, m_neigh,
-                                    mlx5e_neigh_ht_params);
-       return nhe && mlx5e_rep_neigh_entry_hold(nhe) ? nhe : NULL;
-}
-
-static int mlx5e_rep_neigh_entry_create(struct mlx5e_priv *priv,
-                                       struct mlx5e_encap_entry *e,
-                                       struct mlx5e_neigh_hash_entry **nhe)
-{
-       int err;
-
-       *nhe = kzalloc(sizeof(**nhe), GFP_KERNEL);
-       if (!*nhe)
-               return -ENOMEM;
-
-       (*nhe)->priv = priv;
-       memcpy(&(*nhe)->m_neigh, &e->m_neigh, sizeof(e->m_neigh));
-       INIT_WORK(&(*nhe)->neigh_update_work, mlx5e_rep_neigh_update);
-       spin_lock_init(&(*nhe)->encap_list_lock);
-       INIT_LIST_HEAD(&(*nhe)->encap_list);
-       refcount_set(&(*nhe)->refcnt, 1);
-
-       err = mlx5e_rep_neigh_entry_insert(priv, *nhe);
-       if (err)
-               goto out_free;
-       return 0;
-
-out_free:
-       kfree(*nhe);
-       return err;
-}
-
-int mlx5e_rep_encap_entry_attach(struct mlx5e_priv *priv,
-                                struct mlx5e_encap_entry *e)
-{
-       struct mlx5e_rep_priv *rpriv = priv->ppriv;
-       struct mlx5_rep_uplink_priv *uplink_priv = &rpriv->uplink_priv;
-       struct mlx5_tun_entropy *tun_entropy = &uplink_priv->tun_entropy;
-       struct mlx5e_neigh_hash_entry *nhe;
-       int err;
-
-       err = mlx5_tun_entropy_refcount_inc(tun_entropy, e->reformat_type);
-       if (err)
-               return err;
-
-       mutex_lock(&rpriv->neigh_update.encap_lock);
-       nhe = mlx5e_rep_neigh_entry_lookup(priv, &e->m_neigh);
-       if (!nhe) {
-               err = mlx5e_rep_neigh_entry_create(priv, e, &nhe);
-               if (err) {
-                       mutex_unlock(&rpriv->neigh_update.encap_lock);
-                       mlx5_tun_entropy_refcount_dec(tun_entropy,
-                                                     e->reformat_type);
-                       return err;
-               }
-       }
-
-       e->nhe = nhe;
-       spin_lock(&nhe->encap_list_lock);
-       list_add_rcu(&e->encap_list, &nhe->encap_list);
-       spin_unlock(&nhe->encap_list_lock);
-
-       mutex_unlock(&rpriv->neigh_update.encap_lock);
-
-       return 0;
-}
-
-void mlx5e_rep_encap_entry_detach(struct mlx5e_priv *priv,
-                                 struct mlx5e_encap_entry *e)
-{
-       struct mlx5e_rep_priv *rpriv = priv->ppriv;
-       struct mlx5_rep_uplink_priv *uplink_priv = &rpriv->uplink_priv;
-       struct mlx5_tun_entropy *tun_entropy = &uplink_priv->tun_entropy;
-
-       if (!e->nhe)
-               return;
-
-       spin_lock(&e->nhe->encap_list_lock);
-       list_del_rcu(&e->encap_list);
-       spin_unlock(&e->nhe->encap_list_lock);
-
-       mlx5e_rep_neigh_entry_release(e->nhe);
-       e->nhe = NULL;
-       mlx5_tun_entropy_refcount_dec(tun_entropy, e->reformat_type);
-}
-
 static int mlx5e_rep_open(struct net_device *dev)
 {
        struct mlx5e_priv *priv = netdev_priv(dev);
@@ -1225,129 +514,6 @@ static int mlx5e_rep_close(struct net_device *dev)
        return ret;
 }
 
-static int
-mlx5e_rep_setup_tc_cls_flower(struct mlx5e_priv *priv,
-                             struct flow_cls_offload *cls_flower, int flags)
-{
-       switch (cls_flower->command) {
-       case FLOW_CLS_REPLACE:
-               return mlx5e_configure_flower(priv->netdev, priv, cls_flower,
-                                             flags);
-       case FLOW_CLS_DESTROY:
-               return mlx5e_delete_flower(priv->netdev, priv, cls_flower,
-                                          flags);
-       case FLOW_CLS_STATS:
-               return mlx5e_stats_flower(priv->netdev, priv, cls_flower,
-                                         flags);
-       default:
-               return -EOPNOTSUPP;
-       }
-}
-
-static
-int mlx5e_rep_setup_tc_cls_matchall(struct mlx5e_priv *priv,
-                                   struct tc_cls_matchall_offload *ma)
-{
-       switch (ma->command) {
-       case TC_CLSMATCHALL_REPLACE:
-               return mlx5e_tc_configure_matchall(priv, ma);
-       case TC_CLSMATCHALL_DESTROY:
-               return mlx5e_tc_delete_matchall(priv, ma);
-       case TC_CLSMATCHALL_STATS:
-               mlx5e_tc_stats_matchall(priv, ma);
-               return 0;
-       default:
-               return -EOPNOTSUPP;
-       }
-}
-
-static int mlx5e_rep_setup_tc_cb(enum tc_setup_type type, void *type_data,
-                                void *cb_priv)
-{
-       unsigned long flags = MLX5_TC_FLAG(INGRESS) | MLX5_TC_FLAG(ESW_OFFLOAD);
-       struct mlx5e_priv *priv = cb_priv;
-
-       switch (type) {
-       case TC_SETUP_CLSFLOWER:
-               return mlx5e_rep_setup_tc_cls_flower(priv, type_data, flags);
-       case TC_SETUP_CLSMATCHALL:
-               return mlx5e_rep_setup_tc_cls_matchall(priv, type_data);
-       default:
-               return -EOPNOTSUPP;
-       }
-}
-
-static int mlx5e_rep_setup_ft_cb(enum tc_setup_type type, void *type_data,
-                                void *cb_priv)
-{
-       struct flow_cls_offload tmp, *f = type_data;
-       struct mlx5e_priv *priv = cb_priv;
-       struct mlx5_eswitch *esw;
-       unsigned long flags;
-       int err;
-
-       flags = MLX5_TC_FLAG(INGRESS) |
-               MLX5_TC_FLAG(ESW_OFFLOAD) |
-               MLX5_TC_FLAG(FT_OFFLOAD);
-       esw = priv->mdev->priv.eswitch;
-
-       switch (type) {
-       case TC_SETUP_CLSFLOWER:
-               memcpy(&tmp, f, sizeof(*f));
-
-               if (!mlx5_esw_chains_prios_supported(esw))
-                       return -EOPNOTSUPP;
-
-               /* Re-use tc offload path by moving the ft flow to the
-                * reserved ft chain.
-                *
-                * FT offload can use prio range [0, INT_MAX], so we normalize
-                * it to range [1, mlx5_esw_chains_get_prio_range(esw)]
-                * as with tc, where prio 0 isn't supported.
-                *
-                * We only support chain 0 of FT offload.
-                */
-               if (tmp.common.prio >= mlx5_esw_chains_get_prio_range(esw))
-                       return -EOPNOTSUPP;
-               if (tmp.common.chain_index != 0)
-                       return -EOPNOTSUPP;
-
-               tmp.common.chain_index = mlx5_esw_chains_get_ft_chain(esw);
-               tmp.common.prio++;
-               err = mlx5e_rep_setup_tc_cls_flower(priv, &tmp, flags);
-               memcpy(&f->stats, &tmp.stats, sizeof(f->stats));
-               return err;
-       default:
-               return -EOPNOTSUPP;
-       }
-}
-
-static LIST_HEAD(mlx5e_rep_block_tc_cb_list);
-static LIST_HEAD(mlx5e_rep_block_ft_cb_list);
-static int mlx5e_rep_setup_tc(struct net_device *dev, enum tc_setup_type type,
-                             void *type_data)
-{
-       struct mlx5e_priv *priv = netdev_priv(dev);
-       struct flow_block_offload *f = type_data;
-
-       f->unlocked_driver_cb = true;
-
-       switch (type) {
-       case TC_SETUP_BLOCK:
-               return flow_block_cb_setup_simple(type_data,
-                                                 &mlx5e_rep_block_tc_cb_list,
-                                                 mlx5e_rep_setup_tc_cb,
-                                                 priv, priv, true);
-       case TC_SETUP_FT:
-               return flow_block_cb_setup_simple(type_data,
-                                                 &mlx5e_rep_block_ft_cb_list,
-                                                 mlx5e_rep_setup_ft_cb,
-                                                 priv, priv, true);
-       default:
-               return -EOPNOTSUPP;
-       }
-}
-
 bool mlx5e_is_uplink_rep(struct mlx5e_priv *priv)
 {
        struct mlx5e_rep_priv *rpriv = priv->ppriv;
@@ -1791,31 +957,23 @@ static int mlx5e_init_uplink_rep_tx(struct mlx5e_rep_priv *rpriv)
        priv = netdev_priv(netdev);
        uplink_priv = &rpriv->uplink_priv;
 
-       mutex_init(&uplink_priv->unready_flows_lock);
-       INIT_LIST_HEAD(&uplink_priv->unready_flows);
-
-       /* init shared tc flow table */
-       err = mlx5e_tc_esw_init(&uplink_priv->tc_ht);
+       err = mlx5e_rep_tc_init(rpriv);
        if (err)
                return err;
 
        mlx5_init_port_tun_entropy(&uplink_priv->tun_entropy, priv->mdev);
 
-       /* init indirect block notifications */
-       INIT_LIST_HEAD(&uplink_priv->tc_indr_block_priv_list);
-       uplink_priv->netdevice_nb.notifier_call = mlx5e_nic_rep_netdevice_event;
-       err = register_netdevice_notifier_dev_net(rpriv->netdev,
-                                                 &uplink_priv->netdevice_nb,
-                                                 &uplink_priv->netdevice_nn);
+       err = mlx5e_rep_tc_netdevice_event_register(rpriv);
        if (err) {
-               mlx5_core_err(priv->mdev, "Failed to register netdev notifier\n");
-               goto tc_esw_cleanup;
+               mlx5_core_err(priv->mdev, "Failed to register netdev notifier, err: %d\n",
+                             err);
+               goto tc_rep_cleanup;
        }
 
        return 0;
 
-tc_esw_cleanup:
-       mlx5e_tc_esw_cleanup(&uplink_priv->tc_ht);
+tc_rep_cleanup:
+       mlx5e_rep_tc_cleanup(rpriv);
        return err;
 }
 
@@ -1845,17 +1003,10 @@ destroy_tises:
 
 static void mlx5e_cleanup_uplink_rep_tx(struct mlx5e_rep_priv *rpriv)
 {
-       struct mlx5_rep_uplink_priv *uplink_priv = &rpriv->uplink_priv;
-
-       /* clean indirect TC block notifications */
-       unregister_netdevice_notifier_dev_net(rpriv->netdev,
-                                             &uplink_priv->netdevice_nb,
-                                             &uplink_priv->netdevice_nn);
+       mlx5e_rep_tc_netdevice_event_unregister(rpriv);
        mlx5e_rep_indr_clean_block_privs(rpriv);
 
-       /* delete shared tc flow table */
-       mlx5e_tc_esw_cleanup(&rpriv->uplink_priv.tc_ht);
-       mutex_destroy(&rpriv->uplink_priv.unready_flows_lock);
+       mlx5e_rep_tc_cleanup(rpriv);
 }
 
 static void mlx5e_cleanup_rep_tx(struct mlx5e_priv *priv)
@@ -1897,13 +1048,8 @@ static int uplink_rep_async_event(struct notifier_block *nb, unsigned long event
                return NOTIFY_OK;
        }
 
-       if (event == MLX5_DEV_EVENT_PORT_AFFINITY) {
-               struct mlx5e_rep_priv *rpriv = priv->ppriv;
-
-               queue_work(priv->wq, &rpriv->uplink_priv.reoffload_flows_work);
-
-               return NOTIFY_OK;
-       }
+       if (event == MLX5_DEV_EVENT_PORT_AFFINITY)
+               return mlx5e_rep_tc_event_port_affinity(priv);
 
        return NOTIFY_DONE;
 }
@@ -1912,7 +1058,6 @@ static void mlx5e_uplink_rep_enable(struct mlx5e_priv *priv)
 {
        struct net_device *netdev = priv->netdev;
        struct mlx5_core_dev *mdev = priv->mdev;
-       struct mlx5e_rep_priv *rpriv = priv->ppriv;
        u16 max_mtu;
 
        netdev->min_mtu = ETH_MIN_MTU;
@@ -1920,8 +1065,7 @@ static void mlx5e_uplink_rep_enable(struct mlx5e_priv *priv)
        netdev->max_mtu = MLX5E_HW2SW_MTU(&priv->channels.params, max_mtu);
        mlx5e_set_dev_port_mtu(priv);
 
-       INIT_WORK(&rpriv->uplink_priv.reoffload_flows_work,
-                 mlx5e_tc_reoffload_flows_work);
+       mlx5e_rep_tc_enable(priv);
 
        mlx5_lag_add(mdev, netdev);
        priv->events_nb.notifier_call = uplink_rep_async_event;
@@ -1933,11 +1077,10 @@ static void mlx5e_uplink_rep_enable(struct mlx5e_priv *priv)
 static void mlx5e_uplink_rep_disable(struct mlx5e_priv *priv)
 {
        struct mlx5_core_dev *mdev = priv->mdev;
-       struct mlx5e_rep_priv *rpriv = priv->ppriv;
 
        mlx5e_dcbnl_delete_app(priv);
        mlx5_notifier_unregister(mdev, &priv->events_nb);
-       cancel_work_sync(&rpriv->uplink_priv.reoffload_flows_work);
+       mlx5e_rep_tc_disable(priv);
        mlx5_lag_remove(mdev);
 }
 
index 6a233790042002166088aaf73accf459ee5adaae..93e911baacad32d0e33be9ebebe0cb6638d80713 100644 (file)
@@ -158,6 +158,22 @@ struct mlx5e_neigh_hash_entry {
 enum {
        /* set when the encap entry is successfully offloaded into HW */
        MLX5_ENCAP_ENTRY_VALID     = BIT(0),
+       MLX5_REFORMAT_DECAP        = BIT(1),
+};
+
+struct mlx5e_decap_key {
+       struct ethhdr key;
+};
+
+struct mlx5e_decap_entry {
+       struct mlx5e_decap_key key;
+       struct list_head flows;
+       struct hlist_node hlist;
+       refcount_t refcnt;
+       struct completion res_ready;
+       int compl_result;
+       struct mlx5_pkt_reformat *pkt_reformat;
+       struct rcu_head rcu;
 };
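The new decap entry above is keyed on the rebuilt Ethernet header, so flows that would produce an identical header can share one entry and one packet-reformat object, with a refcount plus a completion and compl_result to publish the outcome of the first attach. A rough, user-space sketch of the key/entry shape (field names simplified; the hlist, completion and RCU pieces are omitted):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct eth_key {
	uint8_t  dst[6];
	uint8_t  src[6];
	uint16_t proto;
};

struct decap_entry {
	struct eth_key key;
	int refcnt;
	int compl_result;   /* 0 once the reformat object was created */
};

static bool key_equal(const struct eth_key *a, const struct eth_key *b)
{
	return !memcmp(a->dst, b->dst, 6) &&
	       !memcmp(a->src, b->src, 6) &&
	       a->proto == b->proto;
}

int main(void)
{
	struct eth_key k1 = { {1,2,3,4,5,6}, {7,8,9,10,11,12}, 0x0800 };
	struct eth_key k2 = k1;
	struct decap_entry e = { .key = k1, .refcnt = 1, .compl_result = 0 };

	if (key_equal(&e.key, &k2)) {
		e.refcnt++;   /* a second flow reuses the same entry */
		printf("shared entry, refcnt=%d\n", e.refcnt);
	}
	return 0;
}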
 
 struct mlx5e_encap_entry {
@@ -203,11 +219,6 @@ void mlx5e_handle_rx_cqe_rep(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe);
 void mlx5e_handle_rx_cqe_mpwrq_rep(struct mlx5e_rq *rq,
                                   struct mlx5_cqe64 *cqe);
 
-int mlx5e_rep_encap_entry_attach(struct mlx5e_priv *priv,
-                                struct mlx5e_encap_entry *e);
-void mlx5e_rep_encap_entry_detach(struct mlx5e_priv *priv,
-                                 struct mlx5e_encap_entry *e);
-
 void mlx5e_rep_queue_neigh_stats_work(struct mlx5e_priv *priv);
 
 bool mlx5e_eswitch_rep(struct net_device *netdev);
index a514685fb560d55db37b27965dccf8790357965d..6b3c82da199ce9ed85b6b42d6e45555c2529ddd9 100644 (file)
@@ -42,6 +42,7 @@
 #include "en_tc.h"
 #include "eswitch.h"
 #include "en_rep.h"
+#include "en/rep/tc.h"
 #include "ipoib/ipoib.h"
 #include "en_accel/ipsec_rxtx.h"
 #include "en_accel/tls_rxtx.h"
@@ -300,7 +301,7 @@ static inline void mlx5e_page_release(struct mlx5e_rq *rq,
                 * put into the Reuse Ring, because there is no way to return
                 * the page to the userspace when the interface goes down.
                 */
-               mlx5e_xsk_page_release(rq, dma_info);
+               xsk_buff_free(dma_info->xsk);
        else
                mlx5e_page_release_dynamic(rq, dma_info, recycle);
 }
@@ -385,7 +386,11 @@ static int mlx5e_alloc_rx_wqes(struct mlx5e_rq *rq, u16 ix, u8 wqe_bulk)
        if (rq->umem) {
                int pages_desired = wqe_bulk << rq->wqe.info.log_num_frags;
 
-               if (unlikely(!mlx5e_xsk_pages_enough_umem(rq, pages_desired)))
+               /* Check in advance that we have enough frames, instead of
+                * allocating one-by-one, failing and moving frames to the
+                * Reuse Ring.
+                */
+               if (unlikely(!xsk_buff_can_alloc(rq->umem, pages_desired)))
                        return -ENOMEM;
        }
 
@@ -480,8 +485,11 @@ static int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix)
        int err;
        int i;
 
+       /* Check in advance that we have enough frames, instead of allocating
+        * one-by-one, failing and moving frames to the Reuse Ring.
+        */
        if (rq->umem &&
-           unlikely(!mlx5e_xsk_pages_enough_umem(rq, MLX5_MPWRQ_PAGES_PER_WQE))) {
+           unlikely(!xsk_buff_can_alloc(rq->umem, MLX5_MPWRQ_PAGES_PER_WQE))) {
                err = -ENOMEM;
                goto err;
        }
@@ -1044,12 +1052,24 @@ struct sk_buff *mlx5e_build_linear_skb(struct mlx5e_rq *rq, void *va,
        return skb;
 }
 
+static void mlx5e_fill_xdp_buff(struct mlx5e_rq *rq, void *va, u16 headroom,
+                               u32 len, struct xdp_buff *xdp)
+{
+       xdp->data_hard_start = va;
+       xdp_set_data_meta_invalid(xdp);
+       xdp->data = va + headroom;
+       xdp->data_end = xdp->data + len;
+       xdp->rxq = &rq->xdp_rxq;
+       xdp->frame_sz = rq->buff.frame0_sz;
+}
+
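mlx5e_fill_xdp_buff above seeds the xdp_buff from the receive buffer, and after the XDP program runs the callers recompute rx_headroom from xdp.data - xdp.data_hard_start, since the program may have moved the data pointer. A user-space model of that bookkeeping (the adjust-head helper below is a stand-in for the real BPF helper):

#include <stdint.h>
#include <stdio.h>

struct xdp_model {
	uint8_t *data_hard_start;
	uint8_t *data;
	uint8_t *data_end;
	uint32_t frame_sz;
};

static void fill_xdp(struct xdp_model *x, uint8_t *va, uint32_t headroom,
		     uint32_t len, uint32_t frame_sz)
{
	x->data_hard_start = va;
	x->data            = va + headroom;
	x->data_end        = x->data + len;
	x->frame_sz        = frame_sz;
}

/* Stand-in for an XDP program that pops 'delta' bytes off the front. */
static void xdp_adjust_head(struct xdp_model *x, int delta)
{
	x->data += delta;
}

int main(void)
{
	uint8_t frame[2048];
	struct xdp_model x;

	fill_xdp(&x, frame, 256, 1500, sizeof(frame));
	xdp_adjust_head(&x, 14);   /* e.g. strip an outer Ethernet header */

	/* The SKB path would now use these recomputed values. */
	printf("headroom=%td len=%td\n",
	       x.data - x.data_hard_start, x.data_end - x.data);
	return 0;
}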
 struct sk_buff *
 mlx5e_skb_from_cqe_linear(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe,
                          struct mlx5e_wqe_frag_info *wi, u32 cqe_bcnt)
 {
        struct mlx5e_dma_info *di = wi->di;
        u16 rx_headroom = rq->buff.headroom;
+       struct xdp_buff xdp;
        struct sk_buff *skb;
        void *va, *data;
        bool consumed;
@@ -1065,11 +1085,13 @@ mlx5e_skb_from_cqe_linear(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe,
        prefetch(data);
 
        rcu_read_lock();
-       consumed = mlx5e_xdp_handle(rq, di, va, &rx_headroom, &cqe_bcnt, false);
+       mlx5e_fill_xdp_buff(rq, va, rx_headroom, cqe_bcnt, &xdp);
+       consumed = mlx5e_xdp_handle(rq, di, &cqe_bcnt, &xdp);
        rcu_read_unlock();
        if (consumed)
                return NULL; /* page/packet was consumed by XDP */
 
+       rx_headroom = xdp.data - xdp.data_hard_start;
        frag_size = MLX5_SKB_FRAG_SZ(rx_headroom + cqe_bcnt);
        skb = mlx5e_build_linear_skb(rq, va, frag_size, rx_headroom, cqe_bcnt);
        if (unlikely(!skb))
@@ -1216,12 +1238,12 @@ void mlx5e_handle_rx_cqe_rep(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
        if (rep->vlan && skb_vlan_tag_present(skb))
                skb_vlan_pop(skb);
 
-       if (!mlx5e_tc_rep_update_skb(cqe, skb, &tc_priv))
+       if (!mlx5e_rep_tc_update_skb(cqe, skb, &tc_priv))
                goto free_wqe;
 
        napi_gro_receive(rq->cq.napi, skb);
 
-       mlx5_tc_rep_post_napi_receive(&tc_priv);
+       mlx5_rep_tc_post_napi_receive(&tc_priv);
 
 free_wqe:
        mlx5e_free_rx_wqe(rq, wi, true);
@@ -1272,12 +1294,12 @@ void mlx5e_handle_rx_cqe_mpwrq_rep(struct mlx5e_rq *rq,
 
        mlx5e_complete_rx_cqe(rq, cqe, cqe_bcnt, skb);
 
-       if (!mlx5e_tc_rep_update_skb(cqe, skb, &tc_priv))
+       if (!mlx5e_rep_tc_update_skb(cqe, skb, &tc_priv))
                goto mpwrq_cqe_out;
 
        napi_gro_receive(rq->cq.napi, skb);
 
-       mlx5_tc_rep_post_napi_receive(&tc_priv);
+       mlx5_rep_tc_post_napi_receive(&tc_priv);
 
 mpwrq_cqe_out:
        if (likely(wi->consumed_strides < rq->mpwqe.num_strides))
@@ -1343,6 +1365,7 @@ mlx5e_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi,
        struct mlx5e_dma_info *di = &wi->umr.dma_info[page_idx];
        u16 rx_headroom = rq->buff.headroom;
        u32 cqe_bcnt32 = cqe_bcnt;
+       struct xdp_buff xdp;
        struct sk_buff *skb;
        void *va, *data;
        u32 frag_size;
@@ -1364,7 +1387,8 @@ mlx5e_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi,
        prefetch(data);
 
        rcu_read_lock();
-       consumed = mlx5e_xdp_handle(rq, di, va, &rx_headroom, &cqe_bcnt32, false);
+       mlx5e_fill_xdp_buff(rq, va, rx_headroom, cqe_bcnt32, &xdp);
+       consumed = mlx5e_xdp_handle(rq, di, &cqe_bcnt32, &xdp);
        rcu_read_unlock();
        if (consumed) {
                if (__test_and_clear_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags))
@@ -1372,6 +1396,7 @@ mlx5e_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi,
                return NULL; /* page/packet was consumed by XDP */
        }
 
+       rx_headroom = xdp.data - xdp.data_hard_start;
        frag_size = MLX5_SKB_FRAG_SZ(rx_headroom + cqe_bcnt32);
        skb = mlx5e_build_linear_skb(rq, va, frag_size, rx_headroom, cqe_bcnt32);
        if (unlikely(!skb))
index a050808f2128dfbaca16a5af05a5e0d7e01588ea..cc669ea450aeb66fcd04de2d677379e404401cb3 100644 (file)
@@ -31,6 +31,7 @@
  */
 
 #include <net/flow_dissector.h>
+#include <net/flow_offload.h>
 #include <net/sch_generic.h>
 #include <net/pkt_cls.h>
 #include <net/tc_act/tc_gact.h>
 #include <net/tc_act/tc_tunnel_key.h>
 #include <net/tc_act/tc_pedit.h>
 #include <net/tc_act/tc_csum.h>
+#include <net/tc_act/tc_mpls.h>
 #include <net/arp.h>
 #include <net/ipv6_stubs.h>
+#include <net/bareudp.h>
 #include "en.h"
 #include "en_rep.h"
+#include "en/rep/tc.h"
+#include "en/rep/neigh.h"
 #include "en_tc.h"
 #include "eswitch.h"
 #include "esw/chains.h"
@@ -89,6 +94,7 @@ enum {
        MLX5E_TC_FLOW_FLAG_NOT_READY    = MLX5E_TC_FLOW_BASE + 5,
        MLX5E_TC_FLOW_FLAG_DELETED      = MLX5E_TC_FLOW_BASE + 6,
        MLX5E_TC_FLOW_FLAG_CT           = MLX5E_TC_FLOW_BASE + 7,
+       MLX5E_TC_FLOW_FLAG_L3_TO_L2_DECAP = MLX5E_TC_FLOW_BASE + 8,
 };
 
 #define MLX5E_TC_MAX_SPLITS 1
@@ -122,6 +128,11 @@ struct mlx5e_tc_flow {
        u64                     cookie;
        unsigned long           flags;
        struct mlx5_flow_handle *rule[MLX5E_TC_MAX_SPLITS + 1];
+
+       /* flows sharing the same reformat object - currently mpls decap */
+       struct list_head l3_to_l2_reformat;
+       struct mlx5e_decap_entry *decap_reformat;
+
        /* Flow can be associated with multiple encap IDs.
         * The number of encaps is bounded by the number of supported
         * destinations.
@@ -153,40 +164,12 @@ struct mlx5e_tc_flow_parse_attr {
        struct mlx5_flow_spec spec;
        struct mlx5e_tc_mod_hdr_acts mod_hdr_acts;
        int mirred_ifindex[MLX5_MAX_FLOW_FWD_VPORTS];
+       struct ethhdr eth;
 };
 
 #define MLX5E_TC_TABLE_NUM_GROUPS 4
 #define MLX5E_TC_TABLE_MAX_GROUP_SIZE BIT(16)
 
-struct tunnel_match_key {
-       struct flow_dissector_key_control enc_control;
-       struct flow_dissector_key_keyid enc_key_id;
-       struct flow_dissector_key_ports enc_tp;
-       struct flow_dissector_key_ip enc_ip;
-       union {
-               struct flow_dissector_key_ipv4_addrs enc_ipv4;
-               struct flow_dissector_key_ipv6_addrs enc_ipv6;
-       };
-
-       int filter_ifindex;
-};
-
-struct tunnel_match_enc_opts {
-       struct flow_dissector_key_enc_opts key;
-       struct flow_dissector_key_enc_opts mask;
-};
-
-/* Tunnel_id mapping is TUNNEL_INFO_BITS + ENC_OPTS_BITS.
- * Upper TUNNEL_INFO_BITS for general tunnel info.
- * Lower ENC_OPTS_BITS bits for enc_opts.
- */
-#define TUNNEL_INFO_BITS 6
-#define TUNNEL_INFO_BITS_MASK GENMASK(TUNNEL_INFO_BITS - 1, 0)
-#define ENC_OPTS_BITS 2
-#define ENC_OPTS_BITS_MASK GENMASK(ENC_OPTS_BITS - 1, 0)
-#define TUNNEL_ID_BITS (TUNNEL_INFO_BITS + ENC_OPTS_BITS)
-#define TUNNEL_ID_MASK GENMASK(TUNNEL_ID_BITS - 1, 0)
-
 struct mlx5e_tc_attr_to_reg_mapping mlx5e_tc_attr_to_reg_mappings[] = {
        [CHAIN_TO_REG] = {
                .mfield = MLX5_ACTION_IN_FIELD_METADATA_REG_C_0,
@@ -1149,6 +1132,11 @@ static int mlx5e_attach_encap(struct mlx5e_priv *priv,
                              struct netlink_ext_ack *extack,
                              struct net_device **encap_dev,
                              bool *encap_valid);
+static int mlx5e_attach_decap(struct mlx5e_priv *priv,
+                             struct mlx5e_tc_flow *flow,
+                             struct netlink_ext_ack *extack);
+static void mlx5e_detach_decap(struct mlx5e_priv *priv,
+                              struct mlx5e_tc_flow *flow);
 
 static struct mlx5_flow_handle *
 mlx5e_tc_offload_fdb_rules(struct mlx5_eswitch *esw,
@@ -1324,6 +1312,12 @@ mlx5e_tc_add_fdb_flow(struct mlx5e_priv *priv,
                return -EOPNOTSUPP;
        }
 
+       if (flow_flag_test(flow, L3_TO_L2_DECAP)) {
+               err = mlx5e_attach_decap(priv, flow, extack);
+               if (err)
+                       return err;
+       }
+
        for (out_index = 0; out_index < MLX5_MAX_FLOW_FWD_VPORTS; out_index++) {
                int mirred_ifindex;
 
@@ -1433,6 +1427,9 @@ static void mlx5e_tc_del_fdb_flow(struct mlx5e_priv *priv,
 
        if (attr->action & MLX5_FLOW_CONTEXT_ACTION_COUNT)
                mlx5_fc_destroy(attr->counter_dev, attr->counter);
+
+       if (flow_flag_test(flow, L3_TO_L2_DECAP))
+               mlx5e_detach_decap(priv, flow);
 }
 
 void mlx5e_tc_encap_flows_add(struct mlx5e_priv *priv,
@@ -1709,6 +1706,17 @@ static void mlx5e_encap_dealloc(struct mlx5e_priv *priv, struct mlx5e_encap_entr
        kfree_rcu(e, rcu);
 }
 
+static void mlx5e_decap_dealloc(struct mlx5e_priv *priv,
+                               struct mlx5e_decap_entry *d)
+{
+       WARN_ON(!list_empty(&d->flows));
+
+       if (!d->compl_result)
+               mlx5_packet_reformat_dealloc(priv->mdev, d->pkt_reformat);
+
+       kfree_rcu(d, rcu);
+}
+
 void mlx5e_encap_put(struct mlx5e_priv *priv, struct mlx5e_encap_entry *e)
 {
        struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
@@ -1721,6 +1729,18 @@ void mlx5e_encap_put(struct mlx5e_priv *priv, struct mlx5e_encap_entry *e)
        mlx5e_encap_dealloc(priv, e);
 }
 
+static void mlx5e_decap_put(struct mlx5e_priv *priv, struct mlx5e_decap_entry *d)
+{
+       struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
+
+       if (!refcount_dec_and_mutex_lock(&d->refcnt, &esw->offloads.decap_tbl_lock))
+               return;
+       hash_del_rcu(&d->hlist);
+       mutex_unlock(&esw->offloads.decap_tbl_lock);
+
+       mlx5e_decap_dealloc(priv, d);
+}
+
 static void mlx5e_detach_encap(struct mlx5e_priv *priv,
                               struct mlx5e_tc_flow *flow, int out_index)
 {
@@ -1744,6 +1764,29 @@ static void mlx5e_detach_encap(struct mlx5e_priv *priv,
        mlx5e_encap_dealloc(priv, e);
 }
 
+static void mlx5e_detach_decap(struct mlx5e_priv *priv,
+                              struct mlx5e_tc_flow *flow)
+{
+       struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
+       struct mlx5e_decap_entry *d = flow->decap_reformat;
+
+       if (!d)
+               return;
+
+       mutex_lock(&esw->offloads.decap_tbl_lock);
+       list_del(&flow->l3_to_l2_reformat);
+       flow->decap_reformat = NULL;
+
+       if (!refcount_dec_and_test(&d->refcnt)) {
+               mutex_unlock(&esw->offloads.decap_tbl_lock);
+               return;
+       }
+       hash_del_rcu(&d->hlist);
+       mutex_unlock(&esw->offloads.decap_tbl_lock);
+
+       mlx5e_decap_dealloc(priv, d);
+}
+
 static void __mlx5e_tc_del_fdb_peer_flow(struct mlx5e_tc_flow *flow)
 {
        struct mlx5_eswitch *esw = flow->priv->mdev->priv.eswitch;
@@ -2015,7 +2058,11 @@ static int parse_tunnel_attr(struct mlx5e_priv *priv,
                        return err;
                }
 
-               flow->esw_attr->action |= MLX5_FLOW_CONTEXT_ACTION_DECAP;
+               /* With MPLS over UDP we decapsulate using a packet reformat
+                * object.
+                */
+               if (!netif_is_bareudp(filter_dev))
+                       flow->esw_attr->action |= MLX5_FLOW_CONTEXT_ACTION_DECAP;
        }
 
        if (!needs_mapping && !sets_mapping)
@@ -2098,6 +2145,20 @@ static int mlx5e_flower_parse_meta(struct net_device *filter_dev,
        return 0;
 }
 
+static bool skip_key_basic(struct net_device *filter_dev,
+                          struct flow_cls_offload *f)
+{
+       /* When doing mpls over udp decap, the user needs to provide
+        * MPLS_UC as the protocol in order to be able to match on mpls
+        * label fields.  However, the actual ethertype is IP so we want to
+        * avoid matching on this, otherwise we'll fail the match.
+        */
+       if (netif_is_bareudp(filter_dev) && f->common.chain_index == 0)
+               return true;
+
+       return false;
+}
+
 static int __parse_cls_flower(struct mlx5e_priv *priv,
                              struct mlx5e_tc_flow *flow,
                              struct mlx5_flow_spec *spec,
@@ -2142,7 +2203,8 @@ static int __parse_cls_flower(struct mlx5e_priv *priv,
              BIT(FLOW_DISSECTOR_KEY_IP)  |
              BIT(FLOW_DISSECTOR_KEY_CT) |
              BIT(FLOW_DISSECTOR_KEY_ENC_IP) |
-             BIT(FLOW_DISSECTOR_KEY_ENC_OPTS))) {
+             BIT(FLOW_DISSECTOR_KEY_ENC_OPTS) |
+             BIT(FLOW_DISSECTOR_KEY_MPLS))) {
                NL_SET_ERR_MSG_MOD(extack, "Unsupported key");
                netdev_warn(priv->netdev, "Unsupported key used: 0x%x\n",
                            dissector->used_keys);
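
The check above is a plain bitmask test: dissector->used_keys carries one bit per match key present in the filter, and any bit outside the driver's supported set rejects the rule; the hunk simply adds FLOW_DISSECTOR_KEY_MPLS to that set. A toy version of the test, with made-up bit positions:

#include <stdio.h>

int main(void)
{
	unsigned int supported = (1u << 0) | (1u << 1) | (1u << 7);   /* keys the driver handles */
	unsigned int used_keys = (1u << 1) | (1u << 9);               /* keys used by the filter */

	if (used_keys & ~supported)
		printf("Unsupported key used: 0x%x\n", used_keys);
	return 0;
}
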
@@ -2172,7 +2234,8 @@ static int __parse_cls_flower(struct mlx5e_priv *priv,
        if (err)
                return err;
 
-       if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_BASIC)) {
+       if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_BASIC) &&
+           !skip_key_basic(filter_dev, f)) {
                struct flow_match_basic match;
 
                flow_rule_match_basic(rule, &match);
@@ -2837,10 +2900,12 @@ void dealloc_mod_hdr_actions(struct mlx5e_tc_mod_hdr_acts *mod_hdr_acts)
 
 static const struct pedit_headers zero_masks = {};
 
-static int parse_tc_pedit_action(struct mlx5e_priv *priv,
-                                const struct flow_action_entry *act, int namespace,
-                                struct pedit_headers_action *hdrs,
-                                struct netlink_ext_ack *extack)
+static int
+parse_pedit_to_modify_hdr(struct mlx5e_priv *priv,
+                         const struct flow_action_entry *act, int namespace,
+                         struct mlx5e_tc_flow_parse_attr *parse_attr,
+                         struct pedit_headers_action *hdrs,
+                         struct netlink_ext_ack *extack)
 {
        u8 cmd = (act->id == FLOW_ACTION_MANGLE) ? 0 : 1;
        int err = -EOPNOTSUPP;
@@ -2876,6 +2941,46 @@ out_err:
        return err;
 }
 
+static int
+parse_pedit_to_reformat(struct mlx5e_priv *priv,
+                       const struct flow_action_entry *act,
+                       struct mlx5e_tc_flow_parse_attr *parse_attr,
+                       struct netlink_ext_ack *extack)
+{
+       u32 mask, val, offset;
+       u32 *p;
+
+       if (act->id != FLOW_ACTION_MANGLE)
+               return -EOPNOTSUPP;
+
+       if (act->mangle.htype != FLOW_ACT_MANGLE_HDR_TYPE_ETH) {
+               NL_SET_ERR_MSG_MOD(extack, "Only Ethernet modification is supported");
+               return -EOPNOTSUPP;
+       }
+
+       mask = ~act->mangle.mask;
+       val = act->mangle.val;
+       offset = act->mangle.offset;
+       p = (u32 *)&parse_attr->eth;
+       *(p + (offset >> 2)) |= (val & mask);
+
+       return 0;
+}
+
+static int parse_tc_pedit_action(struct mlx5e_priv *priv,
+                                const struct flow_action_entry *act, int namespace,
+                                struct mlx5e_tc_flow_parse_attr *parse_attr,
+                                struct pedit_headers_action *hdrs,
+                                struct mlx5e_tc_flow *flow,
+                                struct netlink_ext_ack *extack)
+{
+       if (flow && flow_flag_test(flow, L3_TO_L2_DECAP))
+               return parse_pedit_to_reformat(priv, act, parse_attr, extack);
+
+       return parse_pedit_to_modify_hdr(priv, act, namespace,
+                                        parse_attr, hdrs, extack);
+}
+
 static int alloc_tc_pedit_action(struct mlx5e_priv *priv, int namespace,
                                 struct mlx5e_tc_flow_parse_attr *parse_attr,
                                 struct pedit_headers_action *hdrs,
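
For L3-to-L2 decap flows, parse_pedit_to_reformat() above does not build a modify-header action; it accumulates the 32-bit pedit values straight into parse_attr->eth, which later becomes the data of the L3_TUNNEL_TO_L2 packet reformat. The indexing is plain word arithmetic. A standalone sketch of the same store (the value, mask and offset are made up, and the byte ordering of real pedit keys is ignored here):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t eth_words[4] = { 0 };     /* 14-byte Ethernet header, padded to four words */
	uint32_t offset = 12;              /* byte offset from the pedit key */
	uint32_t val = 0x0000abcd;         /* hypothetical mangle value */
	uint32_t inv_mask = 0xffff0000;    /* pedit carries an inverted mask */

	/* same as *(p + (offset >> 2)) |= (val & mask) in the patch */
	eth_words[offset >> 2] |= (val & ~inv_mask);

	printf("word %u = 0x%08x\n", offset >> 2, eth_words[offset >> 2]);  /* word 3 = 0x0000abcd */
	return 0;
}
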
@@ -3134,7 +3239,7 @@ static int add_vlan_rewrite_action(struct mlx5e_priv *priv, int namespace,
                return -EOPNOTSUPP;
        }
 
-       err = parse_tc_pedit_action(priv, &pedit_act, namespace, hdrs, NULL);
+       err = parse_tc_pedit_action(priv, &pedit_act, namespace, parse_attr, hdrs, NULL, extack);
        *action |= MLX5_FLOW_CONTEXT_ACTION_MOD_HDR;
 
        return err;
@@ -3200,7 +3305,7 @@ static int parse_tc_nic_actions(struct mlx5e_priv *priv,
                case FLOW_ACTION_MANGLE:
                case FLOW_ACTION_ADD:
                        err = parse_tc_pedit_action(priv, act, MLX5_FLOW_NAMESPACE_KERNEL,
-                                                   hdrs, extack);
+                                                   parse_attr, hdrs, NULL, extack);
                        if (err)
                                return err;
 
@@ -3294,12 +3399,22 @@ static inline int cmp_encap_info(struct encap_key *a,
               a->tc_tunnel->tunnel_type != b->tc_tunnel->tunnel_type;
 }
 
+static inline int cmp_decap_info(struct mlx5e_decap_key *a,
+                                struct mlx5e_decap_key *b)
+{
+       return memcmp(&a->key, &b->key, sizeof(b->key));
+}
+
 static inline int hash_encap_info(struct encap_key *key)
 {
        return jhash(key->ip_tun_key, sizeof(*key->ip_tun_key),
                     key->tc_tunnel->tunnel_type);
 }
 
+static inline int hash_decap_info(struct mlx5e_decap_key *key)
+{
+       return jhash(&key->key, sizeof(key->key), 0);
+}
 
 static bool is_merged_eswitch_dev(struct mlx5e_priv *priv,
                                  struct net_device *peer_netdev)
@@ -3314,13 +3429,16 @@ static bool is_merged_eswitch_dev(struct mlx5e_priv *priv,
                same_hw_devs(priv, peer_priv));
 }
 
-
-
 bool mlx5e_encap_take(struct mlx5e_encap_entry *e)
 {
        return refcount_inc_not_zero(&e->refcnt);
 }
 
+static bool mlx5e_decap_take(struct mlx5e_decap_entry *e)
+{
+       return refcount_inc_not_zero(&e->refcnt);
+}
+
 static struct mlx5e_encap_entry *
 mlx5e_encap_get(struct mlx5e_priv *priv, struct encap_key *key,
                uintptr_t hash_key)
@@ -3341,6 +3459,24 @@ mlx5e_encap_get(struct mlx5e_priv *priv, struct encap_key *key,
        return NULL;
 }
 
+static struct mlx5e_decap_entry *
+mlx5e_decap_get(struct mlx5e_priv *priv, struct mlx5e_decap_key *key,
+               uintptr_t hash_key)
+{
+       struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
+       struct mlx5e_decap_key r_key;
+       struct mlx5e_decap_entry *e;
+
+       hash_for_each_possible_rcu(esw->offloads.decap_tbl, e,
+                                  hlist, hash_key) {
+               r_key = e->key;
+               if (!cmp_decap_info(&r_key, key) &&
+                   mlx5e_decap_take(e))
+                       return e;
+       }
+       return NULL;
+}
+
 static struct ip_tunnel_info *dup_tun_info(const struct ip_tunnel_info *tun_info)
 {
        size_t tun_size = sizeof(*tun_info) + tun_info->options_len;
@@ -3486,6 +3622,84 @@ out_err_init:
        return err;
 }
 
+static int mlx5e_attach_decap(struct mlx5e_priv *priv,
+                             struct mlx5e_tc_flow *flow,
+                             struct netlink_ext_ack *extack)
+{
+       struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
+       struct mlx5_esw_flow_attr *attr = flow->esw_attr;
+       struct mlx5e_tc_flow_parse_attr *parse_attr;
+       struct mlx5e_decap_entry *d;
+       struct mlx5e_decap_key key;
+       uintptr_t hash_key;
+       int err;
+
+       parse_attr = attr->parse_attr;
+       if (sizeof(parse_attr->eth) > MLX5_CAP_ESW(priv->mdev, max_encap_header_size)) {
+               NL_SET_ERR_MSG_MOD(extack,
+                                  "encap header larger than max supported");
+               return -EOPNOTSUPP;
+       }
+
+       key.key = parse_attr->eth;
+       hash_key = hash_decap_info(&key);
+       mutex_lock(&esw->offloads.decap_tbl_lock);
+       d = mlx5e_decap_get(priv, &key, hash_key);
+       if (d) {
+               mutex_unlock(&esw->offloads.decap_tbl_lock);
+               wait_for_completion(&d->res_ready);
+               mutex_lock(&esw->offloads.decap_tbl_lock);
+               if (d->compl_result) {
+                       err = -EREMOTEIO;
+                       goto out_free;
+               }
+               goto found;
+       }
+
+       d = kzalloc(sizeof(*d), GFP_KERNEL);
+       if (!d) {
+               err = -ENOMEM;
+               goto out_err;
+       }
+
+       d->key = key;
+       refcount_set(&d->refcnt, 1);
+       init_completion(&d->res_ready);
+       INIT_LIST_HEAD(&d->flows);
+       hash_add_rcu(esw->offloads.decap_tbl, &d->hlist, hash_key);
+       mutex_unlock(&esw->offloads.decap_tbl_lock);
+
+       d->pkt_reformat = mlx5_packet_reformat_alloc(priv->mdev,
+                                                    MLX5_REFORMAT_TYPE_L3_TUNNEL_TO_L2,
+                                                    sizeof(parse_attr->eth),
+                                                    &parse_attr->eth,
+                                                    MLX5_FLOW_NAMESPACE_FDB);
+       if (IS_ERR(d->pkt_reformat)) {
+               err = PTR_ERR(d->pkt_reformat);
+               d->compl_result = err;
+       }
+       mutex_lock(&esw->offloads.decap_tbl_lock);
+       complete_all(&d->res_ready);
+       if (err)
+               goto out_free;
+
+found:
+       flow->decap_reformat = d;
+       attr->decap_pkt_reformat = d->pkt_reformat;
+       list_add(&flow->l3_to_l2_reformat, &d->flows);
+       mutex_unlock(&esw->offloads.decap_tbl_lock);
+       return 0;
+
+out_free:
+       mutex_unlock(&esw->offloads.decap_tbl_lock);
+       mlx5e_decap_put(priv, d);
+       return err;
+
+out_err:
+       mutex_unlock(&esw->offloads.decap_tbl_lock);
+       return err;
+}
+
 static int parse_tc_vlan_action(struct mlx5e_priv *priv,
                                const struct flow_action_entry *act,
                                struct mlx5_esw_flow_attr *attr,
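
mlx5e_attach_decap() above lets every flow that rebuilds the same Ethernet header share a single packet-reformat object: the key is looked up in decap_tbl under the mutex, a reference is taken when an entry already exists, and otherwise a new entry is inserted and the MLX5_REFORMAT_TYPE_L3_TUNNEL_TO_L2 reformat is allocated outside the lock, with the res_ready completion covering concurrent lookups. A single-threaded userspace sketch of that lookup-or-create shape; the locking, RCU and completion are elided, and every name below is invented for the example:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct decap_key_sketch { unsigned char eth[14]; };        /* rebuilt Ethernet header */

struct decap_entry_sketch {
	struct decap_key_sketch key;
	int refcnt;
	struct decap_entry_sketch *next;                    /* stand-in for the hash bucket */
};

static struct decap_entry_sketch *decap_tbl;

static struct decap_entry_sketch *decap_get_or_create(const struct decap_key_sketch *key)
{
	struct decap_entry_sketch *e;

	for (e = decap_tbl; e; e = e->next) {
		if (!memcmp(&e->key, key, sizeof(*key))) {
			e->refcnt++;                        /* reuse the shared reformat */
			return e;
		}
	}

	e = calloc(1, sizeof(*e));
	if (!e)
		return NULL;
	e->key = *key;
	e->refcnt = 1;
	e->next = decap_tbl;
	decap_tbl = e;
	/* the real code allocates the packet reformat object at this point */
	return e;
}

int main(void)
{
	struct decap_key_sketch k = { .eth = { 0x02, 0x11, 0x22 } };

	decap_get_or_create(&k);
	decap_get_or_create(&k);                            /* a second flow reuses the entry */
	printf("refcnt = %d\n", decap_tbl->refcnt);         /* 2 */
	return 0;
}
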
@@ -3697,7 +3911,8 @@ static int verify_uplink_forwarding(struct mlx5e_priv *priv,
 static int parse_tc_fdb_actions(struct mlx5e_priv *priv,
                                struct flow_action *flow_action,
                                struct mlx5e_tc_flow *flow,
-                               struct netlink_ext_ack *extack)
+                               struct netlink_ext_ack *extack,
+                               struct net_device *filter_dev)
 {
        struct pedit_headers_action hdrs[2] = {};
        struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
@@ -3711,6 +3926,7 @@ static int parse_tc_fdb_actions(struct mlx5e_priv *priv,
        bool encap = false, decap = false;
        u32 action = attr->action;
        int err, i, if_count = 0;
+       bool mpls_push = false;
 
        if (!flow_action_has_entries(flow_action))
                return -EINVAL;
@@ -3725,15 +3941,48 @@ static int parse_tc_fdb_actions(struct mlx5e_priv *priv,
                        action |= MLX5_FLOW_CONTEXT_ACTION_DROP |
                                  MLX5_FLOW_CONTEXT_ACTION_COUNT;
                        break;
+               case FLOW_ACTION_MPLS_PUSH:
+                       if (!MLX5_CAP_ESW_FLOWTABLE_FDB(priv->mdev,
+                                                       reformat_l2_to_l3_tunnel) ||
+                           act->mpls_push.proto != htons(ETH_P_MPLS_UC)) {
+                               NL_SET_ERR_MSG_MOD(extack,
+                                                  "mpls push is supported only for mpls_uc protocol");
+                               return -EOPNOTSUPP;
+                       }
+                       mpls_push = true;
+                       break;
+               case FLOW_ACTION_MPLS_POP:
+                       /* we only support mpls pop if it is the first action
+                        * and the filter net device is bareudp. Subsequent
+                        * actions can be pedit and the last can be mirred
+                        * egress redirect.
+                        */
+                       if (i) {
+                               NL_SET_ERR_MSG_MOD(extack,
+                                                  "mpls pop supported only as first action");
+                               return -EOPNOTSUPP;
+                       }
+                       if (!netif_is_bareudp(filter_dev)) {
+                               NL_SET_ERR_MSG_MOD(extack,
+                                                  "mpls pop supported only on bareudp devices");
+                               return -EOPNOTSUPP;
+                       }
+
+                       parse_attr->eth.h_proto = act->mpls_pop.proto;
+                       action |= MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT;
+                       flow_flag_set(flow, L3_TO_L2_DECAP);
+                       break;
                case FLOW_ACTION_MANGLE:
                case FLOW_ACTION_ADD:
                        err = parse_tc_pedit_action(priv, act, MLX5_FLOW_NAMESPACE_FDB,
-                                                   hdrs, extack);
+                                                   parse_attr, hdrs, flow, extack);
                        if (err)
                                return err;
 
-                       action |= MLX5_FLOW_CONTEXT_ACTION_MOD_HDR;
-                       attr->split_count = attr->out_count;
+                       if (!flow_flag_test(flow, L3_TO_L2_DECAP)) {
+                               action |= MLX5_FLOW_CONTEXT_ACTION_MOD_HDR;
+                               attr->split_count = attr->out_count;
+                       }
                        break;
                case FLOW_ACTION_CSUM:
                        if (csum_offload_supported(priv, action,
@@ -3755,6 +4004,12 @@ static int parse_tc_fdb_actions(struct mlx5e_priv *priv,
                                return -EINVAL;
                        }
 
+                       if (mpls_push && !netif_is_bareudp(out_dev)) {
+                               NL_SET_ERR_MSG_MOD(extack,
+                                                  "mpls is supported only through a bareudp device");
+                               return -EOPNOTSUPP;
+                       }
+
                        if (ft_flow && out_dev == priv->netdev) {
                                /* Ignore forward to self rules generated
                                 * by adding both mlx5 devs to the flow table
@@ -4085,6 +4340,7 @@ mlx5e_alloc_flow(struct mlx5e_priv *priv, int attr_size,
                INIT_LIST_HEAD(&flow->encaps[out_index].list);
        INIT_LIST_HEAD(&flow->mod_hdr);
        INIT_LIST_HEAD(&flow->hairpin);
+       INIT_LIST_HEAD(&flow->l3_to_l2_reformat);
        refcount_set(&flow->refcnt, 1);
        init_completion(&flow->init_done);
 
@@ -4154,7 +4410,7 @@ __mlx5e_add_fdb_flow(struct mlx5e_priv *priv,
        if (err)
                goto err_free;
 
-       err = parse_tc_fdb_actions(priv, &rule->action, flow, extack);
+       err = parse_tc_fdb_actions(priv, &rule->action, flow, extack, filter_dev);
        if (err)
                goto err_free;
 
@@ -4806,148 +5062,35 @@ void mlx5e_tc_reoffload_flows_work(struct work_struct *work)
        mutex_unlock(&rpriv->unready_flows_lock);
 }
 
-#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
-static bool mlx5e_restore_tunnel(struct mlx5e_priv *priv, struct sk_buff *skb,
-                                struct mlx5e_tc_update_priv *tc_priv,
-                                u32 tunnel_id)
-{
-       struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
-       struct tunnel_match_enc_opts enc_opts = {};
-       struct mlx5_rep_uplink_priv *uplink_priv;
-       struct mlx5e_rep_priv *uplink_rpriv;
-       struct metadata_dst *tun_dst;
-       struct tunnel_match_key key;
-       u32 tun_id, enc_opts_id;
-       struct net_device *dev;
-       int err;
-
-       enc_opts_id = tunnel_id & ENC_OPTS_BITS_MASK;
-       tun_id = tunnel_id >> ENC_OPTS_BITS;
-
-       if (!tun_id)
-               return true;
-
-       uplink_rpriv = mlx5_eswitch_get_uplink_priv(esw, REP_ETH);
-       uplink_priv = &uplink_rpriv->uplink_priv;
-
-       err = mapping_find(uplink_priv->tunnel_mapping, tun_id, &key);
-       if (err) {
-               WARN_ON_ONCE(true);
-               netdev_dbg(priv->netdev,
-                          "Couldn't find tunnel for tun_id: %d, err: %d\n",
-                          tun_id, err);
-               return false;
-       }
-
-       if (enc_opts_id) {
-               err = mapping_find(uplink_priv->tunnel_enc_opts_mapping,
-                                  enc_opts_id, &enc_opts);
-               if (err) {
-                       netdev_dbg(priv->netdev,
-                                  "Couldn't find tunnel (opts) for tun_id: %d, err: %d\n",
-                                  enc_opts_id, err);
-                       return false;
-               }
-       }
-
-       tun_dst = tun_rx_dst(enc_opts.key.len);
-       if (!tun_dst) {
-               WARN_ON_ONCE(true);
-               return false;
-       }
-
-       ip_tunnel_key_init(&tun_dst->u.tun_info.key,
-                          key.enc_ipv4.src, key.enc_ipv4.dst,
-                          key.enc_ip.tos, key.enc_ip.ttl,
-                          0, /* label */
-                          key.enc_tp.src, key.enc_tp.dst,
-                          key32_to_tunnel_id(key.enc_key_id.keyid),
-                          TUNNEL_KEY);
-
-       if (enc_opts.key.len)
-               ip_tunnel_info_opts_set(&tun_dst->u.tun_info,
-                                       enc_opts.key.data,
-                                       enc_opts.key.len,
-                                       enc_opts.key.dst_opt_type);
-
-       skb_dst_set(skb, (struct dst_entry *)tun_dst);
-       dev = dev_get_by_index(&init_net, key.filter_ifindex);
-       if (!dev) {
-               netdev_dbg(priv->netdev,
-                          "Couldn't find tunnel device with ifindex: %d\n",
-                          key.filter_ifindex);
-               return false;
+static int mlx5e_setup_tc_cls_flower(struct mlx5e_priv *priv,
+                                    struct flow_cls_offload *cls_flower,
+                                    unsigned long flags)
+{
+       switch (cls_flower->command) {
+       case FLOW_CLS_REPLACE:
+               return mlx5e_configure_flower(priv->netdev, priv, cls_flower,
+                                             flags);
+       case FLOW_CLS_DESTROY:
+               return mlx5e_delete_flower(priv->netdev, priv, cls_flower,
+                                          flags);
+       case FLOW_CLS_STATS:
+               return mlx5e_stats_flower(priv->netdev, priv, cls_flower,
+                                         flags);
+       default:
+               return -EOPNOTSUPP;
        }
-
-       /* Set tun_dev so we do dev_put() after datapath */
-       tc_priv->tun_dev = dev;
-
-       skb->dev = dev;
-
-       return true;
 }
-#endif /* CONFIG_NET_TC_SKB_EXT */
 
-bool mlx5e_tc_rep_update_skb(struct mlx5_cqe64 *cqe,
-                            struct sk_buff *skb,
-                            struct mlx5e_tc_update_priv *tc_priv)
+int mlx5e_setup_tc_block_cb(enum tc_setup_type type, void *type_data,
+                           void *cb_priv)
 {
-#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
-       u32 chain = 0, reg_c0, reg_c1, tunnel_id, tuple_id;
-       struct mlx5_rep_uplink_priv *uplink_priv;
-       struct mlx5e_rep_priv *uplink_rpriv;
-       struct tc_skb_ext *tc_skb_ext;
-       struct mlx5_eswitch *esw;
-       struct mlx5e_priv *priv;
-       int tunnel_moffset;
-       int err;
-
-       reg_c0 = (be32_to_cpu(cqe->sop_drop_qpn) & MLX5E_TC_FLOW_ID_MASK);
-       if (reg_c0 == MLX5_FS_DEFAULT_FLOW_TAG)
-               reg_c0 = 0;
-       reg_c1 = be32_to_cpu(cqe->ft_metadata);
-
-       if (!reg_c0)
-               return true;
-
-       priv = netdev_priv(skb->dev);
-       esw = priv->mdev->priv.eswitch;
-
-       err = mlx5_eswitch_get_chain_for_tag(esw, reg_c0, &chain);
-       if (err) {
-               netdev_dbg(priv->netdev,
-                          "Couldn't find chain for chain tag: %d, err: %d\n",
-                          reg_c0, err);
-               return false;
-       }
-
-       if (chain) {
-               tc_skb_ext = skb_ext_add(skb, TC_SKB_EXT);
-               if (!tc_skb_ext) {
-                       WARN_ON(1);
-                       return false;
-               }
-
-               tc_skb_ext->chain = chain;
+       unsigned long flags = MLX5_TC_FLAG(INGRESS) | MLX5_TC_FLAG(NIC_OFFLOAD);
+       struct mlx5e_priv *priv = cb_priv;
 
-               tuple_id = reg_c1 & TUPLE_ID_MAX;
-
-               uplink_rpriv = mlx5_eswitch_get_uplink_priv(esw, REP_ETH);
-               uplink_priv = &uplink_rpriv->uplink_priv;
-               if (!mlx5e_tc_ct_restore_flow(uplink_priv, skb, tuple_id))
-                       return false;
+       switch (type) {
+       case TC_SETUP_CLSFLOWER:
+               return mlx5e_setup_tc_cls_flower(priv, type_data, flags);
+       default:
+               return -EOPNOTSUPP;
        }
-
-       tunnel_moffset = mlx5e_tc_attr_to_reg_mappings[TUNNEL_TO_REG].moffset;
-       tunnel_id = reg_c1 >> (8 * tunnel_moffset);
-       return mlx5e_restore_tunnel(priv, skb, tc_priv, tunnel_id);
-#endif /* CONFIG_NET_TC_SKB_EXT */
-
-       return true;
-}
-
-void mlx5_tc_rep_post_napi_receive(struct mlx5e_tc_update_priv *tc_priv)
-{
-       if (tc_priv->tun_dev)
-               dev_put(tc_priv->tun_dev);
 }
index abdcfa4c4e0e3a6684ce786fe76852af312bd5a3..037aa73bf9abe53e160255c747b71bdd8a26b4e1 100644 (file)
 #define __MLX5_EN_TC_H__
 
 #include <net/pkt_cls.h>
+#include "en.h"
 
 #define MLX5E_TC_FLOW_ID_MASK 0x0000ffff
 
 #ifdef CONFIG_MLX5_ESWITCH
 
+struct tunnel_match_key {
+       struct flow_dissector_key_control enc_control;
+       struct flow_dissector_key_keyid enc_key_id;
+       struct flow_dissector_key_ports enc_tp;
+       struct flow_dissector_key_ip enc_ip;
+       union {
+               struct flow_dissector_key_ipv4_addrs enc_ipv4;
+               struct flow_dissector_key_ipv6_addrs enc_ipv6;
+       };
+
+       int filter_ifindex;
+};
+
+struct tunnel_match_enc_opts {
+       struct flow_dissector_key_enc_opts key;
+       struct flow_dissector_key_enc_opts mask;
+};
+
+/* Tunnel_id mapping is TUNNEL_INFO_BITS + ENC_OPTS_BITS.
+ * Upper TUNNEL_INFO_BITS for general tunnel info.
+ * Lower ENC_OPTS_BITS bits for enc_opts.
+ */
+#define TUNNEL_INFO_BITS 6
+#define TUNNEL_INFO_BITS_MASK GENMASK(TUNNEL_INFO_BITS - 1, 0)
+#define ENC_OPTS_BITS 2
+#define ENC_OPTS_BITS_MASK GENMASK(ENC_OPTS_BITS - 1, 0)
+#define TUNNEL_ID_BITS (TUNNEL_INFO_BITS + ENC_OPTS_BITS)
+#define TUNNEL_ID_MASK GENMASK(TUNNEL_ID_BITS - 1, 0)
+
 enum {
        MLX5E_TC_FLAG_INGRESS_BIT,
        MLX5E_TC_FLAG_EGRESS_BIT,
@@ -50,9 +80,6 @@ enum {
 
 #define MLX5_TC_FLAG(flag) BIT(MLX5E_TC_FLAG_##flag##_BIT)
 
-int mlx5e_tc_nic_init(struct mlx5e_priv *priv);
-void mlx5e_tc_nic_cleanup(struct mlx5e_priv *priv);
-
 int mlx5e_tc_esw_init(struct rhashtable *tc_ht);
 void mlx5e_tc_esw_cleanup(struct rhashtable *tc_ht);
 
@@ -119,11 +146,6 @@ struct mlx5e_tc_update_priv {
        struct net_device *tun_dev;
 };
 
-bool mlx5e_tc_rep_update_skb(struct mlx5_cqe64 *cqe, struct sk_buff *skb,
-                            struct mlx5e_tc_update_priv *tc_priv);
-
-void mlx5_tc_rep_post_napi_receive(struct mlx5e_tc_update_priv *tc_priv);
-
 struct mlx5e_tc_mod_hdr_acts {
        int num_actions;
        int max_actions;
@@ -148,6 +170,22 @@ void dealloc_mod_hdr_actions(struct mlx5e_tc_mod_hdr_acts *mod_hdr_acts);
 struct mlx5e_tc_flow;
 u32 mlx5e_tc_get_flow_tun_id(struct mlx5e_tc_flow *flow);
 
+#if IS_ENABLED(CONFIG_MLX5_CLS_ACT)
+
+int mlx5e_tc_nic_init(struct mlx5e_priv *priv);
+void mlx5e_tc_nic_cleanup(struct mlx5e_priv *priv);
+
+int mlx5e_setup_tc_block_cb(enum tc_setup_type type, void *type_data,
+                           void *cb_priv);
+
+#else /* CONFIG_MLX5_CLS_ACT */
+static inline int  mlx5e_tc_nic_init(struct mlx5e_priv *priv) { return 0; }
+static inline void mlx5e_tc_nic_cleanup(struct mlx5e_priv *priv) {}
+static inline int
+mlx5e_setup_tc_block_cb(enum tc_setup_type type, void *type_data, void *cb_priv)
+{ return -EOPNOTSUPP; }
+#endif /* CONFIG_MLX5_CLS_ACT */
+
 #else /* CONFIG_MLX5_ESWITCH */
 static inline int  mlx5e_tc_nic_init(struct mlx5e_priv *priv) { return 0; }
 static inline void mlx5e_tc_nic_cleanup(struct mlx5e_priv *priv) {}
@@ -156,6 +194,10 @@ static inline int  mlx5e_tc_num_filters(struct mlx5e_priv *priv,
 {
        return 0;
 }
+
+static inline int
+mlx5e_setup_tc_block_cb(enum tc_setup_type type, void *type_data, void *cb_priv)
+{ return -EOPNOTSUPP; }
 #endif
 
 #endif /* __MLX5_EN_TC_H__ */
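
The TUNNEL_INFO_BITS / ENC_OPTS_BITS defines moved into this header pack two mapping-table indices into one small value carried with the packet metadata: the upper 6 bits identify the tunnel-info entry and the lower 2 bits the enc_opts entry. A worked example of packing and unpacking; the indices are made up:

#include <stdint.h>
#include <stdio.h>

#define TUNNEL_INFO_BITS 6
#define ENC_OPTS_BITS    2
#define ENC_OPTS_BITS_MASK ((1u << ENC_OPTS_BITS) - 1)

int main(void)
{
	uint32_t tun_id = 5, enc_opts_id = 2;      /* mapping-table indices */
	uint32_t tunnel_id = (tun_id << ENC_OPTS_BITS) | enc_opts_id;

	printf("packed tunnel_id = %u (max tun_id %u, max enc_opts_id %u)\n",
	       tunnel_id, (1u << TUNNEL_INFO_BITS) - 1, ENC_OPTS_BITS_MASK);

	/* unpack, as the receive-side restore path does */
	printf("tun_id = %u, enc_opts_id = %u\n",
	       tunnel_id >> ENC_OPTS_BITS, tunnel_id & ENC_OPTS_BITS_MASK);
	return 0;
}
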
index f8c4239846ead8598fac1f80e67804e093f62ee1..7679ac359e315a8b8522496a4a63da3a22ea300b 100644 (file)
@@ -6,6 +6,8 @@
 
 #include "eswitch.h"
 
+#if IS_ENABLED(CONFIG_MLX5_CLS_ACT)
+
 bool
 mlx5_esw_chains_prios_supported(struct mlx5_eswitch *esw);
 bool
@@ -46,4 +48,21 @@ void mlx5_esw_chains_destroy(struct mlx5_eswitch *esw);
 int
 mlx5_eswitch_get_chain_for_tag(struct mlx5_eswitch *esw, u32 tag, u32 *chain);
 
+#else /* CONFIG_MLX5_CLS_ACT */
+
+static inline struct mlx5_flow_table *
+mlx5_esw_chains_get_table(struct mlx5_eswitch *esw, u32 chain, u32 prio,
+                         u32 level) { return ERR_PTR(-EOPNOTSUPP); }
+static inline void
+mlx5_esw_chains_put_table(struct mlx5_eswitch *esw, u32 chain, u32 prio,
+                         u32 level) {}
+
+static inline struct mlx5_flow_table *
+mlx5_esw_chains_get_tc_end_ft(struct mlx5_eswitch *esw) { return ERR_PTR(-EOPNOTSUPP); }
+
+static inline int mlx5_esw_chains_create(struct mlx5_eswitch *esw) { return 0; }
+static inline void mlx5_esw_chains_destroy(struct mlx5_eswitch *esw) {}
+
+#endif /* CONFIG_MLX5_CLS_ACT */
+
 #endif /* __ML5_ESW_CHAINS_H__ */
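
This header adds the usual Kconfig compile-out pattern: when CONFIG_MLX5_CLS_ACT is off, callers still build because static inline stubs stand in for the real functions and return an error or do nothing. A generic, userspace-buildable sketch of the same pattern; the CONFIG_EXAMPLE_FEATURE name and the function are invented:

#include <errno.h>
#include <stdio.h>

/* #define CONFIG_EXAMPLE_FEATURE 1 */

#ifdef CONFIG_EXAMPLE_FEATURE
int example_feature_init(void);                      /* real implementation elsewhere */
#else
static inline int example_feature_init(void) { return -EOPNOTSUPP; }
#endif

int main(void)
{
	printf("init: %d\n", example_feature_init());    /* -EOPNOTSUPP when compiled out */
	return 0;
}
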
index c5eb4e7754a9ef3e6f7303a7cd0cc3c9bf03be18..ac79b7c9aeb3b16b03e8e74888c34a450e2302b9 100644 (file)
@@ -2262,6 +2262,8 @@ int mlx5_eswitch_init(struct mlx5_core_dev *dev)
        hash_init(esw->offloads.encap_tbl);
        mutex_init(&esw->offloads.mod_hdr.lock);
        hash_init(esw->offloads.mod_hdr.hlist);
+       mutex_init(&esw->offloads.decap_tbl_lock);
+       hash_init(esw->offloads.decap_tbl);
        atomic64_set(&esw->offloads.num_flows, 0);
        mutex_init(&esw->state_lock);
        mutex_init(&esw->mode_lock);
@@ -2303,6 +2305,7 @@ void mlx5_eswitch_cleanup(struct mlx5_eswitch *esw)
        mutex_destroy(&esw->state_lock);
        mutex_destroy(&esw->offloads.mod_hdr.lock);
        mutex_destroy(&esw->offloads.encap_tbl_lock);
+       mutex_destroy(&esw->offloads.decap_tbl_lock);
        kfree(esw->vports);
        kfree(esw);
 }
index 4a1c6c78bb145491e1a1f57de0041878cf283d27..ccbbea3e050538b20a3f411ec4a54f861a9344b2 100644 (file)
@@ -209,6 +209,8 @@ struct mlx5_esw_offload {
        struct mutex peer_mutex;
        struct mutex encap_tbl_lock; /* protects encap_tbl */
        DECLARE_HASHTABLE(encap_tbl, 8);
+       struct mutex decap_tbl_lock; /* protects decap_tbl */
+       DECLARE_HASHTABLE(decap_tbl, 8);
        struct mod_hdr_tbl mod_hdr;
        DECLARE_HASHTABLE(termtbl_tbl, 8);
        struct mutex termtbl_mutex; /* protects termtbl hash */
@@ -432,6 +434,7 @@ struct mlx5_esw_flow_attr {
        struct mlx5_flow_table *fdb;
        struct mlx5_flow_table *dest_ft;
        struct mlx5_ct_attr ct_attr;
+       struct mlx5_pkt_reformat *decap_pkt_reformat;
        struct mlx5e_tc_flow_parse_attr *parse_attr;
 };
 
index 57ac2ef52e808fdcce6757496fc4adf1fcdc2052..554fc64d8ef6a56abfe470ef2f6cafc40cf14f25 100644 (file)
@@ -366,6 +366,10 @@ mlx5_eswitch_add_offloaded_rule(struct mlx5_eswitch *esw,
                        }
                }
        }
+
+       if (attr->decap_pkt_reformat)
+               flow_act.pkt_reformat = attr->decap_pkt_reformat;
+
        if (flow_act.action & MLX5_FLOW_CONTEXT_ACTION_COUNT) {
                dest[i].type = MLX5_FLOW_DESTINATION_TYPE_COUNTER;
                dest[i].counter_id = mlx5_fc_id(attr->counter);
@@ -1727,7 +1731,9 @@ static int mlx5_esw_offloads_pair(struct mlx5_eswitch *esw,
 
 static void mlx5_esw_offloads_unpair(struct mlx5_eswitch *esw)
 {
+#if IS_ENABLED(CONFIG_MLX5_CLS_ACT)
        mlx5e_tc_clean_fdb_peer_flows(esw);
+#endif
        esw_del_fdb_peer_miss_rules(esw);
 }
 
index 8809a65ecefbfb871f46ee579c9a10ffbaf96b55..e042e0924079396ee0287ea50efae57ac6ebf0e5 100644 (file)
@@ -144,11 +144,11 @@ static int mlx5_set_entropy(struct mlx5_tun_entropy *tun_entropy,
 int mlx5_tun_entropy_refcount_inc(struct mlx5_tun_entropy *tun_entropy,
                                  int reformat_type)
 {
-       /* the default is error for unknown (non VXLAN/GRE tunnel types) */
        int err = -EOPNOTSUPP;
 
        mutex_lock(&tun_entropy->lock);
-       if (reformat_type == MLX5_REFORMAT_TYPE_L2_TO_VXLAN &&
+       if ((reformat_type == MLX5_REFORMAT_TYPE_L2_TO_VXLAN ||
+            reformat_type == MLX5_REFORMAT_TYPE_L2_TO_L3_TUNNEL) &&
            tun_entropy->enabled) {
                /* in case entropy calculation is enabled for all tunneling
                 * types, it is ok for VXLAN, so approve.
index b286fe15882063605d5208c5d15a5f275ce6ee38..51e1b3930c568f89c77f053af22723cae80445c8 100644 (file)
@@ -30,14 +30,14 @@ static int mlxsw_sp_flower_parse_actions(struct mlxsw_sp *mlxsw_sp,
                return -EOPNOTSUPP;
 
        act = flow_action_first_entry_get(flow_action);
-       if (act->hw_stats == FLOW_ACTION_HW_STATS_ANY ||
-           act->hw_stats == FLOW_ACTION_HW_STATS_IMMEDIATE) {
+       if (act->hw_stats & FLOW_ACTION_HW_STATS_DISABLED) {
+               /* Nothing to do */
+       } else if (act->hw_stats & FLOW_ACTION_HW_STATS_IMMEDIATE) {
                /* Count action is inserted first */
                err = mlxsw_sp_acl_rulei_act_count(mlxsw_sp, rulei, extack);
                if (err)
                        return err;
-       } else if (act->hw_stats != FLOW_ACTION_HW_STATS_DISABLED &&
-                  act->hw_stats != FLOW_ACTION_HW_STATS_DONT_CARE) {
+       } else {
                NL_SET_ERR_MSG_MOD(extack, "Unsupported action HW stats type");
                return -EOPNOTSUPP;
        }
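
The mlxsw change above replaces equality tests with bit tests because the hw_stats field is a bitmask and FLOW_ACTION_HW_STATS_DONT_CARE sets every bit, so it has to satisfy whichever branch applies. A short sketch of the flag algebra; the values below are illustrative, and the authoritative definitions live in include/net/flow_offload.h:

#include <stdio.h>

enum {
	HW_STATS_IMMEDIATE = 1 << 0,
	HW_STATS_DELAYED   = 1 << 1,
	HW_STATS_ANY       = HW_STATS_IMMEDIATE | HW_STATS_DELAYED,
	HW_STATS_DISABLED  = 1 << 2,
	HW_STATS_DONT_CARE = HW_STATS_ANY | HW_STATS_DISABLED,
};

int main(void)
{
	int hw_stats = HW_STATS_DONT_CARE;

	/* equality against ANY/IMMEDIATE rejects DONT_CARE ... */
	printf("== ANY?     %d\n", hw_stats == HW_STATS_ANY);           /* 0 */
	/* ... while bit tests accept it */
	printf("& DISABLED? %d\n", !!(hw_stats & HW_STATS_DISABLED));   /* 1 */
	printf("& IMMEDIATE? %d\n", !!(hw_stats & HW_STATS_IMMEDIATE)); /* 1 */
	return 0;
}
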
index 1e0c024b0a93cd997b9f39ab3acb06ad168d544e..8e414155242336bc6f8218b865fbd989e1aa11f1 100644 (file)
@@ -50,7 +50,6 @@ u32 netvsc_run_xdp(struct net_device *ndev, struct netvsc_channel *nvchan,
        xdp->data_end = xdp->data + len;
        xdp->rxq = &nvchan->xdp_rxq;
        xdp->frame_sz = PAGE_SIZE;
-       xdp->handle = 0;
 
        memcpy(xdp->data, data, len);
 
index acd51b29a476bb0478fd03002c452a08f3a565a4..822b3acf6be74ce7586c3e23b36daa7fdf79d214 100644 (file)
 #define ATH8031_PHY_ID 0x004dd074
 #define ATH8032_PHY_ID 0x004dd023
 #define ATH8035_PHY_ID 0x004dd072
-#define AT803X_PHY_ID_MASK                     0xffffffef
+#define AT8030_PHY_ID_MASK                     0xffffffef
 
 MODULE_DESCRIPTION("Qualcomm Atheros AR803x PHY driver");
 MODULE_AUTHOR("Matus Ujhelyi");
@@ -967,9 +967,8 @@ static int at803x_cable_test_start(struct phy_device *phydev)
 static struct phy_driver at803x_driver[] = {
 {
        /* Qualcomm Atheros AR8035 */
-       .phy_id                 = ATH8035_PHY_ID,
+       PHY_ID_MATCH_EXACT(ATH8035_PHY_ID),
        .name                   = "Qualcomm Atheros AR8035",
-       .phy_id_mask            = AT803X_PHY_ID_MASK,
        .flags                  = PHY_POLL_CABLE_TEST,
        .probe                  = at803x_probe,
        .remove                 = at803x_remove,
@@ -991,7 +990,7 @@ static struct phy_driver at803x_driver[] = {
        /* Qualcomm Atheros AR8030 */
        .phy_id                 = ATH8030_PHY_ID,
        .name                   = "Qualcomm Atheros AR8030",
-       .phy_id_mask            = AT803X_PHY_ID_MASK,
+       .phy_id_mask            = AT8030_PHY_ID_MASK,
        .probe                  = at803x_probe,
        .remove                 = at803x_remove,
        .config_init            = at803x_config_init,
@@ -1005,9 +1004,8 @@ static struct phy_driver at803x_driver[] = {
        .config_intr            = at803x_config_intr,
 }, {
        /* Qualcomm Atheros AR8031/AR8033 */
-       .phy_id                 = ATH8031_PHY_ID,
+       PHY_ID_MATCH_EXACT(ATH8031_PHY_ID),
        .name                   = "Qualcomm Atheros AR8031/AR8033",
-       .phy_id_mask            = AT803X_PHY_ID_MASK,
        .flags                  = PHY_POLL_CABLE_TEST,
        .probe                  = at803x_probe,
        .remove                 = at803x_remove,
@@ -1055,10 +1053,10 @@ static struct phy_driver at803x_driver[] = {
 module_phy_driver(at803x_driver);
 
 static struct mdio_device_id __maybe_unused atheros_tbl[] = {
-       { ATH8030_PHY_ID, AT803X_PHY_ID_MASK },
-       { ATH8031_PHY_ID, AT803X_PHY_ID_MASK },
+       { ATH8030_PHY_ID, AT8030_PHY_ID_MASK },
+       { PHY_ID_MATCH_EXACT(ATH8031_PHY_ID) },
        { PHY_ID_MATCH_EXACT(ATH8032_PHY_ID) },
-       { ATH8035_PHY_ID, AT803X_PHY_ID_MASK },
+       { PHY_ID_MATCH_EXACT(ATH8035_PHY_ID) },
        { PHY_ID_MATCH_EXACT(ATH9331_PHY_ID) },
        { }
 };
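
The at803x changes above switch most entries from the shared 0xffffffef mask to PHY_ID_MATCH_EXACT(), which amounts to an all-ones mask, and keep the loose mask only for the AR8030 (hence the rename to AT8030_PHY_ID_MASK). phylib matches a device by comparing masked IDs; a small sketch of that comparison, where the flipped-bit device ID is hypothetical:

#include <stdint.h>
#include <stdio.h>

static int phy_id_matches(uint32_t dev_id, uint32_t drv_id, uint32_t mask)
{
	return (dev_id & mask) == (drv_id & mask);
}

int main(void)
{
	uint32_t loose = 0xffffffef;        /* AT8030_PHY_ID_MASK: bit 4 is a wildcard */
	uint32_t exact = 0xffffffff;        /* what PHY_ID_MATCH_EXACT() amounts to */
	uint32_t ar8035 = 0x004dd072;       /* ATH8035_PHY_ID from the file above */
	uint32_t dev_id = ar8035 ^ 0x10;    /* hypothetical ID differing only in bit 4 */

	printf("loose match: %d\n", phy_id_matches(dev_id, ar8035, loose));  /* 1 */
	printf("exact match: %d\n", phy_id_matches(dev_id, ar8035, exact));  /* 0 */
	return 0;
}
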
index 7996a4aea8d280cb7e0b1f425ba1499a98a1574d..cfb22a21a2e6895455b1d73d12b4eaebba57698c 100644 (file)
@@ -65,7 +65,9 @@
 #define DP83869_RGMII_RX_CLK_DELAY_EN          BIT(0)
 
 /* STRAP_STS1 bits */
+#define DP83869_STRAP_OP_MODE_MASK             GENMASK(2, 0)
 #define DP83869_STRAP_STS1_RESERVED            BIT(11)
+#define DP83869_STRAP_MIRROR_ENABLED           BIT(12)
 
 /* PHYCTRL bits */
 #define DP83869_RX_FIFO_SHIFT  12
@@ -160,6 +162,20 @@ static int dp83869_config_port_mirroring(struct phy_device *phydev)
                                          DP83869_CFG3_PORT_MIRROR_EN);
 }
 
+static int dp83869_set_strapped_mode(struct phy_device *phydev)
+{
+       struct dp83869_private *dp83869 = phydev->priv;
+       int val;
+
+       val = phy_read_mmd(phydev, DP83869_DEVADDR, DP83869_STRAP_STS1);
+       if (val < 0)
+               return val;
+
+       dp83869->mode = val & DP83869_STRAP_OP_MODE_MASK;
+
+       return 0;
+}
+
 #ifdef CONFIG_OF_MDIO
 static int dp83869_of_init(struct phy_device *phydev)
 {
@@ -184,6 +200,10 @@ static int dp83869_of_init(struct phy_device *phydev)
                if (dp83869->mode < DP83869_RGMII_COPPER_ETHERNET ||
                    dp83869->mode > DP83869_SGMII_COPPER_ETHERNET)
                        return -EINVAL;
+       } else {
+               ret = dp83869_set_strapped_mode(phydev);
+               if (ret)
+                       return ret;
        }
 
        if (of_property_read_bool(of_node, "ti,max-output-impedance"))
@@ -191,10 +211,18 @@ static int dp83869_of_init(struct phy_device *phydev)
        else if (of_property_read_bool(of_node, "ti,min-output-impedance"))
                dp83869->io_impedance = DP83869_IO_MUX_CFG_IO_IMPEDANCE_MIN;
 
-       if (of_property_read_bool(of_node, "enet-phy-lane-swap"))
+       if (of_property_read_bool(of_node, "enet-phy-lane-swap")) {
                dp83869->port_mirroring = DP83869_PORT_MIRRORING_EN;
-       else
-               dp83869->port_mirroring = DP83869_PORT_MIRRORING_DIS;
+       } else {
+               /* If the lane swap is not in the DT then check the straps */
+               ret = phy_read_mmd(phydev, DP83869_DEVADDR, DP83869_STRAP_STS1);
+               if (ret < 0)
+                       return ret;
+               if (ret & DP83869_STRAP_MIRROR_ENABLED)
+                       dp83869->port_mirroring = DP83869_PORT_MIRRORING_EN;
+               else
+                       dp83869->port_mirroring = DP83869_PORT_MIRRORING_DIS;
+       }
 
        if (of_property_read_u32(of_node, "rx-fifo-depth",
                                 &dp83869->rx_fifo_depth))
@@ -209,7 +237,7 @@ static int dp83869_of_init(struct phy_device *phydev)
 #else
 static int dp83869_of_init(struct phy_device *phydev)
 {
-       return 0;
+       return dp83869_set_strapped_mode(phydev);
 }
 #endif /* CONFIG_OF_MDIO */
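
dp83869_set_strapped_mode() and the lane-swap fallback above both decode the STRAP_STS1 register: the low three bits select the operating mode and bit 12 reports strapped port mirroring. A tiny sketch of that decode; the register value is made up:

#include <stdio.h>

#define STRAP_OP_MODE_MASK   0x0007   /* GENMASK(2, 0) */
#define STRAP_MIRROR_ENABLED 0x1000   /* BIT(12) */

int main(void)
{
	int strap_sts1 = 0x1003;                                       /* hypothetical read */

	printf("op mode: %d\n", strap_sts1 & STRAP_OP_MODE_MASK);      /* 3 */
	printf("mirroring: %s\n",
	       (strap_sts1 & STRAP_MIRROR_ENABLED) ? "on" : "off");    /* on */
	return 0;
}
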
 
index a5b415fed11efb8c6e2dcd30554ea6c10835d0ab..3e88fbef2d4ac3a3a6d49244452321f9307767c0 100644 (file)
@@ -26,6 +26,7 @@
 #include <net/netns/generic.h>
 #include <net/tun_proto.h>
 #include <net/vxlan.h>
+#include <net/nexthop.h>
 
 #if IS_ENABLED(CONFIG_IPV6)
 #include <net/ip6_tunnel.h>
@@ -78,6 +79,9 @@ struct vxlan_fdb {
        u16               state;        /* see ndm_state */
        __be32            vni;
        u16               flags;        /* see ndm_flags and below */
+       struct list_head  nh_list;
+       struct nexthop __rcu *nh;
+       struct vxlan_dev  *vdev;
 };
 
 #define NTF_VXLAN_ADDED_BY_USER 0x100
@@ -174,11 +178,15 @@ static inline struct hlist_head *vs_head(struct net *net, __be16 port)
  */
 static inline struct vxlan_rdst *first_remote_rcu(struct vxlan_fdb *fdb)
 {
+       if (rcu_access_pointer(fdb->nh))
+               return NULL;
        return list_entry_rcu(fdb->remotes.next, struct vxlan_rdst, list);
 }
 
 static inline struct vxlan_rdst *first_remote_rtnl(struct vxlan_fdb *fdb)
 {
+       if (rcu_access_pointer(fdb->nh))
+               return NULL;
        return list_first_entry(&fdb->remotes, struct vxlan_rdst, list);
 }
 
@@ -251,9 +259,10 @@ static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan,
 {
        unsigned long now = jiffies;
        struct nda_cacheinfo ci;
+       bool send_ip, send_eth;
        struct nlmsghdr *nlh;
+       struct nexthop *nh;
        struct ndmsg *ndm;
-       bool send_ip, send_eth;
 
        nlh = nlmsg_put(skb, portid, seq, type, sizeof(*ndm), flags);
        if (nlh == NULL)
@@ -264,16 +273,21 @@ static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan,
 
        send_eth = send_ip = true;
 
+       nh = rcu_dereference_rtnl(fdb->nh);
        if (type == RTM_GETNEIGH) {
-               send_ip = !vxlan_addr_any(&rdst->remote_ip);
+               if (rdst) {
+                       send_ip = !vxlan_addr_any(&rdst->remote_ip);
+                       ndm->ndm_family = send_ip ? rdst->remote_ip.sa.sa_family : AF_INET;
+               } else if (nh) {
+                       ndm->ndm_family = nexthop_get_family(nh);
+               }
                send_eth = !is_zero_ether_addr(fdb->eth_addr);
-               ndm->ndm_family = send_ip ? rdst->remote_ip.sa.sa_family : AF_INET;
        } else
                ndm->ndm_family = AF_BRIDGE;
        ndm->ndm_state = fdb->state;
        ndm->ndm_ifindex = vxlan->dev->ifindex;
        ndm->ndm_flags = fdb->flags;
-       if (rdst->offloaded)
+       if (rdst && rdst->offloaded)
                ndm->ndm_flags |= NTF_OFFLOADED;
        ndm->ndm_type = RTN_UNICAST;
 
@@ -284,23 +298,30 @@ static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan,
 
        if (send_eth && nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->eth_addr))
                goto nla_put_failure;
+       if (nh) {
+               if (nla_put_u32(skb, NDA_NH_ID, nh->id))
+                       goto nla_put_failure;
+       } else if (rdst) {
+               if (send_ip && vxlan_nla_put_addr(skb, NDA_DST,
+                                                 &rdst->remote_ip))
+                       goto nla_put_failure;
+
+               if (rdst->remote_port &&
+                   rdst->remote_port != vxlan->cfg.dst_port &&
+                   nla_put_be16(skb, NDA_PORT, rdst->remote_port))
+                       goto nla_put_failure;
+               if (rdst->remote_vni != vxlan->default_dst.remote_vni &&
+                   nla_put_u32(skb, NDA_VNI, be32_to_cpu(rdst->remote_vni)))
+                       goto nla_put_failure;
+               if (rdst->remote_ifindex &&
+                   nla_put_u32(skb, NDA_IFINDEX, rdst->remote_ifindex))
+                       goto nla_put_failure;
+       }
 
-       if (send_ip && vxlan_nla_put_addr(skb, NDA_DST, &rdst->remote_ip))
-               goto nla_put_failure;
-
-       if (rdst->remote_port && rdst->remote_port != vxlan->cfg.dst_port &&
-           nla_put_be16(skb, NDA_PORT, rdst->remote_port))
-               goto nla_put_failure;
-       if (rdst->remote_vni != vxlan->default_dst.remote_vni &&
-           nla_put_u32(skb, NDA_VNI, be32_to_cpu(rdst->remote_vni)))
-               goto nla_put_failure;
        if ((vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA) && fdb->vni &&
            nla_put_u32(skb, NDA_SRC_VNI,
                        be32_to_cpu(fdb->vni)))
                goto nla_put_failure;
-       if (rdst->remote_ifindex &&
-           nla_put_u32(skb, NDA_IFINDEX, rdst->remote_ifindex))
-               goto nla_put_failure;
 
        ci.ndm_used      = jiffies_to_clock_t(now - fdb->used);
        ci.ndm_confirmed = 0;
@@ -401,7 +422,7 @@ static int vxlan_fdb_notify(struct vxlan_dev *vxlan, struct vxlan_fdb *fdb,
 {
        int err;
 
-       if (swdev_notify) {
+       if (swdev_notify && rd) {
                switch (type) {
                case RTM_NEWNEIGH:
                        err = vxlan_fdb_switchdev_call_notifiers(vxlan, fdb, rd,
@@ -793,8 +814,9 @@ static int vxlan_gro_complete(struct sock *sk, struct sk_buff *skb, int nhoff)
        return eth_gro_complete(skb, nhoff + sizeof(struct vxlanhdr));
 }
 
-static struct vxlan_fdb *vxlan_fdb_alloc(const u8 *mac, __u16 state,
-                                        __be32 src_vni, __u16 ndm_flags)
+static struct vxlan_fdb *vxlan_fdb_alloc(struct vxlan_dev *vxlan, const u8 *mac,
+                                        __u16 state, __be32 src_vni,
+                                        __u16 ndm_flags)
 {
        struct vxlan_fdb *f;
 
@@ -805,6 +827,9 @@ static struct vxlan_fdb *vxlan_fdb_alloc(const u8 *mac, __u16 state,
        f->flags = ndm_flags;
        f->updated = f->used = jiffies;
        f->vni = src_vni;
+       f->nh = NULL;
+       f->vdev = vxlan;
+       INIT_LIST_HEAD(&f->nh_list);
        INIT_LIST_HEAD(&f->remotes);
        memcpy(f->eth_addr, mac, ETH_ALEN);
 
@@ -819,11 +844,78 @@ static void vxlan_fdb_insert(struct vxlan_dev *vxlan, const u8 *mac,
                           vxlan_fdb_head(vxlan, mac, src_vni));
 }
 
+static int vxlan_fdb_nh_update(struct vxlan_dev *vxlan, struct vxlan_fdb *fdb,
+                              u32 nhid, struct netlink_ext_ack *extack)
+{
+       struct nexthop *old_nh = rtnl_dereference(fdb->nh);
+       struct nh_group *nhg;
+       struct nexthop *nh;
+       int err = -EINVAL;
+
+       if (old_nh && old_nh->id == nhid)
+               return 0;
+
+       nh = nexthop_find_by_id(vxlan->net, nhid);
+       if (!nh) {
+               NL_SET_ERR_MSG(extack, "Nexthop id does not exist");
+               goto err_inval;
+       }
+
+       if (nh) {
+               if (!nexthop_get(nh)) {
+                       NL_SET_ERR_MSG(extack, "Nexthop has been deleted");
+                       nh = NULL;
+                       goto err_inval;
+               }
+               if (!nh->is_fdb_nh) {
+                       NL_SET_ERR_MSG(extack, "Nexthop is not a fdb nexthop");
+                       goto err_inval;
+               }
+
+               if (!nh->is_group || !nh->nh_grp->mpath) {
+                       NL_SET_ERR_MSG(extack, "Nexthop is not a multipath group");
+                       goto err_inval;
+               }
+
+               /* check nexthop group family */
+               nhg = rtnl_dereference(nh->nh_grp);
+               switch (vxlan->default_dst.remote_ip.sa.sa_family) {
+               case AF_INET:
+                       if (!nhg->has_v4) {
+                               err = -EAFNOSUPPORT;
+                               NL_SET_ERR_MSG(extack, "Nexthop group family not supported");
+                               goto err_inval;
+                       }
+                       break;
+               case AF_INET6:
+                       if (nhg->has_v4) {
+                               err = -EAFNOSUPPORT;
+                               NL_SET_ERR_MSG(extack, "Nexthop group family not supported");
+                               goto err_inval;
+                       }
+               }
+       }
+
+       if (old_nh) {
+               list_del_rcu(&fdb->nh_list);
+               nexthop_put(old_nh);
+       }
+       rcu_assign_pointer(fdb->nh, nh);
+       list_add_tail_rcu(&fdb->nh_list, &nh->fdb_list);
+       return 1;
+
+err_inval:
+       if (nh)
+               nexthop_put(nh);
+       return err;
+}
+
 static int vxlan_fdb_create(struct vxlan_dev *vxlan,
                            const u8 *mac, union vxlan_addr *ip,
                            __u16 state, __be16 port, __be32 src_vni,
                            __be32 vni, __u32 ifindex, __u16 ndm_flags,
-                           struct vxlan_fdb **fdb)
+                           u32 nhid, struct vxlan_fdb **fdb,
+                           struct netlink_ext_ack *extack)
 {
        struct vxlan_rdst *rd = NULL;
        struct vxlan_fdb *f;
@@ -834,24 +926,37 @@ static int vxlan_fdb_create(struct vxlan_dev *vxlan,
                return -ENOSPC;
 
        netdev_dbg(vxlan->dev, "add %pM -> %pIS\n", mac, ip);
-       f = vxlan_fdb_alloc(mac, state, src_vni, ndm_flags);
+       f = vxlan_fdb_alloc(vxlan, mac, state, src_vni, ndm_flags);
        if (!f)
                return -ENOMEM;
 
-       rc = vxlan_fdb_append(f, ip, port, vni, ifindex, &rd);
-       if (rc < 0) {
-               kfree(f);
-               return rc;
-       }
+       if (nhid)
+               rc = vxlan_fdb_nh_update(vxlan, f, nhid, extack);
+       else
+               rc = vxlan_fdb_append(f, ip, port, vni, ifindex, &rd);
+       if (rc < 0)
+               goto errout;
 
        *fdb = f;
 
        return 0;
+
+errout:
+       kfree(f);
+       return rc;
 }
 
 static void __vxlan_fdb_free(struct vxlan_fdb *f)
 {
        struct vxlan_rdst *rd, *nd;
+       struct nexthop *nh;
+
+       nh = rcu_dereference_raw(f->nh);
+       if (nh) {
+               rcu_assign_pointer(f->nh, NULL);
+               list_del_rcu(&f->nh_list);
+               nexthop_put(nh);
+       }
 
        list_for_each_entry_safe(rd, nd, &f->remotes, list) {
                dst_cache_destroy(&rd->dst_cache);
@@ -875,12 +980,18 @@ static void vxlan_fdb_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f,
        netdev_dbg(vxlan->dev, "delete %pM\n", f->eth_addr);
 
        --vxlan->addrcnt;
-       if (do_notify)
-               list_for_each_entry(rd, &f->remotes, list)
-                       vxlan_fdb_notify(vxlan, f, rd, RTM_DELNEIGH,
+       if (do_notify) {
+               if (rcu_access_pointer(f->nh))
+                       vxlan_fdb_notify(vxlan, f, NULL, RTM_DELNEIGH,
                                         swdev_notify, NULL);
+               else
+                       list_for_each_entry(rd, &f->remotes, list)
+                               vxlan_fdb_notify(vxlan, f, rd, RTM_DELNEIGH,
+                                                swdev_notify, NULL);
+       }
 
        hlist_del_rcu(&f->hlist);
+       f->vdev = NULL;
        call_rcu(&f->rcu, vxlan_fdb_free);
 }
 
@@ -897,7 +1008,7 @@ static int vxlan_fdb_update_existing(struct vxlan_dev *vxlan,
                                     __u16 state, __u16 flags,
                                     __be16 port, __be32 vni,
                                     __u32 ifindex, __u16 ndm_flags,
-                                    struct vxlan_fdb *f,
+                                    struct vxlan_fdb *f, u32 nhid,
                                     bool swdev_notify,
                                     struct netlink_ext_ack *extack)
 {
@@ -908,6 +1019,18 @@ static int vxlan_fdb_update_existing(struct vxlan_dev *vxlan,
        int rc = 0;
        int err;
 
+       if (nhid && !rcu_access_pointer(f->nh)) {
+               NL_SET_ERR_MSG(extack,
+                              "Cannot replace an existing non nexthop fdb with a nexthop");
+               return -EOPNOTSUPP;
+       }
+
+       if (nhid && (flags & NLM_F_APPEND)) {
+               NL_SET_ERR_MSG(extack,
+                              "Cannot append to a nexthop fdb");
+               return -EOPNOTSUPP;
+       }
+
        /* Do not allow an externally learned entry to take over an entry added
         * by the user.
         */
@@ -929,10 +1052,17 @@ static int vxlan_fdb_update_existing(struct vxlan_dev *vxlan,
                /* Only change unicasts */
                if (!(is_multicast_ether_addr(f->eth_addr) ||
                      is_zero_ether_addr(f->eth_addr))) {
-                       rc = vxlan_fdb_replace(f, ip, port, vni,
-                                              ifindex, &oldrd);
+                       if (nhid) {
+                               rc = vxlan_fdb_nh_update(vxlan, f, nhid, extack);
+                               if (rc < 0)
+                                       return rc;
+                       } else {
+                               rc = vxlan_fdb_replace(f, ip, port, vni,
+                                                      ifindex, &oldrd);
+                       }
                        notify |= rc;
                } else {
+                       NL_SET_ERR_MSG(extack, "Cannot replace non-unicast fdb entries");
                        return -EOPNOTSUPP;
                }
        }
@@ -962,6 +1092,8 @@ static int vxlan_fdb_update_existing(struct vxlan_dev *vxlan,
        return 0;
 
 err_notify:
+       if (nhid)
+               return err;
        if ((flags & NLM_F_REPLACE) && rc)
                *rd = oldrd;
        else if ((flags & NLM_F_APPEND) && rc) {
@@ -975,7 +1107,7 @@ static int vxlan_fdb_update_create(struct vxlan_dev *vxlan,
                                   const u8 *mac, union vxlan_addr *ip,
                                   __u16 state, __u16 flags,
                                   __be16 port, __be32 src_vni, __be32 vni,
-                                  __u32 ifindex, __u16 ndm_flags,
+                                  __u32 ifindex, __u16 ndm_flags, u32 nhid,
                                   bool swdev_notify,
                                   struct netlink_ext_ack *extack)
 {
@@ -990,7 +1122,7 @@ static int vxlan_fdb_update_create(struct vxlan_dev *vxlan,
 
        netdev_dbg(vxlan->dev, "add %pM -> %pIS\n", mac, ip);
        rc = vxlan_fdb_create(vxlan, mac, ip, state, port, src_vni,
-                             vni, ifindex, fdb_flags, &f);
+                             vni, ifindex, fdb_flags, nhid, &f, extack);
        if (rc < 0)
                return rc;
 
@@ -1012,7 +1144,7 @@ static int vxlan_fdb_update(struct vxlan_dev *vxlan,
                            const u8 *mac, union vxlan_addr *ip,
                            __u16 state, __u16 flags,
                            __be16 port, __be32 src_vni, __be32 vni,
-                           __u32 ifindex, __u16 ndm_flags,
+                           __u32 ifindex, __u16 ndm_flags, u32 nhid,
                            bool swdev_notify,
                            struct netlink_ext_ack *extack)
 {
@@ -1028,14 +1160,15 @@ static int vxlan_fdb_update(struct vxlan_dev *vxlan,
 
                return vxlan_fdb_update_existing(vxlan, ip, state, flags, port,
                                                 vni, ifindex, ndm_flags, f,
-                                                swdev_notify, extack);
+                                                nhid, swdev_notify, extack);
        } else {
                if (!(flags & NLM_F_CREATE))
                        return -ENOENT;
 
                return vxlan_fdb_update_create(vxlan, mac, ip, state, flags,
                                               port, src_vni, vni, ifindex,
-                                              ndm_flags, swdev_notify, extack);
+                                              ndm_flags, nhid, swdev_notify,
+                                              extack);
        }
 }
 
@@ -1049,7 +1182,7 @@ static void vxlan_fdb_dst_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f,
 
 static int vxlan_fdb_parse(struct nlattr *tb[], struct vxlan_dev *vxlan,
                           union vxlan_addr *ip, __be16 *port, __be32 *src_vni,
-                          __be32 *vni, u32 *ifindex)
+                          __be32 *vni, u32 *ifindex, u32 *nhid)
 {
        struct net *net = dev_net(vxlan->dev);
        int err;
@@ -1109,6 +1242,11 @@ static int vxlan_fdb_parse(struct nlattr *tb[], struct vxlan_dev *vxlan,
                *ifindex = 0;
        }
 
+       if (tb[NDA_NH_ID])
+               *nhid = nla_get_u32(tb[NDA_NH_ID]);
+       else
+               *nhid = 0;
+
        return 0;
 }
 
@@ -1123,7 +1261,7 @@ static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
        union vxlan_addr ip;
        __be16 port;
        __be32 src_vni, vni;
-       u32 ifindex;
+       u32 ifindex, nhid;
        u32 hash_index;
        int err;
 
@@ -1133,10 +1271,11 @@ static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
                return -EINVAL;
        }
 
-       if (tb[NDA_DST] == NULL)
+       if (!tb || (!tb[NDA_DST] && !tb[NDA_NH_ID]))
                return -EINVAL;
 
-       err = vxlan_fdb_parse(tb, vxlan, &ip, &port, &src_vni, &vni, &ifindex);
+       err = vxlan_fdb_parse(tb, vxlan, &ip, &port, &src_vni, &vni, &ifindex,
+                             &nhid);
        if (err)
                return err;
 
@@ -1148,7 +1287,7 @@ static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
        err = vxlan_fdb_update(vxlan, addr, &ip, ndm->ndm_state, flags,
                               port, src_vni, vni, ifindex,
                               ndm->ndm_flags | NTF_VXLAN_ADDED_BY_USER,
-                              true, extack);
+                              nhid, true, extack);
        spin_unlock_bh(&vxlan->hash_lock[hash_index]);
 
        return err;
@@ -1159,8 +1298,8 @@ static int __vxlan_fdb_delete(struct vxlan_dev *vxlan,
                              __be16 port, __be32 src_vni, __be32 vni,
                              u32 ifindex, bool swdev_notify)
 {
-       struct vxlan_fdb *f;
        struct vxlan_rdst *rd = NULL;
+       struct vxlan_fdb *f;
        int err = -ENOENT;
 
        f = vxlan_find_mac(vxlan, addr, src_vni);
@@ -1195,12 +1334,13 @@ static int vxlan_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[],
        struct vxlan_dev *vxlan = netdev_priv(dev);
        union vxlan_addr ip;
        __be32 src_vni, vni;
-       __be16 port;
-       u32 ifindex;
+       u32 ifindex, nhid;
        u32 hash_index;
+       __be16 port;
        int err;
 
-       err = vxlan_fdb_parse(tb, vxlan, &ip, &port, &src_vni, &vni, &ifindex);
+       err = vxlan_fdb_parse(tb, vxlan, &ip, &port, &src_vni, &vni, &ifindex,
+                             &nhid);
        if (err)
                return err;
 
@@ -1228,6 +1368,17 @@ static int vxlan_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb,
                hlist_for_each_entry_rcu(f, &vxlan->fdb_head[h], hlist) {
                        struct vxlan_rdst *rd;
 
+                       if (rcu_access_pointer(f->nh)) {
+                               err = vxlan_fdb_info(skb, vxlan, f,
+                                                    NETLINK_CB(cb->skb).portid,
+                                                    cb->nlh->nlmsg_seq,
+                                                    RTM_NEWNEIGH,
+                                                    NLM_F_MULTI, NULL);
+                               if (err < 0)
+                                       goto out;
+                               continue;
+                       }
+
                        list_for_each_entry_rcu(rd, &f->remotes, list) {
                                if (*idx < cb->args[2])
                                        goto skip;
@@ -1311,6 +1462,10 @@ static bool vxlan_snoop(struct net_device *dev,
                if (f->state & (NUD_PERMANENT | NUD_NOARP))
                        return true;
 
+               /* Don't override an fdb entry that points to a nexthop with a learnt entry */
+               if (rcu_access_pointer(f->nh))
+                       return true;
+
                if (net_ratelimit())
                        netdev_info(dev,
                                    "%pM migrated from %pIS to %pIS\n",
@@ -1333,7 +1488,7 @@ static bool vxlan_snoop(struct net_device *dev,
                                         vxlan->cfg.dst_port,
                                         vni,
                                         vxlan->default_dst.remote_vni,
-                                        ifindex, NTF_SELF, true, NULL);
+                                        ifindex, NTF_SELF, 0, true, NULL);
                spin_unlock(&vxlan->hash_lock[hash_index]);
        }
 
@@ -2616,6 +2771,38 @@ tx_error:
        kfree_skb(skb);
 }
 
+static void vxlan_xmit_nh(struct sk_buff *skb, struct net_device *dev,
+                         struct vxlan_fdb *f, __be32 vni, bool did_rsc)
+{
+       struct vxlan_rdst nh_rdst;
+       struct nexthop *nh;
+       bool do_xmit;
+       u32 hash;
+
+       memset(&nh_rdst, 0, sizeof(struct vxlan_rdst));
+       hash = skb_get_hash(skb);
+
+       rcu_read_lock();
+       nh = rcu_dereference(f->nh);
+       if (!nh) {
+               rcu_read_unlock();
+               goto drop;
+       }
+       do_xmit = vxlan_fdb_nh_path_select(nh, hash, &nh_rdst);
+       rcu_read_unlock();
+
+       if (likely(do_xmit))
+               vxlan_xmit_one(skb, dev, vni, &nh_rdst, did_rsc);
+       else
+               goto drop;
+
+       return;
+
+drop:
+       dev->stats.tx_dropped++;
+       dev_kfree_skb(skb);
+}
+
 /* Transmit local packets over Vxlan
  *
  * Outer IP header inherits ECN and DF from inner header.
@@ -2692,22 +2879,27 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev)
                }
        }
 
-       list_for_each_entry_rcu(rdst, &f->remotes, list) {
-               struct sk_buff *skb1;
+       if (rcu_access_pointer(f->nh)) {
+               vxlan_xmit_nh(skb, dev, f,
+                             (vni ? : vxlan->default_dst.remote_vni), did_rsc);
+       } else {
+               list_for_each_entry_rcu(rdst, &f->remotes, list) {
+                       struct sk_buff *skb1;
 
-               if (!fdst) {
-                       fdst = rdst;
-                       continue;
+                       if (!fdst) {
+                               fdst = rdst;
+                               continue;
+                       }
+                       skb1 = skb_clone(skb, GFP_ATOMIC);
+                       if (skb1)
+                               vxlan_xmit_one(skb1, dev, vni, rdst, did_rsc);
                }
-               skb1 = skb_clone(skb, GFP_ATOMIC);
-               if (skb1)
-                       vxlan_xmit_one(skb1, dev, vni, rdst, did_rsc);
+               if (fdst)
+                       vxlan_xmit_one(skb, dev, vni, fdst, did_rsc);
+               else
+                       kfree_skb(skb);
        }
 
-       if (fdst)
-               vxlan_xmit_one(skb, dev, vni, fdst, did_rsc);
-       else
-               kfree_skb(skb);
        return NETDEV_TX_OK;
 }
 
@@ -3615,7 +3807,7 @@ static int __vxlan_dev_create(struct net *net, struct net_device *dev,
                                       dst->remote_vni,
                                       dst->remote_vni,
                                       dst->remote_ifindex,
-                                      NTF_SELF, &f);
+                                      NTF_SELF, 0, &f, extack);
                if (err)
                        return err;
        }
@@ -4013,7 +4205,7 @@ static int vxlan_changelink(struct net_device *dev, struct nlattr *tb[],
                                               vxlan->cfg.dst_port,
                                               conf.vni, conf.vni,
                                               conf.remote_ifindex,
-                                              NTF_SELF, true, extack);
+                                              NTF_SELF, 0, true, extack);
                        if (err) {
                                spin_unlock_bh(&vxlan->hash_lock[hash_index]);
                                netdev_adjacent_change_abort(dst->remote_dev,
@@ -4335,7 +4527,7 @@ vxlan_fdb_external_learn_add(struct net_device *dev,
                               fdb_info->remote_vni,
                               fdb_info->remote_ifindex,
                               NTF_USE | NTF_SELF | NTF_EXT_LEARNED,
-                              false, extack);
+                              0, false, extack);
        spin_unlock_bh(&vxlan->hash_lock[hash_index]);
 
        return err;
@@ -4410,6 +4602,25 @@ static struct notifier_block vxlan_switchdev_notifier_block __read_mostly = {
        .notifier_call = vxlan_switchdev_event,
 };
 
+static int vxlan_nexthop_event(struct notifier_block *nb,
+                              unsigned long event, void *ptr)
+{
+       struct nexthop *nh = ptr;
+       struct vxlan_fdb *fdb, *tmp;
+
+       if (!nh || event != NEXTHOP_EVENT_DEL)
+               return NOTIFY_DONE;
+
+       list_for_each_entry_safe(fdb, tmp, &nh->fdb_list, nh_list)
+               vxlan_fdb_destroy(fdb->vdev, fdb, false, false);
+
+       return NOTIFY_DONE;
+}
+
+static struct notifier_block vxlan_nexthop_notifier_block __read_mostly = {
+       .notifier_call = vxlan_nexthop_event,
+};
+
 static __net_init int vxlan_init_net(struct net *net)
 {
        struct vxlan_net *vn = net_generic(net, vxlan_net_id);
@@ -4421,7 +4632,7 @@ static __net_init int vxlan_init_net(struct net *net)
        for (h = 0; h < PORT_HASH_SIZE; ++h)
                INIT_HLIST_HEAD(&vn->sock_list[h]);
 
-       return 0;
+       return register_nexthop_notifier(net, &vxlan_nexthop_notifier_block);
 }
 
 static void vxlan_destroy_tunnels(struct net *net, struct list_head *head)
@@ -4453,6 +4664,8 @@ static void __net_exit vxlan_exit_batch_net(struct list_head *net_list)
        LIST_HEAD(list);
 
        rtnl_lock();
+       list_for_each_entry(net, net_list, exit_list)
+               unregister_nexthop_notifier(net, &vxlan_nexthop_notifier_block);
        list_for_each_entry(net, net_list, exit_list)
                vxlan_destroy_tunnels(net, &list);
 
index 272626cc3fc9dbd40aa828eeb236a150a2b9774c..c66c545e161a60a42fe72a1530cfb031fb357e5d 100644 (file)
@@ -396,6 +396,7 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map,
 }
 
 #define cgroup_bpf_enabled (0)
+#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, type, t_ctx) ({ 0; })
 #define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (0)
 #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; })
index cb03f6f15956e63443eefebefeb00a96b3f5c759..dc65a0d71d9b27f9f8920026319b247ec56697ec 100644 (file)
@@ -5,6 +5,7 @@
 
 #include <linux/types.h>
 #include <linux/skbuff.h>
+#include <net/rtnetlink.h>
 
 struct bareudp_conf {
        __be16 ethertype;
@@ -17,4 +18,10 @@ struct net_device *bareudp_dev_create(struct net *net, const char *name,
                                      u8 name_assign_type,
                                      struct bareudp_conf *info);
 
+static inline bool netif_is_bareudp(const struct net_device *dev)
+{
+       return dev->rtnl_link_ops &&
+              !strcmp(dev->rtnl_link_ops->kind, "bareudp");
+}
+
 #endif
index 4001ffb04f0dd05570cd4e2517267ed41271761f..95d633785ef96852a0de755d1a1405b924713e3a 100644 (file)
@@ -168,10 +168,11 @@ enum flow_action_hw_stats_bit {
        FLOW_ACTION_HW_STATS_IMMEDIATE_BIT,
        FLOW_ACTION_HW_STATS_DELAYED_BIT,
        FLOW_ACTION_HW_STATS_DISABLED_BIT,
+
+       FLOW_ACTION_HW_STATS_NUM_BITS
 };
 
 enum flow_action_hw_stats {
-       FLOW_ACTION_HW_STATS_DONT_CARE = 0,
        FLOW_ACTION_HW_STATS_IMMEDIATE =
                BIT(FLOW_ACTION_HW_STATS_IMMEDIATE_BIT),
        FLOW_ACTION_HW_STATS_DELAYED = BIT(FLOW_ACTION_HW_STATS_DELAYED_BIT),
@@ -179,6 +180,7 @@ enum flow_action_hw_stats {
                                   FLOW_ACTION_HW_STATS_DELAYED,
        FLOW_ACTION_HW_STATS_DISABLED =
                BIT(FLOW_ACTION_HW_STATS_DISABLED_BIT),
+       FLOW_ACTION_HW_STATS_DONT_CARE = BIT(FLOW_ACTION_HW_STATS_NUM_BITS) - 1,
 };
 
 typedef void (*action_destr)(void *priv);
@@ -340,11 +342,12 @@ __flow_action_hw_stats_check(const struct flow_action *action,
                return false;
 
        action_entry = flow_action_first_entry_get(action);
-       if (action_entry->hw_stats == FLOW_ACTION_HW_STATS_DONT_CARE)
-               return true;
+
+       /* Zero is not a legal value for hw_stats, catch anyone passing it */
+       WARN_ON_ONCE(!action_entry->hw_stats);
 
        if (!check_allow_bit &&
-           action_entry->hw_stats != FLOW_ACTION_HW_STATS_ANY) {
+           ~action_entry->hw_stats & FLOW_ACTION_HW_STATS_ANY) {
                NL_SET_ERR_MSG_MOD(extack, "Driver supports only default HW stats type \"any\"");
                return false;
        } else if (check_allow_bit &&
index fdaf975e3331a0b2183cf534f294ae07cc79b736..3f615a29766e0cb1a0d9849a54058e49f2e30723 100644 (file)
@@ -65,6 +65,7 @@ struct fib6_config {
        struct nl_info  fc_nlinfo;
        struct nlattr   *fc_encap;
        u16             fc_encap_type;
+       bool            fc_is_fdb;
 };
 
 struct fib6_node {
index c712ee5eebd95daac3f34dd8e3d62e4cd9c250d1..1937476c94a0e1544a992f3c7ad5bbdf37879384 100644 (file)
@@ -14,5 +14,6 @@ struct netns_nexthop {
 
        unsigned int            seq;            /* protected by rtnl_mutex */
        u32                     last_id_allocated;
+       struct atomic_notifier_head notifier_chain;
 };
 #endif
index c440ccc861fc70b13f565dbcb138768a562a3e53..4c951680f6f982db1fe43ede8945c1946ba2acfc 100644 (file)
@@ -10,6 +10,7 @@
 #define __LINUX_NEXTHOP_H
 
 #include <linux/netdevice.h>
+#include <linux/notifier.h>
 #include <linux/route.h>
 #include <linux/types.h>
 #include <net/ip_fib.h>
@@ -26,6 +27,7 @@ struct nh_config {
        u8              nh_family;
        u8              nh_protocol;
        u8              nh_blackhole;
+       u8              nh_fdb;
        u32             nh_flags;
 
        int             nh_ifindex;
@@ -52,6 +54,7 @@ struct nh_info {
 
        u8                      family;
        bool                    reject_nh;
+       bool                    fdb_nh;
 
        union {
                struct fib_nh_common    fib_nhc;
@@ -80,6 +83,7 @@ struct nexthop {
        struct rb_node          rb_node;    /* entry on netns rbtree */
        struct list_head        fi_list;    /* v4 entries using nh */
        struct list_head        f6i_list;   /* v6 entries using nh */
+       struct list_head        fdb_list;   /* fdb entries using this nh */
        struct list_head        grp_list;   /* nh group entries using this nh */
        struct net              *net;
 
@@ -88,6 +92,7 @@ struct nexthop {
        u8                      protocol;   /* app managing this nh */
        u8                      nh_flags;
        bool                    is_group;
+       bool                    is_fdb_nh;
 
        refcount_t              refcnt;
        struct rcu_head         rcu;
@@ -98,6 +103,17 @@ struct nexthop {
        };
 };
 
+enum nexthop_event_type {
+       NEXTHOP_EVENT_ADD,
+       NEXTHOP_EVENT_DEL
+};
+
+int call_nexthop_notifier(struct notifier_block *nb, struct net *net,
+                         enum nexthop_event_type event_type,
+                         struct nexthop *nh);
+int register_nexthop_notifier(struct net *net, struct notifier_block *nb);
+int unregister_nexthop_notifier(struct net *net, struct notifier_block *nb);
+
 /* caller is holding rcu or rtnl; no reference taken to nexthop */
 struct nexthop *nexthop_find_by_id(struct net *net, u32 id);
 void nexthop_free_rcu(struct rcu_head *head);
@@ -304,4 +320,32 @@ static inline void nexthop_path_fib6_result(struct fib6_result *res, int hash)
 int nexthop_for_each_fib6_nh(struct nexthop *nh,
                             int (*cb)(struct fib6_nh *nh, void *arg),
                             void *arg);
+
+static inline int nexthop_get_family(struct nexthop *nh)
+{
+       struct nh_info *nhi = rcu_dereference_rtnl(nh->nh_info);
+
+       return nhi->family;
+}
+
+static inline
+struct fib_nh_common *nexthop_fdb_nhc(struct nexthop *nh)
+{
+       struct nh_info *nhi = rcu_dereference_rtnl(nh->nh_info);
+
+       return &nhi->fib_nhc;
+}
+
+static inline struct fib_nh_common *nexthop_path_fdb_result(struct nexthop *nh,
+                                                           int hash)
+{
+       struct nh_info *nhi;
+       struct nexthop *nhp;
+
+       nhp = nexthop_select_path(nh, hash);
+       if (unlikely(!nhp))
+               return NULL;
+       nhi = rcu_dereference(nhp->nh_info);
+       return &nhi->fib_nhc;
+}
 #endif
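
A minimal sketch of a consumer of the per-netns nexthop notifier chain declared above, following the same pattern the vxlan driver adopts later in this series. Only register_nexthop_notifier(), unregister_nexthop_notifier() and NEXTHOP_EVENT_DEL come from this patch; the example_* names are hypothetical and the pernet hooks would still need to be wired up via register_pernet_subsys().

	#include <linux/notifier.h>
	#include <net/net_namespace.h>
	#include <net/nexthop.h>

	static int example_nexthop_event(struct notifier_block *nb,
					 unsigned long event, void *ptr)
	{
		struct nexthop *nh = ptr;

		if (event != NEXTHOP_EVENT_DEL)
			return NOTIFY_DONE;

		/* Walk and drop any private entries still pointing at 'nh'
		 * (driver-specific; vxlan flushes its matching fdb entries).
		 */
		return NOTIFY_DONE;
	}

	static struct notifier_block example_nexthop_nb = {
		.notifier_call = example_nexthop_event,
	};

	static int __net_init example_init_net(struct net *net)
	{
		return register_nexthop_notifier(net, &example_nexthop_nb);
	}

	static void __net_exit example_exit_net(struct net *net)
	{
		unregister_nexthop_notifier(net, &example_nexthop_nb);
	}
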
index ae7aeb0d1f9ca3b5c966b03a1359ab938ee966bd..db519957e134b6825a6d2939c967106ee8874b45 100644 (file)
@@ -62,7 +62,6 @@ struct switchdev_attr {
 #if IS_ENABLED(CONFIG_BRIDGE_MRP)
                u8 mrp_port_state;                      /* MRP_PORT_STATE */
                u8 mrp_port_role;                       /* MRP_PORT_ROLE */
-               u8 mrp_ring_state;                      /* MRP_RING_STATE */
 #endif
        } u;
 };
index 373aadcfea21d302b878392bcf851ea93a6c6702..3a41627cbdfe58e6a8bc134c5625278b59eb417c 100644 (file)
@@ -7,6 +7,7 @@
 #include <net/dst_metadata.h>
 #include <net/rtnetlink.h>
 #include <net/switchdev.h>
+#include <net/nexthop.h>
 
 #define IANA_VXLAN_UDP_PORT     4789
 
@@ -487,4 +488,28 @@ static inline void vxlan_flag_attr_error(int attrtype,
 #undef VXLAN_FLAG
 }
 
+static inline bool vxlan_fdb_nh_path_select(struct nexthop *nh,
+                                           int hash,
+                                           struct vxlan_rdst *rdst)
+{
+       struct fib_nh_common *nhc;
+
+       nhc = nexthop_path_fdb_result(nh, hash);
+       if (unlikely(!nhc))
+               return false;
+
+       switch (nhc->nhc_gw_family) {
+       case AF_INET:
+               rdst->remote_ip.sin.sin_addr.s_addr = nhc->nhc_gw.ipv4;
+               rdst->remote_ip.sa.sa_family = AF_INET;
+               break;
+       case AF_INET6:
+               rdst->remote_ip.sin6.sin6_addr = nhc->nhc_gw.ipv6;
+               rdst->remote_ip.sa.sa_family = AF_INET6;
+               break;
+       }
+
+       return true;
+}
+
 #endif
index 3094fccf5a886497a1fa4d8a3b36ff297307acc9..90f11760bd122c5b93f94d917f2d12a937a34c9c 100644 (file)
@@ -39,7 +39,7 @@ enum xdp_mem_type {
        MEM_TYPE_PAGE_SHARED = 0, /* Split-page refcnt based model */
        MEM_TYPE_PAGE_ORDER0,     /* Orig XDP full page model */
        MEM_TYPE_PAGE_POOL,
-       MEM_TYPE_ZERO_COPY,
+       MEM_TYPE_XSK_BUFF_POOL,
        MEM_TYPE_MAX,
 };
 
@@ -54,10 +54,6 @@ struct xdp_mem_info {
 
 struct page_pool;
 
-struct zero_copy_allocator {
-       void (*free)(struct zero_copy_allocator *zca, unsigned long handle);
-};
-
 struct xdp_rxq_info {
        struct net_device *dev;
        u32 queue_index;
@@ -70,7 +66,6 @@ struct xdp_buff {
        void *data_end;
        void *data_meta;
        void *data_hard_start;
-       unsigned long handle;
        struct xdp_rxq_info *rxq;
        u32 frame_sz; /* frame size to deduce data_hard_end/reserved tailroom*/
 };
@@ -119,7 +114,7 @@ struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp)
        int metasize;
        int headroom;
 
-       if (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY)
+       if (xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL)
                return xdp_convert_zc_to_xdp_frame(xdp);
 
        /* Assure headroom is available for storing info */
index abd72de25fa4ef0d90725e0e3cf7098018ce0ae8..96bfc5f5f24e34197e96debe7640ca914cbe39aa 100644 (file)
 
 struct net_device;
 struct xsk_queue;
-
-/* Masks for xdp_umem_page flags.
- * The low 12-bits of the addr will be 0 since this is the page address, so we
- * can use them for flags.
- */
-#define XSK_NEXT_PG_CONTIG_SHIFT 0
-#define XSK_NEXT_PG_CONTIG_MASK (1ULL << XSK_NEXT_PG_CONTIG_SHIFT)
-
-struct xdp_umem_page {
-       void *addr;
-       dma_addr_t dma;
-};
-
-struct xdp_umem_fq_reuse {
-       u32 nentries;
-       u32 length;
-       u64 handles[];
-};
-
-/* Flags for the umem flags field.
- *
- * The NEED_WAKEUP flag is 1 due to the reuse of the flags field for public
- * flags. See inlude/uapi/include/linux/if_xdp.h.
- */
-#define XDP_UMEM_USES_NEED_WAKEUP (1 << 1)
+struct xdp_buff;
 
 struct xdp_umem {
        struct xsk_queue *fq;
        struct xsk_queue *cq;
-       struct xdp_umem_page *pages;
-       u64 chunk_mask;
+       struct xsk_buff_pool *pool;
        u64 size;
        u32 headroom;
-       u32 chunk_size_nohr;
+       u32 chunk_size;
        struct user_struct *user;
        refcount_t users;
        struct work_struct work;
@@ -59,28 +34,17 @@ struct xdp_umem {
        u8 flags;
        int id;
        struct net_device *dev;
-       struct xdp_umem_fq_reuse *fq_reuse;
        bool zc;
        spinlock_t xsk_tx_list_lock;
        struct list_head xsk_tx_list;
 };
 
-/* Nodes are linked in the struct xdp_sock map_list field, and used to
- * track which maps a certain socket reside in.
- */
-
 struct xsk_map {
        struct bpf_map map;
        spinlock_t lock; /* Synchronize map updates */
        struct xdp_sock *xsk_map[];
 };
 
-struct xsk_map_node {
-       struct list_head node;
-       struct xsk_map *map;
-       struct xdp_sock **map_entry;
-};
-
 struct xdp_sock {
        /* struct sock must be the first member of struct xdp_sock */
        struct sock sk;
@@ -111,32 +75,9 @@ struct xdp_sock {
        spinlock_t map_list_lock;
 };
 
-struct xdp_buff;
 #ifdef CONFIG_XDP_SOCKETS
-int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp);
-bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs);
-/* Used from netdev driver */
-bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt);
-bool xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr);
-void xsk_umem_release_addr(struct xdp_umem *umem);
-void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries);
-bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc);
-void xsk_umem_consume_tx_done(struct xdp_umem *umem);
-struct xdp_umem_fq_reuse *xsk_reuseq_prepare(u32 nentries);
-struct xdp_umem_fq_reuse *xsk_reuseq_swap(struct xdp_umem *umem,
-                                         struct xdp_umem_fq_reuse *newq);
-void xsk_reuseq_free(struct xdp_umem_fq_reuse *rq);
-struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev, u16 queue_id);
-void xsk_set_rx_need_wakeup(struct xdp_umem *umem);
-void xsk_set_tx_need_wakeup(struct xdp_umem *umem);
-void xsk_clear_rx_need_wakeup(struct xdp_umem *umem);
-void xsk_clear_tx_need_wakeup(struct xdp_umem *umem);
-bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem);
 
-void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs,
-                            struct xdp_sock **map_entry);
-int xsk_map_inc(struct xsk_map *map);
-void xsk_map_put(struct xsk_map *map);
+int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp);
 int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp);
 void __xsk_map_flush(void);
 
@@ -153,230 +94,13 @@ static inline struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map,
        return xs;
 }
 
-static inline u64 xsk_umem_extract_addr(u64 addr)
-{
-       return addr & XSK_UNALIGNED_BUF_ADDR_MASK;
-}
-
-static inline u64 xsk_umem_extract_offset(u64 addr)
-{
-       return addr >> XSK_UNALIGNED_BUF_OFFSET_SHIFT;
-}
-
-static inline u64 xsk_umem_add_offset_to_addr(u64 addr)
-{
-       return xsk_umem_extract_addr(addr) + xsk_umem_extract_offset(addr);
-}
-
-static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr)
-{
-       unsigned long page_addr;
-
-       addr = xsk_umem_add_offset_to_addr(addr);
-       page_addr = (unsigned long)umem->pages[addr >> PAGE_SHIFT].addr;
-
-       return (char *)(page_addr & PAGE_MASK) + (addr & ~PAGE_MASK);
-}
-
-static inline dma_addr_t xdp_umem_get_dma(struct xdp_umem *umem, u64 addr)
-{
-       addr = xsk_umem_add_offset_to_addr(addr);
-
-       return umem->pages[addr >> PAGE_SHIFT].dma + (addr & ~PAGE_MASK);
-}
-
-/* Reuse-queue aware version of FILL queue helpers */
-static inline bool xsk_umem_has_addrs_rq(struct xdp_umem *umem, u32 cnt)
-{
-       struct xdp_umem_fq_reuse *rq = umem->fq_reuse;
-
-       if (rq->length >= cnt)
-               return true;
-
-       return xsk_umem_has_addrs(umem, cnt - rq->length);
-}
-
-static inline bool xsk_umem_peek_addr_rq(struct xdp_umem *umem, u64 *addr)
-{
-       struct xdp_umem_fq_reuse *rq = umem->fq_reuse;
-
-       if (!rq->length)
-               return xsk_umem_peek_addr(umem, addr);
-
-       *addr = rq->handles[rq->length - 1];
-       return addr;
-}
-
-static inline void xsk_umem_release_addr_rq(struct xdp_umem *umem)
-{
-       struct xdp_umem_fq_reuse *rq = umem->fq_reuse;
-
-       if (!rq->length)
-               xsk_umem_release_addr(umem);
-       else
-               rq->length--;
-}
-
-static inline void xsk_umem_fq_reuse(struct xdp_umem *umem, u64 addr)
-{
-       struct xdp_umem_fq_reuse *rq = umem->fq_reuse;
-
-       rq->handles[rq->length++] = addr;
-}
-
-/* Handle the offset appropriately depending on aligned or unaligned mode.
- * For unaligned mode, we store the offset in the upper 16-bits of the address.
- * For aligned mode, we simply add the offset to the address.
- */
-static inline u64 xsk_umem_adjust_offset(struct xdp_umem *umem, u64 address,
-                                        u64 offset)
-{
-       if (umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG)
-               return address + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT);
-       else
-               return address + offset;
-}
-
-static inline u32 xsk_umem_xdp_frame_sz(struct xdp_umem *umem)
-{
-       return umem->chunk_size_nohr + umem->headroom;
-}
-
 #else
+
 static inline int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
 {
        return -ENOTSUPP;
 }
 
-static inline bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs)
-{
-       return false;
-}
-
-static inline bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt)
-{
-       return false;
-}
-
-static inline u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr)
-{
-       return NULL;
-}
-
-static inline void xsk_umem_release_addr(struct xdp_umem *umem)
-{
-}
-
-static inline void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries)
-{
-}
-
-static inline bool xsk_umem_consume_tx(struct xdp_umem *umem,
-                                      struct xdp_desc *desc)
-{
-       return false;
-}
-
-static inline void xsk_umem_consume_tx_done(struct xdp_umem *umem)
-{
-}
-
-static inline struct xdp_umem_fq_reuse *xsk_reuseq_prepare(u32 nentries)
-{
-       return NULL;
-}
-
-static inline struct xdp_umem_fq_reuse *xsk_reuseq_swap(
-       struct xdp_umem *umem,
-       struct xdp_umem_fq_reuse *newq)
-{
-       return NULL;
-}
-static inline void xsk_reuseq_free(struct xdp_umem_fq_reuse *rq)
-{
-}
-
-static inline struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev,
-                                                    u16 queue_id)
-{
-       return NULL;
-}
-
-static inline u64 xsk_umem_extract_addr(u64 addr)
-{
-       return 0;
-}
-
-static inline u64 xsk_umem_extract_offset(u64 addr)
-{
-       return 0;
-}
-
-static inline u64 xsk_umem_add_offset_to_addr(u64 addr)
-{
-       return 0;
-}
-
-static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr)
-{
-       return NULL;
-}
-
-static inline dma_addr_t xdp_umem_get_dma(struct xdp_umem *umem, u64 addr)
-{
-       return 0;
-}
-
-static inline bool xsk_umem_has_addrs_rq(struct xdp_umem *umem, u32 cnt)
-{
-       return false;
-}
-
-static inline u64 *xsk_umem_peek_addr_rq(struct xdp_umem *umem, u64 *addr)
-{
-       return NULL;
-}
-
-static inline void xsk_umem_release_addr_rq(struct xdp_umem *umem)
-{
-}
-
-static inline void xsk_umem_fq_reuse(struct xdp_umem *umem, u64 addr)
-{
-}
-
-static inline void xsk_set_rx_need_wakeup(struct xdp_umem *umem)
-{
-}
-
-static inline void xsk_set_tx_need_wakeup(struct xdp_umem *umem)
-{
-}
-
-static inline void xsk_clear_rx_need_wakeup(struct xdp_umem *umem)
-{
-}
-
-static inline void xsk_clear_tx_need_wakeup(struct xdp_umem *umem)
-{
-}
-
-static inline bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem)
-{
-       return false;
-}
-
-static inline u64 xsk_umem_adjust_offset(struct xdp_umem *umem, u64 handle,
-                                        u64 offset)
-{
-       return 0;
-}
-
-static inline u32 xsk_umem_xdp_frame_sz(struct xdp_umem *umem)
-{
-       return 0;
-}
-
 static inline int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp)
 {
        return -EOPNOTSUPP;
@@ -391,6 +115,7 @@ static inline struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map,
 {
        return NULL;
 }
+
 #endif /* CONFIG_XDP_SOCKETS */
 
 #endif /* _LINUX_XDP_SOCK_H */
diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h
new file mode 100644 (file)
index 0000000..ccf848f
--- /dev/null
@@ -0,0 +1,232 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Interface for implementing AF_XDP zero-copy support in drivers.
+ * Copyright(c) 2020 Intel Corporation.
+ */
+
+#ifndef _LINUX_XDP_SOCK_DRV_H
+#define _LINUX_XDP_SOCK_DRV_H
+
+#include <net/xdp_sock.h>
+#include <net/xsk_buff_pool.h>
+
+#ifdef CONFIG_XDP_SOCKETS
+
+void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries);
+bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc);
+void xsk_umem_consume_tx_done(struct xdp_umem *umem);
+struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev, u16 queue_id);
+void xsk_set_rx_need_wakeup(struct xdp_umem *umem);
+void xsk_set_tx_need_wakeup(struct xdp_umem *umem);
+void xsk_clear_rx_need_wakeup(struct xdp_umem *umem);
+void xsk_clear_tx_need_wakeup(struct xdp_umem *umem);
+bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem);
+
+static inline u32 xsk_umem_get_headroom(struct xdp_umem *umem)
+{
+       return XDP_PACKET_HEADROOM + umem->headroom;
+}
+
+static inline u32 xsk_umem_get_chunk_size(struct xdp_umem *umem)
+{
+       return umem->chunk_size;
+}
+
+static inline u32 xsk_umem_get_rx_frame_size(struct xdp_umem *umem)
+{
+       return xsk_umem_get_chunk_size(umem) - xsk_umem_get_headroom(umem);
+}
+
+static inline void xsk_buff_set_rxq_info(struct xdp_umem *umem,
+                                        struct xdp_rxq_info *rxq)
+{
+       xp_set_rxq_info(umem->pool, rxq);
+}
+
+static inline void xsk_buff_dma_unmap(struct xdp_umem *umem,
+                                     unsigned long attrs)
+{
+       xp_dma_unmap(umem->pool, attrs);
+}
+
+static inline int xsk_buff_dma_map(struct xdp_umem *umem, struct device *dev,
+                                  unsigned long attrs)
+{
+       return xp_dma_map(umem->pool, dev, attrs, umem->pgs, umem->npgs);
+}
+
+static inline dma_addr_t xsk_buff_xdp_get_dma(struct xdp_buff *xdp)
+{
+       struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp);
+
+       return xp_get_dma(xskb);
+}
+
+static inline dma_addr_t xsk_buff_xdp_get_frame_dma(struct xdp_buff *xdp)
+{
+       struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp);
+
+       return xp_get_frame_dma(xskb);
+}
+
+static inline struct xdp_buff *xsk_buff_alloc(struct xdp_umem *umem)
+{
+       return xp_alloc(umem->pool);
+}
+
+static inline bool xsk_buff_can_alloc(struct xdp_umem *umem, u32 count)
+{
+       return xp_can_alloc(umem->pool, count);
+}
+
+static inline void xsk_buff_free(struct xdp_buff *xdp)
+{
+       struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp);
+
+       xp_free(xskb);
+}
+
+static inline dma_addr_t xsk_buff_raw_get_dma(struct xdp_umem *umem, u64 addr)
+{
+       return xp_raw_get_dma(umem->pool, addr);
+}
+
+static inline void *xsk_buff_raw_get_data(struct xdp_umem *umem, u64 addr)
+{
+       return xp_raw_get_data(umem->pool, addr);
+}
+
+static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp)
+{
+       struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp);
+
+       xp_dma_sync_for_cpu(xskb);
+}
+
+static inline void xsk_buff_raw_dma_sync_for_device(struct xdp_umem *umem,
+                                                   dma_addr_t dma,
+                                                   size_t size)
+{
+       xp_dma_sync_for_device(umem->pool, dma, size);
+}
+
+#else
+
+static inline void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries)
+{
+}
+
+static inline bool xsk_umem_consume_tx(struct xdp_umem *umem,
+                                      struct xdp_desc *desc)
+{
+       return false;
+}
+
+static inline void xsk_umem_consume_tx_done(struct xdp_umem *umem)
+{
+}
+
+static inline struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev,
+                                                    u16 queue_id)
+{
+       return NULL;
+}
+
+static inline void xsk_set_rx_need_wakeup(struct xdp_umem *umem)
+{
+}
+
+static inline void xsk_set_tx_need_wakeup(struct xdp_umem *umem)
+{
+}
+
+static inline void xsk_clear_rx_need_wakeup(struct xdp_umem *umem)
+{
+}
+
+static inline void xsk_clear_tx_need_wakeup(struct xdp_umem *umem)
+{
+}
+
+static inline bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem)
+{
+       return false;
+}
+
+static inline u32 xsk_umem_get_headroom(struct xdp_umem *umem)
+{
+       return 0;
+}
+
+static inline u32 xsk_umem_get_chunk_size(struct xdp_umem *umem)
+{
+       return 0;
+}
+
+static inline u32 xsk_umem_get_rx_frame_size(struct xdp_umem *umem)
+{
+       return 0;
+}
+
+static inline void xsk_buff_set_rxq_info(struct xdp_umem *umem,
+                                        struct xdp_rxq_info *rxq)
+{
+}
+
+static inline void xsk_buff_dma_unmap(struct xdp_umem *umem,
+                                     unsigned long attrs)
+{
+}
+
+static inline int xsk_buff_dma_map(struct xdp_umem *umem, struct device *dev,
+                                  unsigned long attrs)
+{
+       return 0;
+}
+
+static inline dma_addr_t xsk_buff_xdp_get_dma(struct xdp_buff *xdp)
+{
+       return 0;
+}
+
+static inline dma_addr_t xsk_buff_xdp_get_frame_dma(struct xdp_buff *xdp)
+{
+       return 0;
+}
+
+static inline struct xdp_buff *xsk_buff_alloc(struct xdp_umem *umem)
+{
+       return NULL;
+}
+
+static inline bool xsk_buff_can_alloc(struct xdp_umem *umem, u32 count)
+{
+       return false;
+}
+
+static inline void xsk_buff_free(struct xdp_buff *xdp)
+{
+}
+
+static inline dma_addr_t xsk_buff_raw_get_dma(struct xdp_umem *umem, u64 addr)
+{
+       return 0;
+}
+
+static inline void *xsk_buff_raw_get_data(struct xdp_umem *umem, u64 addr)
+{
+       return NULL;
+}
+
+static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp)
+{
+}
+
+static inline void xsk_buff_raw_dma_sync_for_device(struct xdp_umem *umem,
+                                                   dma_addr_t dma,
+                                                   size_t size)
+{
+}
+
+#endif /* CONFIG_XDP_SOCKETS */
+
+#endif /* _LINUX_XDP_SOCK_DRV_H */
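
The wrappers above are what a zero-copy driver is expected to call instead of touching the umem internals directly. A rough RX sketch using only the helpers declared in this header; struct example_rx_ring stands in for the driver's real descriptor ring and is purely illustrative.

	#include <net/xdp_sock_drv.h>

	struct example_rx_ring {
		struct xdp_umem *umem;
		struct xdp_buff **bufs;
		u32 count;
	};

	static u32 example_rx_refill(struct example_rx_ring *ring, u32 budget)
	{
		u32 i;

		if (!xsk_buff_can_alloc(ring->umem, budget))
			return 0;

		for (i = 0; i < budget; i++) {
			struct xdp_buff *xdp = xsk_buff_alloc(ring->umem);

			if (!xdp)
				break;
			/* Hand xsk_buff_xdp_get_dma(xdp) and
			 * xsk_umem_get_rx_frame_size(ring->umem) to the HW
			 * descriptor here.
			 */
			ring->bufs[i] = xdp;
		}
		return i;
	}

	static void example_rx_complete(struct example_rx_ring *ring, u32 idx, u32 len)
	{
		struct xdp_buff *xdp = ring->bufs[idx];

		xdp->data_end = xdp->data + len;
		xsk_buff_dma_sync_for_cpu(xdp);
		/* Run the XDP program; on XDP_DROP the buffer simply goes
		 * back to the pool.
		 */
		xsk_buff_free(xdp);
	}
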
diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h
new file mode 100644 (file)
index 0000000..a4ff226
--- /dev/null
@@ -0,0 +1,140 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright(c) 2020 Intel Corporation. */
+
+#ifndef XSK_BUFF_POOL_H_
+#define XSK_BUFF_POOL_H_
+
+#include <linux/if_xdp.h>
+#include <linux/types.h>
+#include <linux/dma-mapping.h>
+#include <net/xdp.h>
+
+struct xsk_buff_pool;
+struct xdp_rxq_info;
+struct xsk_queue;
+struct xdp_desc;
+struct device;
+struct page;
+
+struct xdp_buff_xsk {
+       struct xdp_buff xdp;
+       dma_addr_t dma;
+       dma_addr_t frame_dma;
+       struct xsk_buff_pool *pool;
+       bool unaligned;
+       u64 orig_addr;
+       struct list_head free_list_node;
+};
+
+struct xsk_buff_pool {
+       struct xsk_queue *fq;
+       struct list_head free_list;
+       dma_addr_t *dma_pages;
+       struct xdp_buff_xsk *heads;
+       u64 chunk_mask;
+       u64 addrs_cnt;
+       u32 free_list_cnt;
+       u32 dma_pages_cnt;
+       u32 heads_cnt;
+       u32 free_heads_cnt;
+       u32 headroom;
+       u32 chunk_size;
+       u32 frame_len;
+       bool cheap_dma;
+       bool unaligned;
+       void *addrs;
+       struct device *dev;
+       struct xdp_buff_xsk *free_heads[];
+};
+
+/* AF_XDP core. */
+struct xsk_buff_pool *xp_create(struct page **pages, u32 nr_pages, u32 chunks,
+                               u32 chunk_size, u32 headroom, u64 size,
+                               bool unaligned);
+void xp_set_fq(struct xsk_buff_pool *pool, struct xsk_queue *fq);
+void xp_destroy(struct xsk_buff_pool *pool);
+void xp_release(struct xdp_buff_xsk *xskb);
+
+/* AF_XDP, and XDP core. */
+void xp_free(struct xdp_buff_xsk *xskb);
+
+/* AF_XDP ZC drivers, via xdp_sock_buff.h */
+void xp_set_rxq_info(struct xsk_buff_pool *pool, struct xdp_rxq_info *rxq);
+int xp_dma_map(struct xsk_buff_pool *pool, struct device *dev,
+              unsigned long attrs, struct page **pages, u32 nr_pages);
+void xp_dma_unmap(struct xsk_buff_pool *pool, unsigned long attrs);
+struct xdp_buff *xp_alloc(struct xsk_buff_pool *pool);
+bool xp_can_alloc(struct xsk_buff_pool *pool, u32 count);
+void *xp_raw_get_data(struct xsk_buff_pool *pool, u64 addr);
+dma_addr_t xp_raw_get_dma(struct xsk_buff_pool *pool, u64 addr);
+static inline dma_addr_t xp_get_dma(struct xdp_buff_xsk *xskb)
+{
+       return xskb->dma;
+}
+
+static inline dma_addr_t xp_get_frame_dma(struct xdp_buff_xsk *xskb)
+{
+       return xskb->frame_dma;
+}
+
+void xp_dma_sync_for_cpu_slow(struct xdp_buff_xsk *xskb);
+static inline void xp_dma_sync_for_cpu(struct xdp_buff_xsk *xskb)
+{
+       if (xskb->pool->cheap_dma)
+               return;
+
+       xp_dma_sync_for_cpu_slow(xskb);
+}
+
+void xp_dma_sync_for_device_slow(struct xsk_buff_pool *pool, dma_addr_t dma,
+                                size_t size);
+static inline void xp_dma_sync_for_device(struct xsk_buff_pool *pool,
+                                         dma_addr_t dma, size_t size)
+{
+       if (pool->cheap_dma)
+               return;
+
+       xp_dma_sync_for_device_slow(pool, dma, size);
+}
+
+/* Masks for xdp_umem_page flags.
+ * The low 12-bits of the addr will be 0 since this is the page address, so we
+ * can use them for flags.
+ */
+#define XSK_NEXT_PG_CONTIG_SHIFT 0
+#define XSK_NEXT_PG_CONTIG_MASK BIT_ULL(XSK_NEXT_PG_CONTIG_SHIFT)
+
+static inline bool xp_desc_crosses_non_contig_pg(struct xsk_buff_pool *pool,
+                                                u64 addr, u32 len)
+{
+       bool cross_pg = (addr & (PAGE_SIZE - 1)) + len > PAGE_SIZE;
+
+       if (pool->dma_pages_cnt && cross_pg) {
+               return !(pool->dma_pages[addr >> PAGE_SHIFT] &
+                        XSK_NEXT_PG_CONTIG_MASK);
+       }
+       return false;
+}
+
+static inline u64 xp_aligned_extract_addr(struct xsk_buff_pool *pool, u64 addr)
+{
+       return addr & pool->chunk_mask;
+}
+
+static inline u64 xp_unaligned_extract_addr(u64 addr)
+{
+       return addr & XSK_UNALIGNED_BUF_ADDR_MASK;
+}
+
+static inline u64 xp_unaligned_extract_offset(u64 addr)
+{
+       return addr >> XSK_UNALIGNED_BUF_OFFSET_SHIFT;
+}
+
+static inline u64 xp_unaligned_add_offset_to_addr(u64 addr)
+{
+       return xp_unaligned_extract_addr(addr) +
+               xp_unaligned_extract_offset(addr);
+}
+
+#endif /* XSK_BUFF_POOL_H_ */
index b95d65e8c62847050900b7280c16b33b445c865f..b73d3e141323f5742e7653eeef72aa391c2eb508 100644 (file)
@@ -287,7 +287,7 @@ TRACE_EVENT(xdp_devmap_xmit,
        FN(PAGE_SHARED)         \
        FN(PAGE_ORDER0)         \
        FN(PAGE_POOL)           \
-       FN(ZERO_COPY)
+       FN(XSK_BUFF_POOL)
 
 #define __MEM_TYPE_TP_FN(x)    \
        TRACE_DEFINE_ENUM(MEM_TYPE_##x);
index b9b8a0f63b9134e07fc74c483143c4b203b35d21..97e1fd19ff58ae0b1a69ea293958ba6353931ba4 100644 (file)
@@ -220,6 +220,10 @@ enum bpf_attach_type {
        BPF_MODIFY_RETURN,
        BPF_LSM_MAC,
        BPF_TRACE_ITER,
+       BPF_CGROUP_INET4_GETPEERNAME,
+       BPF_CGROUP_INET6_GETPEERNAME,
+       BPF_CGROUP_INET4_GETSOCKNAME,
+       BPF_CGROUP_INET6_GETSOCKNAME,
        __MAX_BPF_ATTACH_TYPE
 };
 
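The four new GETPEERNAME/GETSOCKNAME attach types hook getpeername(2) and getsockname(2), so a sock_addr program can rewrite the address reported back to userspace, for example to hide an address that was translated at connect time. A rough restricted-C sketch, assuming the matching libbpf section name exists and using placeholder address values:

	#include <linux/bpf.h>
	#include <bpf/bpf_helpers.h>
	#include <bpf/bpf_endian.h>

	SEC("cgroup/getpeername4")
	int rewrite_peer4(struct bpf_sock_addr *ctx)
	{
		/* Placeholder values: report 127.0.0.1:4040 as the peer
		 * instead of the address the socket is really connected to.
		 */
		ctx->user_ip4  = bpf_htonl(0x7f000001);
		ctx->user_port = bpf_htons(4040);
		return 1;	/* sock_addr getname hooks must return 1 */
	}

	char _license[] SEC("license") = "GPL";
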
index cd144e3099a3c78768ac61abb2a7aa850a891b02..eefcda8ca44e9c780017d45b6bc7e81af3029453 100644 (file)
@@ -29,6 +29,7 @@ enum {
        NDA_LINK_NETNSID,
        NDA_SRC_VNI,
        NDA_PROTOCOL,  /* Originator of entry */
+       NDA_NH_ID,
        __NDA_MAX
 };
 
index 7b61867e98487e4d4240e59b8e9304af7d64d9b1..2d4a1e784cf04a30fed6d4d28628212c80b1dde9 100644 (file)
@@ -49,6 +49,9 @@ enum {
        NHA_GROUPS,     /* flag; only return nexthop groups in dump */
        NHA_MASTER,     /* u32;  only return nexthops with given master dev */
 
+       NHA_FDB,        /* flag; nexthop belongs to a bridge fdb */
+       /* if NHA_FDB is added, OIF, BLACKHOLE, ENCAP cannot be set */
+
        __NHA_MAX,
 };
 
index 37b2d86201533168578ae300d346a61b0bc67c20..375b933010dd48054839f59d12c0b38a57c5f5f2 100644 (file)
@@ -12,9 +12,6 @@ obj-$(CONFIG_BPF_JIT) += dispatcher.o
 ifeq ($(CONFIG_NET),y)
 obj-$(CONFIG_BPF_SYSCALL) += devmap.o
 obj-$(CONFIG_BPF_SYSCALL) += cpumap.o
-ifeq ($(CONFIG_XDP_SOCKETS),y)
-obj-$(CONFIG_BPF_SYSCALL) += xskmap.o
-endif
 obj-$(CONFIG_BPF_SYSCALL) += offload.o
 endif
 ifeq ($(CONFIG_PERF_EVENTS),y)
index 57dfc98289d5319c05330d61998ec0cc2f79c525..431241c74614339dd0ca739c33da83bbda444bbc 100644 (file)
@@ -1978,6 +1978,10 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type,
                case BPF_CGROUP_INET6_BIND:
                case BPF_CGROUP_INET4_CONNECT:
                case BPF_CGROUP_INET6_CONNECT:
+               case BPF_CGROUP_INET4_GETPEERNAME:
+               case BPF_CGROUP_INET6_GETPEERNAME:
+               case BPF_CGROUP_INET4_GETSOCKNAME:
+               case BPF_CGROUP_INET6_GETSOCKNAME:
                case BPF_CGROUP_UDP4_SENDMSG:
                case BPF_CGROUP_UDP6_SENDMSG:
                case BPF_CGROUP_UDP4_RECVMSG:
@@ -2767,6 +2771,10 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type)
        case BPF_CGROUP_INET6_BIND:
        case BPF_CGROUP_INET4_CONNECT:
        case BPF_CGROUP_INET6_CONNECT:
+       case BPF_CGROUP_INET4_GETPEERNAME:
+       case BPF_CGROUP_INET6_GETPEERNAME:
+       case BPF_CGROUP_INET4_GETSOCKNAME:
+       case BPF_CGROUP_INET6_GETSOCKNAME:
        case BPF_CGROUP_UDP4_SENDMSG:
        case BPF_CGROUP_UDP6_SENDMSG:
        case BPF_CGROUP_UDP4_RECVMSG:
@@ -2912,6 +2920,10 @@ static int bpf_prog_query(const union bpf_attr *attr,
        case BPF_CGROUP_INET6_POST_BIND:
        case BPF_CGROUP_INET4_CONNECT:
        case BPF_CGROUP_INET6_CONNECT:
+       case BPF_CGROUP_INET4_GETPEERNAME:
+       case BPF_CGROUP_INET6_GETPEERNAME:
+       case BPF_CGROUP_INET4_GETSOCKNAME:
+       case BPF_CGROUP_INET6_GETSOCKNAME:
        case BPF_CGROUP_UDP4_SENDMSG:
        case BPF_CGROUP_UDP6_SENDMSG:
        case BPF_CGROUP_UDP4_RECVMSG:
index 25b14ee0e26d314daacb11db589035205b99e98e..d2e27dba4ac6f032c8b08594c17e04b52caaaf6f 100644 (file)
@@ -393,6 +393,15 @@ static bool type_is_sk_pointer(enum bpf_reg_type type)
                type == PTR_TO_XDP_SOCK;
 }
 
+static bool reg_type_not_null(enum bpf_reg_type type)
+{
+       return type == PTR_TO_SOCKET ||
+               type == PTR_TO_TCP_SOCK ||
+               type == PTR_TO_MAP_VALUE ||
+               type == PTR_TO_SOCK_COMMON ||
+               type == PTR_TO_BTF_ID;
+}
+
 static bool reg_type_may_be_null(enum bpf_reg_type type)
 {
        return type == PTR_TO_MAP_VALUE_OR_NULL ||
@@ -6308,8 +6317,25 @@ static int is_branch64_taken(struct bpf_reg_state *reg, u64 val, u8 opcode)
 static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode,
                           bool is_jmp32)
 {
-       if (__is_pointer_value(false, reg))
-               return -1;
+       if (__is_pointer_value(false, reg)) {
+               if (!reg_type_not_null(reg->type))
+                       return -1;
+
+               /* If the pointer is known to be valid, a test against zero
+                * will fail, so we can use this to determine the branch taken.
+                */
+               if (val != 0)
+                       return -1;
+
+               switch (opcode) {
+               case BPF_JEQ:
+                       return 0;
+               case BPF_JNE:
+                       return 1;
+               default:
+                       return -1;
+               }
+       }
 
        if (is_jmp32)
                return is_branch32_taken(reg, val, opcode);
@@ -6808,7 +6834,11 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
        }
 
        if (pred >= 0) {
-               err = mark_chain_precision(env, insn->dst_reg);
+               /* If we get here with a dst_reg pointer type it is because
+                * above is_branch_taken() special cased the 0 comparison.
+                */
+               if (!__is_pointer_value(false, dst_reg))
+                       err = mark_chain_precision(env, insn->dst_reg);
                if (BPF_SRC(insn->code) == BPF_X && !err)
                        err = mark_chain_precision(env, insn->src_reg);
                if (err)
@@ -7094,7 +7124,11 @@ static int check_return_code(struct bpf_verifier_env *env)
        switch (env->prog->type) {
        case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
                if (env->prog->expected_attach_type == BPF_CGROUP_UDP4_RECVMSG ||
-                   env->prog->expected_attach_type == BPF_CGROUP_UDP6_RECVMSG)
+                   env->prog->expected_attach_type == BPF_CGROUP_UDP6_RECVMSG ||
+                   env->prog->expected_attach_type == BPF_CGROUP_INET4_GETPEERNAME ||
+                   env->prog->expected_attach_type == BPF_CGROUP_INET6_GETPEERNAME ||
+                   env->prog->expected_attach_type == BPF_CGROUP_INET4_GETSOCKNAME ||
+                   env->prog->expected_attach_type == BPF_CGROUP_INET6_GETSOCKNAME)
                        range = tnum_range(1, 1);
                break;
        case BPF_PROG_TYPE_CGROUP_SKB:
@@ -7120,10 +7154,11 @@ static int check_return_code(struct bpf_verifier_env *env)
                case BPF_TRACE_FEXIT:
                        range = tnum_const(0);
                        break;
-               case BPF_TRACE_ITER:
                case BPF_TRACE_RAW_TP:
                case BPF_MODIFY_RETURN:
                        return 0;
+               case BPF_TRACE_ITER:
+                       break;
                default:
                        return -ENOTSUPP;
                }
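
The is_branch_taken() change above lets the verifier resolve a comparison of a known non-NULL pointer against zero instead of giving up on it, so a redundant NULL check no longer forces both branches to be explored. A small restricted-C sketch of the effect; the map and program names are illustrative only.

	#include <linux/bpf.h>
	#include <bpf/bpf_helpers.h>

	struct {
		__uint(type, BPF_MAP_TYPE_ARRAY);
		__uint(max_entries, 1);
		__type(key, __u32);
		__type(value, __u64);
	} counters SEC(".maps");

	SEC("xdp")
	int count_packets(struct xdp_md *ctx)
	{
		__u32 key = 0;
		__u64 *val = bpf_map_lookup_elem(&counters, &key);

		if (!val)	/* val may be NULL: both branches are explored */
			return XDP_PASS;
		if (!val)	/* val is now a known non-NULL PTR_TO_MAP_VALUE,
				 * so this branch is statically dead and pruned */
			return XDP_ABORTED;
		__sync_fetch_and_add(val, 1);
		return XDP_PASS;
	}

	char _license[] SEC("license") = "GPL";
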
diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c
deleted file mode 100644 (file)
index 2cc5c8f..0000000
+++ /dev/null
@@ -1,265 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/* XSKMAP used for AF_XDP sockets
- * Copyright(c) 2018 Intel Corporation.
- */
-
-#include <linux/bpf.h>
-#include <linux/capability.h>
-#include <net/xdp_sock.h>
-#include <linux/slab.h>
-#include <linux/sched.h>
-
-int xsk_map_inc(struct xsk_map *map)
-{
-       bpf_map_inc(&map->map);
-       return 0;
-}
-
-void xsk_map_put(struct xsk_map *map)
-{
-       bpf_map_put(&map->map);
-}
-
-static struct xsk_map_node *xsk_map_node_alloc(struct xsk_map *map,
-                                              struct xdp_sock **map_entry)
-{
-       struct xsk_map_node *node;
-       int err;
-
-       node = kzalloc(sizeof(*node), GFP_ATOMIC | __GFP_NOWARN);
-       if (!node)
-               return ERR_PTR(-ENOMEM);
-
-       err = xsk_map_inc(map);
-       if (err) {
-               kfree(node);
-               return ERR_PTR(err);
-       }
-
-       node->map = map;
-       node->map_entry = map_entry;
-       return node;
-}
-
-static void xsk_map_node_free(struct xsk_map_node *node)
-{
-       xsk_map_put(node->map);
-       kfree(node);
-}
-
-static void xsk_map_sock_add(struct xdp_sock *xs, struct xsk_map_node *node)
-{
-       spin_lock_bh(&xs->map_list_lock);
-       list_add_tail(&node->node, &xs->map_list);
-       spin_unlock_bh(&xs->map_list_lock);
-}
-
-static void xsk_map_sock_delete(struct xdp_sock *xs,
-                               struct xdp_sock **map_entry)
-{
-       struct xsk_map_node *n, *tmp;
-
-       spin_lock_bh(&xs->map_list_lock);
-       list_for_each_entry_safe(n, tmp, &xs->map_list, node) {
-               if (map_entry == n->map_entry) {
-                       list_del(&n->node);
-                       xsk_map_node_free(n);
-               }
-       }
-       spin_unlock_bh(&xs->map_list_lock);
-}
-
-static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
-{
-       struct bpf_map_memory mem;
-       int err, numa_node;
-       struct xsk_map *m;
-       u64 size;
-
-       if (!capable(CAP_NET_ADMIN))
-               return ERR_PTR(-EPERM);
-
-       if (attr->max_entries == 0 || attr->key_size != 4 ||
-           attr->value_size != 4 ||
-           attr->map_flags & ~(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY))
-               return ERR_PTR(-EINVAL);
-
-       numa_node = bpf_map_attr_numa_node(attr);
-       size = struct_size(m, xsk_map, attr->max_entries);
-
-       err = bpf_map_charge_init(&mem, size);
-       if (err < 0)
-               return ERR_PTR(err);
-
-       m = bpf_map_area_alloc(size, numa_node);
-       if (!m) {
-               bpf_map_charge_finish(&mem);
-               return ERR_PTR(-ENOMEM);
-       }
-
-       bpf_map_init_from_attr(&m->map, attr);
-       bpf_map_charge_move(&m->map.memory, &mem);
-       spin_lock_init(&m->lock);
-
-       return &m->map;
-}
-
-static void xsk_map_free(struct bpf_map *map)
-{
-       struct xsk_map *m = container_of(map, struct xsk_map, map);
-
-       bpf_clear_redirect_map(map);
-       synchronize_net();
-       bpf_map_area_free(m);
-}
-
-static int xsk_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
-{
-       struct xsk_map *m = container_of(map, struct xsk_map, map);
-       u32 index = key ? *(u32 *)key : U32_MAX;
-       u32 *next = next_key;
-
-       if (index >= m->map.max_entries) {
-               *next = 0;
-               return 0;
-       }
-
-       if (index == m->map.max_entries - 1)
-               return -ENOENT;
-       *next = index + 1;
-       return 0;
-}
-
-static u32 xsk_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
-{
-       const int ret = BPF_REG_0, mp = BPF_REG_1, index = BPF_REG_2;
-       struct bpf_insn *insn = insn_buf;
-
-       *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0);
-       *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5);
-       *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(sizeof(struct xsk_sock *)));
-       *insn++ = BPF_ALU64_IMM(BPF_ADD, mp, offsetof(struct xsk_map, xsk_map));
-       *insn++ = BPF_ALU64_REG(BPF_ADD, ret, mp);
-       *insn++ = BPF_LDX_MEM(BPF_SIZEOF(struct xsk_sock *), ret, ret, 0);
-       *insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
-       *insn++ = BPF_MOV64_IMM(ret, 0);
-       return insn - insn_buf;
-}
-
-static void *xsk_map_lookup_elem(struct bpf_map *map, void *key)
-{
-       WARN_ON_ONCE(!rcu_read_lock_held());
-       return __xsk_map_lookup_elem(map, *(u32 *)key);
-}
-
-static void *xsk_map_lookup_elem_sys_only(struct bpf_map *map, void *key)
-{
-       return ERR_PTR(-EOPNOTSUPP);
-}
-
-static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value,
-                              u64 map_flags)
-{
-       struct xsk_map *m = container_of(map, struct xsk_map, map);
-       struct xdp_sock *xs, *old_xs, **map_entry;
-       u32 i = *(u32 *)key, fd = *(u32 *)value;
-       struct xsk_map_node *node;
-       struct socket *sock;
-       int err;
-
-       if (unlikely(map_flags > BPF_EXIST))
-               return -EINVAL;
-       if (unlikely(i >= m->map.max_entries))
-               return -E2BIG;
-
-       sock = sockfd_lookup(fd, &err);
-       if (!sock)
-               return err;
-
-       if (sock->sk->sk_family != PF_XDP) {
-               sockfd_put(sock);
-               return -EOPNOTSUPP;
-       }
-
-       xs = (struct xdp_sock *)sock->sk;
-
-       if (!xsk_is_setup_for_bpf_map(xs)) {
-               sockfd_put(sock);
-               return -EOPNOTSUPP;
-       }
-
-       map_entry = &m->xsk_map[i];
-       node = xsk_map_node_alloc(m, map_entry);
-       if (IS_ERR(node)) {
-               sockfd_put(sock);
-               return PTR_ERR(node);
-       }
-
-       spin_lock_bh(&m->lock);
-       old_xs = READ_ONCE(*map_entry);
-       if (old_xs == xs) {
-               err = 0;
-               goto out;
-       } else if (old_xs && map_flags == BPF_NOEXIST) {
-               err = -EEXIST;
-               goto out;
-       } else if (!old_xs && map_flags == BPF_EXIST) {
-               err = -ENOENT;
-               goto out;
-       }
-       xsk_map_sock_add(xs, node);
-       WRITE_ONCE(*map_entry, xs);
-       if (old_xs)
-               xsk_map_sock_delete(old_xs, map_entry);
-       spin_unlock_bh(&m->lock);
-       sockfd_put(sock);
-       return 0;
-
-out:
-       spin_unlock_bh(&m->lock);
-       sockfd_put(sock);
-       xsk_map_node_free(node);
-       return err;
-}
-
-static int xsk_map_delete_elem(struct bpf_map *map, void *key)
-{
-       struct xsk_map *m = container_of(map, struct xsk_map, map);
-       struct xdp_sock *old_xs, **map_entry;
-       int k = *(u32 *)key;
-
-       if (k >= map->max_entries)
-               return -EINVAL;
-
-       spin_lock_bh(&m->lock);
-       map_entry = &m->xsk_map[k];
-       old_xs = xchg(map_entry, NULL);
-       if (old_xs)
-               xsk_map_sock_delete(old_xs, map_entry);
-       spin_unlock_bh(&m->lock);
-
-       return 0;
-}
-
-void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs,
-                            struct xdp_sock **map_entry)
-{
-       spin_lock_bh(&map->lock);
-       if (READ_ONCE(*map_entry) == xs) {
-               WRITE_ONCE(*map_entry, NULL);
-               xsk_map_sock_delete(xs, map_entry);
-       }
-       spin_unlock_bh(&map->lock);
-}
-
-const struct bpf_map_ops xsk_map_ops = {
-       .map_alloc = xsk_map_alloc,
-       .map_free = xsk_map_free,
-       .map_get_next_key = xsk_map_get_next_key,
-       .map_lookup_elem = xsk_map_lookup_elem,
-       .map_gen_lookup = xsk_map_gen_lookup,
-       .map_lookup_elem_sys_only = xsk_map_lookup_elem_sys_only,
-       .map_update_elem = xsk_map_update_elem,
-       .map_delete_elem = xsk_map_delete_elem,
-       .map_check_btf = map_check_no_btf,
-};
index 30ba7d38941d91fb12147ff20e7010c0aba549d9..bfd4ccd80847de61b44462df7b70f46dfb382acd 100644 (file)
@@ -160,16 +160,20 @@ static void *bpf_test_init(const union bpf_attr *kattr, u32 size,
                           u32 headroom, u32 tailroom)
 {
        void __user *data_in = u64_to_user_ptr(kattr->test.data_in);
+       u32 user_size = kattr->test.data_size_in;
        void *data;
 
        if (size < ETH_HLEN || size > PAGE_SIZE - headroom - tailroom)
                return ERR_PTR(-EINVAL);
 
+       if (user_size > size)
+               return ERR_PTR(-EMSGSIZE);
+
        data = kzalloc(size + headroom + tailroom, GFP_USER);
        if (!data)
                return ERR_PTR(-ENOMEM);
 
-       if (copy_from_user(data + headroom, data_in, size)) {
+       if (copy_from_user(data + headroom, data_in, user_size)) {
                kfree(data);
                return ERR_PTR(-EFAULT);
        }
@@ -486,8 +490,6 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr,
 
        /* XDP have extra tailroom as (most) drivers use full page */
        max_data_sz = 4096 - headroom - tailroom;
-       if (size > max_data_sz)
-               return -EINVAL;
 
        data = bpf_test_init(kattr, max_data_sz, headroom, tailroom);
        if (IS_ERR(data))
index d7bc09de4c13901ef7dc72ef6eb507ae4e5c8766..528d767eb026fc85c33b24243bd41cb226176d96 100644 (file)
@@ -37,6 +37,26 @@ static struct br_mrp *br_mrp_find_id(struct net_bridge *br, u32 ring_id)
        return res;
 }
 
+static bool br_mrp_unique_ifindex(struct net_bridge *br, u32 ifindex)
+{
+       struct br_mrp *mrp;
+
+       list_for_each_entry_rcu(mrp, &br->mrp_list, list,
+                               lockdep_rtnl_is_held()) {
+               struct net_bridge_port *p;
+
+               p = rtnl_dereference(mrp->p_port);
+               if (p && p->dev->ifindex == ifindex)
+                       return false;
+
+               p = rtnl_dereference(mrp->s_port);
+               if (p && p->dev->ifindex == ifindex)
+                       return false;
+       }
+
+       return true;
+}
+
 static struct br_mrp *br_mrp_find_port(struct net_bridge *br,
                                       struct net_bridge_port *p)
 {
@@ -203,6 +223,7 @@ out:
 static void br_mrp_del_impl(struct net_bridge *br, struct br_mrp *mrp)
 {
        struct net_bridge_port *p;
+       u8 state;
 
        /* Stop sending MRP_Test frames */
        cancel_delayed_work_sync(&mrp->test_work);
@@ -214,20 +235,24 @@ static void br_mrp_del_impl(struct net_bridge *br, struct br_mrp *mrp)
        p = rtnl_dereference(mrp->p_port);
        if (p) {
                spin_lock_bh(&br->lock);
-               p->state = BR_STATE_FORWARDING;
+               state = netif_running(br->dev) ?
+                               BR_STATE_FORWARDING : BR_STATE_DISABLED;
+               p->state = state;
                p->flags &= ~BR_MRP_AWARE;
                spin_unlock_bh(&br->lock);
-               br_mrp_port_switchdev_set_state(p, BR_STATE_FORWARDING);
+               br_mrp_port_switchdev_set_state(p, state);
                rcu_assign_pointer(mrp->p_port, NULL);
        }
 
        p = rtnl_dereference(mrp->s_port);
        if (p) {
                spin_lock_bh(&br->lock);
-               p->state = BR_STATE_FORWARDING;
+               state = netif_running(br->dev) ?
+                               BR_STATE_FORWARDING : BR_STATE_DISABLED;
+               p->state = state;
                p->flags &= ~BR_MRP_AWARE;
                spin_unlock_bh(&br->lock);
-               br_mrp_port_switchdev_set_state(p, BR_STATE_FORWARDING);
+               br_mrp_port_switchdev_set_state(p, state);
                rcu_assign_pointer(mrp->s_port, NULL);
        }
 
@@ -255,6 +280,11 @@ int br_mrp_add(struct net_bridge *br, struct br_mrp_instance *instance)
            !br_mrp_get_port(br, instance->s_ifindex))
                return -EINVAL;
 
+       /* It is not possible to have the same port part of multiple rings */
+       if (!br_mrp_unique_ifindex(br, instance->p_ifindex) ||
+           !br_mrp_unique_ifindex(br, instance->s_ifindex))
+               return -EINVAL;
+
        mrp = kzalloc(sizeof(*mrp), GFP_KERNEL);
        if (!mrp)
                return -ENOMEM;
index 822d662f97ef93482d2708f9623d30400909d523..bd2853d23b505e34cc8aeb61b9500cfdd590aae2 100644 (file)
@@ -7049,6 +7049,8 @@ static bool sock_addr_is_valid_access(int off, int size,
                switch (prog->expected_attach_type) {
                case BPF_CGROUP_INET4_BIND:
                case BPF_CGROUP_INET4_CONNECT:
+               case BPF_CGROUP_INET4_GETPEERNAME:
+               case BPF_CGROUP_INET4_GETSOCKNAME:
                case BPF_CGROUP_UDP4_SENDMSG:
                case BPF_CGROUP_UDP4_RECVMSG:
                        break;
@@ -7060,6 +7062,8 @@ static bool sock_addr_is_valid_access(int off, int size,
                switch (prog->expected_attach_type) {
                case BPF_CGROUP_INET6_BIND:
                case BPF_CGROUP_INET6_CONNECT:
+               case BPF_CGROUP_INET6_GETPEERNAME:
+               case BPF_CGROUP_INET6_GETSOCKNAME:
                case BPF_CGROUP_UDP6_SENDMSG:
                case BPF_CGROUP_UDP6_RECVMSG:
                        break;
index e951b743bed3156ed104ab39e397c1780dce2a83..e64941c526b1e754546b990db0cbd8d953574e01 100644 (file)
@@ -8,6 +8,7 @@
 struct flow_rule *flow_rule_alloc(unsigned int num_actions)
 {
        struct flow_rule *rule;
+       int i;
 
        rule = kzalloc(struct_size(rule, action.entries, num_actions),
                       GFP_KERNEL);
@@ -15,6 +16,11 @@ struct flow_rule *flow_rule_alloc(unsigned int num_actions)
                return NULL;
 
        rule->action.num_entries = num_actions;
+       /* Pre-fill each action hw_stats with DONT_CARE.
+        * Caller can override this if it wants stats for a given action.
+        */
+       for (i = 0; i < num_actions; i++)
+               rule->action.entries[i].hw_stats = FLOW_ACTION_HW_STATS_DONT_CARE;
 
        return rule;
 }
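
With every action pre-initialised to FLOW_ACTION_HW_STATS_DONT_CARE, a driver validating an offload request only needs to enforce its counter capabilities for actions whose callers explicitly asked for statistics. A minimal sketch of that driver-side check, using the struct flow_rule / flow_action_entry fields visible in the hunk above; the "no hardware counters at all" policy is purely illustrative:

    /* Hedged sketch: driver-side handling of the DONT_CARE default. This
     * illustrative driver implements no hardware counters, so it accepts a
     * rule only if no action explicitly requested stats.
     */
    #include <net/flow_offload.h>

    static bool my_drv_hw_stats_ok(const struct flow_rule *rule)
    {
            unsigned int i;

            for (i = 0; i < rule->action.num_entries; i++) {
                    /* DONT_CARE: the caller never asked for stats here. */
                    if (rule->action.entries[i].hw_stats !=
                        FLOW_ACTION_HW_STATS_DONT_CARE)
                            return false;
            }

            return true;
    }
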
index b607ea602774b64535db1a11712694c05bd72019..37e4dba624601f957f89ff9848e05b56795569bd 100644 (file)
@@ -1771,6 +1771,7 @@ static struct neigh_table *neigh_find_table(int family)
 }
 
 const struct nla_policy nda_policy[NDA_MAX+1] = {
+       [NDA_UNSPEC]            = { .strict_start_type = NDA_NH_ID },
        [NDA_DST]               = { .type = NLA_BINARY, .len = MAX_ADDR_LEN },
        [NDA_LLADDR]            = { .type = NLA_BINARY, .len = MAX_ADDR_LEN },
        [NDA_CACHEINFO]         = { .len = sizeof(struct nda_cacheinfo) },
@@ -1781,6 +1782,7 @@ const struct nla_policy nda_policy[NDA_MAX+1] = {
        [NDA_IFINDEX]           = { .type = NLA_U32 },
        [NDA_MASTER]            = { .type = NLA_U32 },
        [NDA_PROTOCOL]          = { .type = NLA_U8 },
+       [NDA_NH_ID]             = { .type = NLA_U32 },
 };
 
 static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh,
index 490b8f5fa8ee2248e91fdf6b8c92e6870a6e29f0..90f44f38211585e14c4d04ecfd645fff98ae507e 100644 (file)
@@ -17,6 +17,7 @@
 #include <net/xdp.h>
 #include <net/xdp_priv.h> /* struct xdp_mem_allocator */
 #include <trace/events/xdp.h>
+#include <net/xdp_sock_drv.h>
 
 #define REG_STATE_NEW          0x0
 #define REG_STATE_REGISTERED   0x1
@@ -109,27 +110,6 @@ static void mem_allocator_disconnect(void *allocator)
        mutex_unlock(&mem_id_lock);
 }
 
-static void mem_id_disconnect(int id)
-{
-       struct xdp_mem_allocator *xa;
-
-       mutex_lock(&mem_id_lock);
-
-       xa = rhashtable_lookup_fast(mem_id_ht, &id, mem_id_rht_params);
-       if (!xa) {
-               mutex_unlock(&mem_id_lock);
-               WARN(1, "Request remove non-existing id(%d), driver bug?", id);
-               return;
-       }
-
-       trace_mem_disconnect(xa);
-
-       if (!rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params))
-               call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free);
-
-       mutex_unlock(&mem_id_lock);
-}
-
 void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq)
 {
        struct xdp_mem_allocator *xa;
@@ -143,9 +123,6 @@ void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq)
        if (id == 0)
                return;
 
-       if (xdp_rxq->mem.type == MEM_TYPE_ZERO_COPY)
-               return mem_id_disconnect(id);
-
        if (xdp_rxq->mem.type == MEM_TYPE_PAGE_POOL) {
                rcu_read_lock();
                xa = rhashtable_lookup(mem_id_ht, &id, mem_id_rht_params);
@@ -301,7 +278,7 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq,
        xdp_rxq->mem.type = type;
 
        if (!allocator) {
-               if (type == MEM_TYPE_PAGE_POOL || type == MEM_TYPE_ZERO_COPY)
+               if (type == MEM_TYPE_PAGE_POOL)
                        return -EINVAL; /* Setup time check page_pool req */
                return 0;
        }
@@ -358,10 +335,11 @@ EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model);
  * scenarios (e.g. queue full), it is possible to return the xdp_frame
  * while still leveraging this protection.  The @napi_direct boolean
  * is used for those calls sites.  Thus, allowing for faster recycling
- * of xdp_frames/pages in those cases.
+ * of xdp_frames/pages in those cases. This path is never used by the
+ * MEM_TYPE_XSK_BUFF_POOL memory type, so it's explicitly not part of
+ * the switch-statement.
  */
-static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct,
-                        unsigned long handle)
+static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct)
 {
        struct xdp_mem_allocator *xa;
        struct page *page;
@@ -383,36 +361,29 @@ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct,
                page = virt_to_page(data); /* Assumes order0 page*/
                put_page(page);
                break;
-       case MEM_TYPE_ZERO_COPY:
-               /* NB! Only valid from an xdp_buff! */
-               rcu_read_lock();
-               /* mem->id is valid, checked in xdp_rxq_info_reg_mem_model() */
-               xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
-               xa->zc_alloc->free(xa->zc_alloc, handle);
-               rcu_read_unlock();
        default:
                /* Not possible, checked in xdp_rxq_info_reg_mem_model() */
+               WARN(1, "Incorrect XDP memory type (%d) usage", mem->type);
                break;
        }
 }
 
 void xdp_return_frame(struct xdp_frame *xdpf)
 {
-       __xdp_return(xdpf->data, &xdpf->mem, false, 0);
+       __xdp_return(xdpf->data, &xdpf->mem, false);
 }
 EXPORT_SYMBOL_GPL(xdp_return_frame);
 
 void xdp_return_frame_rx_napi(struct xdp_frame *xdpf)
 {
-       __xdp_return(xdpf->data, &xdpf->mem, true, 0);
+       __xdp_return(xdpf->data, &xdpf->mem, true);
 }
 EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi);
 
 void xdp_return_buff(struct xdp_buff *xdp)
 {
-       __xdp_return(xdp->data, &xdp->rxq->mem, true, xdp->handle);
+       __xdp_return(xdp->data, &xdp->rxq->mem, true);
 }
-EXPORT_SYMBOL_GPL(xdp_return_buff);
 
 /* Only called for MEM_TYPE_PAGE_POOL see xdp.h */
 void __xdp_release_frame(void *data, struct xdp_mem_info *mem)
@@ -493,7 +464,7 @@ struct xdp_frame *xdp_convert_zc_to_xdp_frame(struct xdp_buff *xdp)
        xdpf->metasize = metasize;
        xdpf->mem.type = MEM_TYPE_PAGE_ORDER0;
 
-       xdp_return_buff(xdp);
+       xsk_buff_free(xdp);
        return xdpf;
 }
 EXPORT_SYMBOL_GPL(xdp_convert_zc_to_xdp_frame);
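
These xdp.c hunks retire MEM_TYPE_ZERO_COPY in favour of MEM_TYPE_XSK_BUFF_POOL: the buffer pool now owns zero-copy frames, so __xdp_return() no longer needs a umem handle. On the driver side the new model registers the memory type with a NULL allocator and takes/returns frames through xsk_buff_alloc()/xsk_buff_free(). A minimal sketch under those assumptions; struct my_ring and its fields are made up for illustration:

    /* Hedged sketch: driver-side setup for MEM_TYPE_XSK_BUFF_POOL. */
    #include <net/xdp.h>
    #include <net/xdp_sock_drv.h>

    struct my_ring {
            struct xdp_rxq_info xdp_rxq;
            struct xdp_buff *rx_xdp;
    };

    static int my_ring_enable_xsk(struct my_ring *ring, struct xdp_umem *umem)
    {
            int err;

            err = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq,
                                             MEM_TYPE_XSK_BUFF_POOL, NULL);
            if (err)
                    return err;

            /* RX slots are now filled with pool-managed xdp_buffs. */
            ring->rx_xdp = xsk_buff_alloc(umem);
            if (!ring->rx_xdp) {
                    xdp_rxq_info_unreg_mem_model(&ring->xdp_rxq);
                    return -ENOMEM;
            }

            return 0;
    }

    /* On drop/error paths the buffer goes back to the pool:
     *     xsk_buff_free(ring->rx_xdp);
     */
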
index 3aa4975919d7c86cab8ae02b082224b6cace1c2a..9ef54cdcf662773241a057c844549469ab9a64dd 100644 (file)
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0-only
 
-#include <net/xdp_sock.h>
+#include <net/xdp_sock_drv.h>
 
 #include "netlink.h"
 #include "common.h"
index eeb1137a3f230a8e420518d3d18c2d83c651d09b..31e0b4e88a9d9fde48230542490b47b253ea7e98 100644 (file)
@@ -24,7 +24,7 @@
 #include <linux/sched/signal.h>
 #include <linux/net.h>
 #include <net/devlink.h>
-#include <net/xdp_sock.h>
+#include <net/xdp_sock_drv.h>
 #include <net/flow_offload.h>
 #include <linux/ethtool_netlink.h>
 #include <generated/utsrelease.h>
index c35a8b2e0499e3f70d531246065aa5555bf23a0e..02aa5cb3a4fd15fa219831283dea2992b053c6a8 100644 (file)
@@ -756,12 +756,11 @@ do_err:
 }
 EXPORT_SYMBOL(inet_accept);
 
-
 /*
  *     This does both peername and sockname.
  */
 int inet_getname(struct socket *sock, struct sockaddr *uaddr,
-                       int peer)
+                int peer)
 {
        struct sock *sk         = sock->sk;
        struct inet_sock *inet  = inet_sk(sk);
@@ -782,6 +781,11 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr,
                sin->sin_port = inet->inet_sport;
                sin->sin_addr.s_addr = addr;
        }
+       if (cgroup_bpf_enabled)
+               BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin,
+                                           peer ? BPF_CGROUP_INET4_GETPEERNAME :
+                                                  BPF_CGROUP_INET4_GETSOCKNAME,
+                                           NULL);
        memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
        return sizeof(*sin);
 }
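
inet_getname() now runs any attached BPF_CGROUP_INET4_GETSOCKNAME / GETPEERNAME program on the address it is about to report, mirroring the existing bind/connect hooks; note the hunk does not check the program's verdict, so a program can rewrite the reported address but not fail the call. A minimal sketch of such a program, assuming the libbpf "cgroup/getsockname4" section convention and the struct bpf_sock_addr context fields; the concrete address and port are placeholders:

    /* Hedged sketch: rewriting the address reported by getsockname() for
     * sockets in a cgroup, e.g. to hide an internal translation hop.
     */
    #include <linux/bpf.h>
    #include <bpf/bpf_helpers.h>
    #include <bpf/bpf_endian.h>

    SEC("cgroup/getsockname4")
    int rewrite_getsockname4(struct bpf_sock_addr *ctx)
    {
            ctx->user_ip4 = bpf_htonl(0xc0a80001);  /* 192.168.0.1 */
            ctx->user_port = bpf_htons(8080);
            return 1;   /* verdict is not checked by this hook; 1 by convention */
    }

    char _license[] SEC("license") = "GPL";
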
index 3957364d556cd73d158af2cc25dee2545d4491f9..c337e73e02dddb28fd743a57b9eb33fc23fc968c 100644 (file)
@@ -33,8 +33,20 @@ static const struct nla_policy rtm_nh_policy[NHA_MAX + 1] = {
        [NHA_ENCAP]             = { .type = NLA_NESTED },
        [NHA_GROUPS]            = { .type = NLA_FLAG },
        [NHA_MASTER]            = { .type = NLA_U32 },
+       [NHA_FDB]               = { .type = NLA_FLAG },
 };
 
+static int call_nexthop_notifiers(struct net *net,
+                                 enum fib_event_type event_type,
+                                 struct nexthop *nh)
+{
+       int err;
+
+       err = atomic_notifier_call_chain(&net->nexthop.notifier_chain,
+                                        event_type, nh);
+       return notifier_to_errno(err);
+}
+
 static unsigned int nh_dev_hashfn(unsigned int val)
 {
        unsigned int mask = NH_DEV_HASHSIZE - 1;
@@ -107,6 +119,7 @@ static struct nexthop *nexthop_alloc(void)
                INIT_LIST_HEAD(&nh->fi_list);
                INIT_LIST_HEAD(&nh->f6i_list);
                INIT_LIST_HEAD(&nh->grp_list);
+               INIT_LIST_HEAD(&nh->fdb_list);
        }
        return nh;
 }
@@ -227,6 +240,9 @@ static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh,
        if (nla_put_u32(skb, NHA_ID, nh->id))
                goto nla_put_failure;
 
+       if (nh->is_fdb_nh && nla_put_flag(skb, NHA_FDB))
+               goto nla_put_failure;
+
        if (nh->is_group) {
                struct nh_group *nhg = rtnl_dereference(nh->nh_grp);
 
@@ -241,7 +257,7 @@ static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh,
                if (nla_put_flag(skb, NHA_BLACKHOLE))
                        goto nla_put_failure;
                goto out;
-       } else {
+       } else if (!nh->is_fdb_nh) {
                const struct net_device *dev;
 
                dev = nhi->fib_nhc.nhc_dev;
@@ -387,12 +403,35 @@ static bool valid_group_nh(struct nexthop *nh, unsigned int npaths,
        return true;
 }
 
+static int nh_check_attr_fdb_group(struct nexthop *nh, u8 *nh_family,
+                                  struct netlink_ext_ack *extack)
+{
+       struct nh_info *nhi;
+
+       if (!nh->is_fdb_nh) {
+               NL_SET_ERR_MSG(extack, "FDB nexthop group can only have fdb nexthops");
+               return -EINVAL;
+       }
+
+       nhi = rtnl_dereference(nh->nh_info);
+       if (*nh_family == AF_UNSPEC) {
+               *nh_family = nhi->family;
+       } else if (*nh_family != nhi->family) {
+               NL_SET_ERR_MSG(extack, "FDB nexthop group cannot have mixed family nexthops");
+               return -EINVAL;
+       }
+
+       return 0;
+}
+
 static int nh_check_attr_group(struct net *net, struct nlattr *tb[],
                               struct netlink_ext_ack *extack)
 {
        unsigned int len = nla_len(tb[NHA_GROUP]);
+       u8 nh_family = AF_UNSPEC;
        struct nexthop_grp *nhg;
        unsigned int i, j;
+       u8 nhg_fdb = 0;
 
        if (len & (sizeof(struct nexthop_grp) - 1)) {
                NL_SET_ERR_MSG(extack,
@@ -421,6 +460,8 @@ static int nh_check_attr_group(struct net *net, struct nlattr *tb[],
                }
        }
 
+       if (tb[NHA_FDB])
+               nhg_fdb = 1;
        nhg = nla_data(tb[NHA_GROUP]);
        for (i = 0; i < len; ++i) {
                struct nexthop *nh;
@@ -432,11 +473,20 @@ static int nh_check_attr_group(struct net *net, struct nlattr *tb[],
                }
                if (!valid_group_nh(nh, len, extack))
                        return -EINVAL;
+
+               if (nhg_fdb && nh_check_attr_fdb_group(nh, &nh_family, extack))
+                       return -EINVAL;
+
+               if (!nhg_fdb && nh->is_fdb_nh) {
+                       NL_SET_ERR_MSG(extack, "Non FDB nexthop group cannot have fdb nexthops");
+                       return -EINVAL;
+               }
        }
        for (i = NHA_GROUP + 1; i < __NHA_MAX; ++i) {
                if (!tb[i])
                        continue;
-
+               if (tb[NHA_FDB])
+                       continue;
                NL_SET_ERR_MSG(extack,
                               "No other attributes can be set in nexthop groups");
                return -EINVAL;
@@ -495,6 +545,9 @@ struct nexthop *nexthop_select_path(struct nexthop *nh, int hash)
                if (hash > atomic_read(&nhge->upper_bound))
                        continue;
 
+               if (nhge->nh->is_fdb_nh)
+                       return nhge->nh;
+
                /* nexthops always check if it is good and does
                 * not rely on a sysctl for this behavior
                 */
@@ -564,6 +617,11 @@ int fib6_check_nexthop(struct nexthop *nh, struct fib6_config *cfg,
 {
        struct nh_info *nhi;
 
+       if (nh->is_fdb_nh) {
+               NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop");
+               return -EINVAL;
+       }
+
        /* fib6_src is unique to a fib6_info and limits the ability to cache
         * routes in fib6_nh within a nexthop that is potentially shared
         * across multiple fib entries. If the config wants to use source
@@ -640,6 +698,12 @@ int fib_check_nexthop(struct nexthop *nh, u8 scope,
 {
        int err = 0;
 
+       if (nh->is_fdb_nh) {
+               NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop");
+               err = -EINVAL;
+               goto out;
+       }
+
        if (nh->is_group) {
                struct nh_group *nhg;
 
@@ -773,6 +837,8 @@ static void __remove_nexthop_fib(struct net *net, struct nexthop *nh)
        bool do_flush = false;
        struct fib_info *fi;
 
+       call_nexthop_notifiers(net, NEXTHOP_EVENT_DEL, nh);
+
        list_for_each_entry(fi, &nh->fi_list, nh_list) {
                fi->fib_flags |= RTNH_F_DEAD;
                do_flush = true;
@@ -1125,6 +1191,9 @@ static struct nexthop *nexthop_create_group(struct net *net,
                nh_group_rebalance(nhg);
        }
 
+       if (cfg->nh_fdb)
+               nh->is_fdb_nh = 1;
+
        rcu_assign_pointer(nh->nh_grp, nhg);
 
        return nh;
@@ -1152,7 +1221,7 @@ static int nh_create_ipv4(struct net *net, struct nexthop *nh,
                .fc_encap = cfg->nh_encap,
                .fc_encap_type = cfg->nh_encap_type,
        };
-       u32 tb_id = l3mdev_fib_table(cfg->dev);
+       u32 tb_id = (cfg->dev ? l3mdev_fib_table(cfg->dev) : RT_TABLE_MAIN);
        int err;
 
        err = fib_nh_init(net, fib_nh, &fib_cfg, 1, extack);
@@ -1161,6 +1230,9 @@ static int nh_create_ipv4(struct net *net, struct nexthop *nh,
                goto out;
        }
 
+       if (nh->is_fdb_nh)
+               goto out;
+
        /* sets nh_dev if successful */
        err = fib_check_nh(net, fib_nh, tb_id, 0, extack);
        if (!err) {
@@ -1186,6 +1258,7 @@ static int nh_create_ipv6(struct net *net,  struct nexthop *nh,
                .fc_flags = cfg->nh_flags,
                .fc_encap = cfg->nh_encap,
                .fc_encap_type = cfg->nh_encap_type,
+               .fc_is_fdb = cfg->nh_fdb,
        };
        int err;
 
@@ -1227,6 +1300,9 @@ static struct nexthop *nexthop_create(struct net *net, struct nh_config *cfg,
        nhi->family = cfg->nh_family;
        nhi->fib_nhc.nhc_scope = RT_SCOPE_LINK;
 
+       if (cfg->nh_fdb)
+               nh->is_fdb_nh = 1;
+
        if (cfg->nh_blackhole) {
                nhi->reject_nh = 1;
                cfg->nh_ifindex = net->loopback_dev->ifindex;
@@ -1248,7 +1324,8 @@ static struct nexthop *nexthop_create(struct net *net, struct nh_config *cfg,
        }
 
        /* add the entry to the device based hash */
-       nexthop_devhash_add(net, nhi);
+       if (!nh->is_fdb_nh)
+               nexthop_devhash_add(net, nhi);
 
        rcu_assign_pointer(nh->nh_info, nhi);
 
@@ -1352,6 +1429,19 @@ static int rtm_to_nh_config(struct net *net, struct sk_buff *skb,
        if (tb[NHA_ID])
                cfg->nh_id = nla_get_u32(tb[NHA_ID]);
 
+       if (tb[NHA_FDB]) {
+               if (tb[NHA_OIF] || tb[NHA_BLACKHOLE] ||
+                   tb[NHA_ENCAP]   || tb[NHA_ENCAP_TYPE]) {
+                       NL_SET_ERR_MSG(extack, "Fdb attribute can not be used with encap, oif or blackhole");
+                       goto out;
+               }
+               if (nhm->nh_flags) {
+                       NL_SET_ERR_MSG(extack, "Unsupported nexthop flags in ancillary header");
+                       goto out;
+               }
+               cfg->nh_fdb = nla_get_flag(tb[NHA_FDB]);
+       }
+
        if (tb[NHA_GROUP]) {
                if (nhm->nh_family != AF_UNSPEC) {
                        NL_SET_ERR_MSG(extack, "Invalid family for group");
@@ -1375,8 +1465,8 @@ static int rtm_to_nh_config(struct net *net, struct sk_buff *skb,
 
        if (tb[NHA_BLACKHOLE]) {
                if (tb[NHA_GATEWAY] || tb[NHA_OIF] ||
-                   tb[NHA_ENCAP]   || tb[NHA_ENCAP_TYPE]) {
-                       NL_SET_ERR_MSG(extack, "Blackhole attribute can not be used with gateway or oif");
+                   tb[NHA_ENCAP]   || tb[NHA_ENCAP_TYPE] || tb[NHA_FDB]) {
+                       NL_SET_ERR_MSG(extack, "Blackhole attribute can not be used with gateway, oif, encap or fdb");
                        goto out;
                }
 
@@ -1385,26 +1475,28 @@ static int rtm_to_nh_config(struct net *net, struct sk_buff *skb,
                goto out;
        }
 
-       if (!tb[NHA_OIF]) {
-               NL_SET_ERR_MSG(extack, "Device attribute required for non-blackhole nexthops");
+       if (!cfg->nh_fdb && !tb[NHA_OIF]) {
+               NL_SET_ERR_MSG(extack, "Device attribute required for non-blackhole and non-fdb nexthops");
                goto out;
        }
 
-       cfg->nh_ifindex = nla_get_u32(tb[NHA_OIF]);
-       if (cfg->nh_ifindex)
-               cfg->dev = __dev_get_by_index(net, cfg->nh_ifindex);
+       if (!cfg->nh_fdb && tb[NHA_OIF]) {
+               cfg->nh_ifindex = nla_get_u32(tb[NHA_OIF]);
+               if (cfg->nh_ifindex)
+                       cfg->dev = __dev_get_by_index(net, cfg->nh_ifindex);
 
-       if (!cfg->dev) {
-               NL_SET_ERR_MSG(extack, "Invalid device index");
-               goto out;
-       } else if (!(cfg->dev->flags & IFF_UP)) {
-               NL_SET_ERR_MSG(extack, "Nexthop device is not up");
-               err = -ENETDOWN;
-               goto out;
-       } else if (!netif_carrier_ok(cfg->dev)) {
-               NL_SET_ERR_MSG(extack, "Carrier for nexthop device is down");
-               err = -ENETDOWN;
-               goto out;
+               if (!cfg->dev) {
+                       NL_SET_ERR_MSG(extack, "Invalid device index");
+                       goto out;
+               } else if (!(cfg->dev->flags & IFF_UP)) {
+                       NL_SET_ERR_MSG(extack, "Nexthop device is not up");
+                       err = -ENETDOWN;
+                       goto out;
+               } else if (!netif_carrier_ok(cfg->dev)) {
+                       NL_SET_ERR_MSG(extack, "Carrier for nexthop device is down");
+                       err = -ENETDOWN;
+                       goto out;
+               }
        }
 
        err = -EINVAL;
@@ -1633,7 +1725,7 @@ static bool nh_dump_filtered(struct nexthop *nh, int dev_idx, int master_idx,
 
 static int nh_valid_dump_req(const struct nlmsghdr *nlh, int *dev_idx,
                             int *master_idx, bool *group_filter,
-                            struct netlink_callback *cb)
+                            bool *fdb_filter, struct netlink_callback *cb)
 {
        struct netlink_ext_ack *extack = cb->extack;
        struct nlattr *tb[NHA_MAX + 1];
@@ -1670,6 +1762,9 @@ static int nh_valid_dump_req(const struct nlmsghdr *nlh, int *dev_idx,
                case NHA_GROUPS:
                        *group_filter = true;
                        break;
+               case NHA_FDB:
+                       *fdb_filter = true;
+                       break;
                default:
                        NL_SET_ERR_MSG(extack, "Unsupported attribute in dump request");
                        return -EINVAL;
@@ -1688,17 +1783,17 @@ static int nh_valid_dump_req(const struct nlmsghdr *nlh, int *dev_idx,
 /* rtnl */
 static int rtm_dump_nexthop(struct sk_buff *skb, struct netlink_callback *cb)
 {
+       bool group_filter = false, fdb_filter = false;
        struct nhmsg *nhm = nlmsg_data(cb->nlh);
        int dev_filter_idx = 0, master_idx = 0;
        struct net *net = sock_net(skb->sk);
        struct rb_root *root = &net->nexthop.rb_root;
-       bool group_filter = false;
        struct rb_node *node;
        int idx = 0, s_idx;
        int err;
 
        err = nh_valid_dump_req(cb->nlh, &dev_filter_idx, &master_idx,
-                               &group_filter, cb);
+                               &group_filter, &fdb_filter, cb);
        if (err < 0)
                return err;
 
@@ -1783,6 +1878,19 @@ static struct notifier_block nh_netdev_notifier = {
        .notifier_call = nh_netdev_event,
 };
 
+int register_nexthop_notifier(struct net *net, struct notifier_block *nb)
+{
+       return atomic_notifier_chain_register(&net->nexthop.notifier_chain, nb);
+}
+EXPORT_SYMBOL(register_nexthop_notifier);
+
+int unregister_nexthop_notifier(struct net *net, struct notifier_block *nb)
+{
+       return atomic_notifier_chain_unregister(&net->nexthop.notifier_chain,
+                                               nb);
+}
+EXPORT_SYMBOL(unregister_nexthop_notifier);
+
 static void __net_exit nexthop_net_exit(struct net *net)
 {
        rtnl_lock();
@@ -1799,6 +1907,7 @@ static int __net_init nexthop_net_init(struct net *net)
        net->nexthop.devhash = kzalloc(sz, GFP_KERNEL);
        if (!net->nexthop.devhash)
                return -ENOMEM;
+       ATOMIC_INIT_NOTIFIER_HEAD(&net->nexthop.notifier_chain);
 
        return 0;
 }
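
The new per-netns atomic notifier chain gives drivers a hook for nexthop lifetime events; only NEXTHOP_EVENT_DEL is published at this point, from __remove_nexthop_fib(). A minimal consumer sketch, assuming only the register/unregister helpers exported above and that the notifier payload is the struct nexthop pointer passed by call_nexthop_notifiers(); the handler body and header location are illustrative:

    /* Hedged sketch: a driver listening for nexthop removal so it can drop
     * any offloaded state keyed on nh->id.
     */
    #include <linux/kernel.h>
    #include <linux/notifier.h>
    #include <net/nexthop.h>   /* assumed home of NEXTHOP_EVENT_DEL and helpers */

    static int my_nexthop_event(struct notifier_block *nb, unsigned long event,
                                void *ptr)
    {
            struct nexthop *nh = ptr;

            if (event == NEXTHOP_EVENT_DEL)
                    pr_debug("nexthop %u deleted, flushing offload state\n",
                             nh->id);

            return NOTIFY_DONE;
    }

    static struct notifier_block my_nexthop_nb = {
            .notifier_call = my_nexthop_event,
    };

    /* Per-netns init/exit would pair:
     *     register_nexthop_notifier(net, &my_nexthop_nb);
     *     unregister_nexthop_notifier(net, &my_nexthop_nb);
     */
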
index b69496eaf9226e112dd3cbd41fec736f10863faf..0625a97a8894f270fbfb62d99178dcce66fa58fd 100644 (file)
@@ -505,9 +505,8 @@ EXPORT_SYMBOL_GPL(inet6_destroy_sock);
 /*
  *     This does both peername and sockname.
  */
-
 int inet6_getname(struct socket *sock, struct sockaddr *uaddr,
-                int peer)
+                 int peer)
 {
        struct sockaddr_in6 *sin = (struct sockaddr_in6 *)uaddr;
        struct sock *sk = sock->sk;
@@ -532,9 +531,13 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr,
                        sin->sin6_addr = np->saddr;
                else
                        sin->sin6_addr = sk->sk_v6_rcv_saddr;
-
                sin->sin6_port = inet->inet_sport;
        }
+       if (cgroup_bpf_enabled)
+               BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin,
+                                           peer ? BPF_CGROUP_INET6_GETPEERNAME :
+                                                  BPF_CGROUP_INET6_GETSOCKNAME,
+                                           NULL);
        sin->sin6_scope_id = ipv6_iface_scope_id(&sin->sin6_addr,
                                                 sk->sk_bound_dev_if);
        return sizeof(*sin);
index 4703b09808d0af016ae566a5be5cb490c7179d38..821d96c720b936ae732c388c02977eef213291bb 100644 (file)
@@ -89,6 +89,11 @@ struct ip6_tnl_net {
        struct ip6_tnl __rcu *collect_md_tun;
 };
 
+static inline int ip6_tnl_mpls_supported(void)
+{
+       return IS_ENABLED(CONFIG_MPLS);
+}
+
 static struct net_device_stats *ip6_get_stats(struct net_device *dev)
 {
        struct pcpu_sw_netstats tmp, sum = { 0 };
@@ -718,6 +723,20 @@ ip6ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
        return 0;
 }
 
+static int
+mplsip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+           u8 type, u8 code, int offset, __be32 info)
+{
+       __u32 rel_info = ntohl(info);
+       int err, rel_msg = 0;
+       u8 rel_type = type;
+       u8 rel_code = code;
+
+       err = ip6_tnl_err(skb, IPPROTO_MPLS, opt, &rel_type, &rel_code,
+                         &rel_msg, &rel_info, offset);
+       return err;
+}
+
 static int ip4ip6_dscp_ecn_decapsulate(const struct ip6_tnl *t,
                                       const struct ipv6hdr *ipv6h,
                                       struct sk_buff *skb)
@@ -740,6 +759,14 @@ static int ip6ip6_dscp_ecn_decapsulate(const struct ip6_tnl *t,
        return IP6_ECN_decapsulate(ipv6h, skb);
 }
 
+static inline int mplsip6_dscp_ecn_decapsulate(const struct ip6_tnl *t,
+                                              const struct ipv6hdr *ipv6h,
+                                              struct sk_buff *skb)
+{
+       /* ECN is not supported in AF_MPLS */
+       return 0;
+}
+
 __u32 ip6_tnl_get_cap(struct ip6_tnl *t,
                             const struct in6_addr *laddr,
                             const struct in6_addr *raddr)
@@ -901,6 +928,11 @@ static const struct tnl_ptk_info tpi_v4 = {
        .proto = htons(ETH_P_IP),
 };
 
+static const struct tnl_ptk_info tpi_mpls = {
+       /* no tunnel info required for mplsip6. */
+       .proto = htons(ETH_P_MPLS_UC),
+};
+
 static int ipxip6_rcv(struct sk_buff *skb, u8 ipproto,
                      const struct tnl_ptk_info *tpi,
                      int (*dscp_ecn_decapsulate)(const struct ip6_tnl *t,
@@ -958,6 +990,12 @@ static int ip6ip6_rcv(struct sk_buff *skb)
                          ip6ip6_dscp_ecn_decapsulate);
 }
 
+static int mplsip6_rcv(struct sk_buff *skb)
+{
+       return ipxip6_rcv(skb, IPPROTO_MPLS, &tpi_mpls,
+                         mplsip6_dscp_ecn_decapsulate);
+}
+
 struct ipv6_tel_txoption {
        struct ipv6_txoptions ops;
        __u8 dst_opt[8];
@@ -1232,6 +1270,8 @@ route_lookup:
                ipv6_push_frag_opts(skb, &opt.ops, &proto);
        }
 
+       skb_set_inner_ipproto(skb, proto);
+
        skb_push(skb, sizeof(struct ipv6hdr));
        skb_reset_network_header(skb);
        ipv6h = ipv6_hdr(skb);
@@ -1253,22 +1293,22 @@ tx_err_dst_release:
 EXPORT_SYMBOL(ip6_tnl_xmit);
 
 static inline int
-ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
+ipxip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev,
+               u8 protocol)
 {
        struct ip6_tnl *t = netdev_priv(dev);
+       struct ipv6hdr *ipv6h;
        const struct iphdr  *iph;
        int encap_limit = -1;
+       __u16 offset;
        struct flowi6 fl6;
-       __u8 dsfield;
+       __u8 dsfield, orig_dsfield;
        __u32 mtu;
        u8 tproto;
        int err;
 
-       iph = ip_hdr(skb);
-       memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
-
        tproto = READ_ONCE(t->parms.proto);
-       if (tproto != IPPROTO_IPIP && tproto != 0)
+       if (tproto != protocol && tproto != 0)
                return -1;
 
        if (t->parms.collect_md) {
@@ -1281,129 +1321,100 @@ ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
                        return -1;
                key = &tun_info->key;
                memset(&fl6, 0, sizeof(fl6));
-               fl6.flowi6_proto = IPPROTO_IPIP;
+               fl6.flowi6_proto = protocol;
                fl6.saddr = key->u.ipv6.src;
                fl6.daddr = key->u.ipv6.dst;
                fl6.flowlabel = key->label;
                dsfield =  key->tos;
+               switch (protocol) {
+               case IPPROTO_IPIP:
+                       iph = ip_hdr(skb);
+                       orig_dsfield = ipv4_get_dsfield(iph);
+                       break;
+               case IPPROTO_IPV6:
+                       ipv6h = ipv6_hdr(skb);
+                       orig_dsfield = ipv6_get_dsfield(ipv6h);
+                       break;
+               default:
+                       orig_dsfield = dsfield;
+                       break;
+               }
        } else {
                if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
                        encap_limit = t->parms.encap_limit;
+               if (protocol == IPPROTO_IPV6) {
+                       offset = ip6_tnl_parse_tlv_enc_lim(skb,
+                                               skb_network_header(skb));
+                       /* ip6_tnl_parse_tlv_enc_lim() might have
+                        * reallocated skb->head
+                        */
+                       if (offset > 0) {
+                               struct ipv6_tlv_tnl_enc_lim *tel;
 
-               memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6));
-               fl6.flowi6_proto = IPPROTO_IPIP;
-
-               if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS)
-                       dsfield = ipv4_get_dsfield(iph);
-               else
-                       dsfield = ip6_tclass(t->parms.flowinfo);
-               if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK)
-                       fl6.flowi6_mark = skb->mark;
-               else
-                       fl6.flowi6_mark = t->parms.fwmark;
-       }
-
-       fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL);
-       dsfield = INET_ECN_encapsulate(dsfield, ipv4_get_dsfield(iph));
-
-       if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6))
-               return -1;
-
-       skb_set_inner_ipproto(skb, IPPROTO_IPIP);
-
-       err = ip6_tnl_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu,
-                          IPPROTO_IPIP);
-       if (err != 0) {
-               /* XXX: send ICMP error even if DF is not set. */
-               if (err == -EMSGSIZE)
-                       icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
-                                 htonl(mtu));
-               return -1;
-       }
-
-       return 0;
-}
-
-static inline int
-ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
-{
-       struct ip6_tnl *t = netdev_priv(dev);
-       struct ipv6hdr *ipv6h;
-       int encap_limit = -1;
-       __u16 offset;
-       struct flowi6 fl6;
-       __u8 dsfield;
-       __u32 mtu;
-       u8 tproto;
-       int err;
-
-       ipv6h = ipv6_hdr(skb);
-       tproto = READ_ONCE(t->parms.proto);
-       if ((tproto != IPPROTO_IPV6 && tproto != 0) ||
-           ip6_tnl_addr_conflict(t, ipv6h))
-               return -1;
-
-       if (t->parms.collect_md) {
-               struct ip_tunnel_info *tun_info;
-               const struct ip_tunnel_key *key;
-
-               tun_info = skb_tunnel_info(skb);
-               if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
-                            ip_tunnel_info_af(tun_info) != AF_INET6))
-                       return -1;
-               key = &tun_info->key;
-               memset(&fl6, 0, sizeof(fl6));
-               fl6.flowi6_proto = IPPROTO_IPV6;
-               fl6.saddr = key->u.ipv6.src;
-               fl6.daddr = key->u.ipv6.dst;
-               fl6.flowlabel = key->label;
-               dsfield = key->tos;
-       } else {
-               offset = ip6_tnl_parse_tlv_enc_lim(skb, skb_network_header(skb));
-               /* ip6_tnl_parse_tlv_enc_lim() might have reallocated skb->head */
-               ipv6h = ipv6_hdr(skb);
-               if (offset > 0) {
-                       struct ipv6_tlv_tnl_enc_lim *tel;
-
-                       tel = (void *)&skb_network_header(skb)[offset];
-                       if (tel->encap_limit == 0) {
-                               icmpv6_send(skb, ICMPV6_PARAMPROB,
-                                           ICMPV6_HDR_FIELD, offset + 2);
-                               return -1;
+                               tel = (void *)&skb_network_header(skb)[offset];
+                               if (tel->encap_limit == 0) {
+                                       icmpv6_send(skb, ICMPV6_PARAMPROB,
+                                               ICMPV6_HDR_FIELD, offset + 2);
+                                       return -1;
+                               }
+                               encap_limit = tel->encap_limit - 1;
                        }
-                       encap_limit = tel->encap_limit - 1;
-               } else if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) {
-                       encap_limit = t->parms.encap_limit;
                }
 
                memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6));
-               fl6.flowi6_proto = IPPROTO_IPV6;
+               fl6.flowi6_proto = protocol;
 
-               if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS)
-                       dsfield = ipv6_get_dsfield(ipv6h);
-               else
-                       dsfield = ip6_tclass(t->parms.flowinfo);
-               if (t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL)
-                       fl6.flowlabel |= ip6_flowlabel(ipv6h);
                if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK)
                        fl6.flowi6_mark = skb->mark;
                else
                        fl6.flowi6_mark = t->parms.fwmark;
+               switch (protocol) {
+               case IPPROTO_IPIP:
+                       iph = ip_hdr(skb);
+                       orig_dsfield = ipv4_get_dsfield(iph);
+                       if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS)
+                               dsfield = orig_dsfield;
+                       else
+                               dsfield = ip6_tclass(t->parms.flowinfo);
+                       break;
+               case IPPROTO_IPV6:
+                       ipv6h = ipv6_hdr(skb);
+                       orig_dsfield = ipv6_get_dsfield(ipv6h);
+                       if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS)
+                               dsfield = orig_dsfield;
+                       else
+                               dsfield = ip6_tclass(t->parms.flowinfo);
+                       if (t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL)
+                               fl6.flowlabel |= ip6_flowlabel(ipv6h);
+                       break;
+               default:
+                       orig_dsfield = dsfield = ip6_tclass(t->parms.flowinfo);
+                       break;
+               }
        }
 
        fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL);
-       dsfield = INET_ECN_encapsulate(dsfield, ipv6_get_dsfield(ipv6h));
+       dsfield = INET_ECN_encapsulate(dsfield, orig_dsfield);
 
        if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6))
                return -1;
 
-       skb_set_inner_ipproto(skb, IPPROTO_IPV6);
-
        err = ip6_tnl_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu,
-                          IPPROTO_IPV6);
+                          protocol);
        if (err != 0) {
+               /* XXX: send ICMP error even if DF is not set. */
                if (err == -EMSGSIZE)
-                       icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+                       switch (protocol) {
+                       case IPPROTO_IPIP:
+                               icmp_send(skb, ICMP_DEST_UNREACH,
+                                         ICMP_FRAG_NEEDED, htonl(mtu));
+                               break;
+                       case IPPROTO_IPV6:
+                               icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+                               break;
+                       default:
+                               break;
+                       }
                return -1;
        }
 
@@ -1415,6 +1426,7 @@ ip6_tnl_start_xmit(struct sk_buff *skb, struct net_device *dev)
 {
        struct ip6_tnl *t = netdev_priv(dev);
        struct net_device_stats *stats = &t->dev->stats;
+       u8 ipproto;
        int ret;
 
        if (!pskb_inet_may_pull(skb))
@@ -1422,15 +1434,21 @@ ip6_tnl_start_xmit(struct sk_buff *skb, struct net_device *dev)
 
        switch (skb->protocol) {
        case htons(ETH_P_IP):
-               ret = ip4ip6_tnl_xmit(skb, dev);
+               ipproto = IPPROTO_IPIP;
                break;
        case htons(ETH_P_IPV6):
-               ret = ip6ip6_tnl_xmit(skb, dev);
+               if (ip6_tnl_addr_conflict(t, ipv6_hdr(skb)))
+                       goto tx_err;
+               ipproto = IPPROTO_IPV6;
+               break;
+       case htons(ETH_P_MPLS_UC):
+               ipproto = IPPROTO_MPLS;
                break;
        default:
                goto tx_err;
        }
 
+       ret = ipxip6_tnl_xmit(skb, dev, ipproto);
        if (ret < 0)
                goto tx_err;
 
@@ -2218,6 +2236,12 @@ static struct xfrm6_tunnel ip6ip6_handler __read_mostly = {
        .priority       =       1,
 };
 
+static struct xfrm6_tunnel mplsip6_handler __read_mostly = {
+       .handler        = mplsip6_rcv,
+       .err_handler    = mplsip6_err,
+       .priority       =       1,
+};
+
 static void __net_exit ip6_tnl_destroy_tunnels(struct net *net, struct list_head *list)
 {
        struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);
@@ -2332,6 +2356,15 @@ static int __init ip6_tunnel_init(void)
                pr_err("%s: can't register ip6ip6\n", __func__);
                goto out_ip6ip6;
        }
+
+       if (ip6_tnl_mpls_supported()) {
+               err = xfrm6_tunnel_register(&mplsip6_handler, AF_MPLS);
+               if (err < 0) {
+                       pr_err("%s: can't register mplsip6\n", __func__);
+                       goto out_mplsip6;
+               }
+       }
+
        err = rtnl_link_register(&ip6_link_ops);
        if (err < 0)
                goto rtnl_link_failed;
@@ -2339,6 +2372,9 @@ static int __init ip6_tunnel_init(void)
        return 0;
 
 rtnl_link_failed:
+       if (ip6_tnl_mpls_supported())
+               xfrm6_tunnel_deregister(&mplsip6_handler, AF_MPLS);
+out_mplsip6:
        xfrm6_tunnel_deregister(&ip6ip6_handler, AF_INET6);
 out_ip6ip6:
        xfrm6_tunnel_deregister(&ip4ip6_handler, AF_INET);
@@ -2361,6 +2397,9 @@ static void __exit ip6_tunnel_cleanup(void)
        if (xfrm6_tunnel_deregister(&ip6ip6_handler, AF_INET6))
                pr_info("%s: can't deregister ip6ip6\n", __func__);
 
+       if (ip6_tnl_mpls_supported() &&
+           xfrm6_tunnel_deregister(&mplsip6_handler, AF_MPLS))
+               pr_info("%s: can't deregister mplsip6\n", __func__);
        unregister_pernet_device(&ip6_tnl_net_ops);
 }
 
index a52ec1b86432bed28b6aaf92cb3bd9d4cb85db7e..82cbb46a2a4fe48c328e5c5522d00bb02019335d 100644 (file)
@@ -3421,6 +3421,11 @@ int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
 #ifdef CONFIG_IPV6_ROUTER_PREF
        fib6_nh->last_probe = jiffies;
 #endif
+       if (cfg->fc_is_fdb) {
+               fib6_nh->fib_nh_gw6 = cfg->fc_gateway;
+               fib6_nh->fib_nh_gw_family = AF_INET6;
+               return 0;
+       }
 
        err = -ENODEV;
        if (cfg->fc_ifindex) {
index 21e7b95ddbfafd52083bdff2c8f2d3b3978802fc..06c02ebe6b9b517afddf4540196336b7452d0358 100644 (file)
 
 static struct xfrm6_tunnel __rcu *tunnel6_handlers __read_mostly;
 static struct xfrm6_tunnel __rcu *tunnel46_handlers __read_mostly;
+static struct xfrm6_tunnel __rcu *tunnelmpls6_handlers __read_mostly;
 static DEFINE_MUTEX(tunnel6_mutex);
 
+static inline int xfrm6_tunnel_mpls_supported(void)
+{
+       return IS_ENABLED(CONFIG_MPLS);
+}
+
 int xfrm6_tunnel_register(struct xfrm6_tunnel *handler, unsigned short family)
 {
        struct xfrm6_tunnel __rcu **pprev;
@@ -32,8 +38,21 @@ int xfrm6_tunnel_register(struct xfrm6_tunnel *handler, unsigned short family)
 
        mutex_lock(&tunnel6_mutex);
 
-       for (pprev = (family == AF_INET6) ? &tunnel6_handlers : &tunnel46_handlers;
-            (t = rcu_dereference_protected(*pprev,
+       switch (family) {
+       case AF_INET6:
+               pprev = &tunnel6_handlers;
+               break;
+       case AF_INET:
+               pprev = &tunnel46_handlers;
+               break;
+       case AF_MPLS:
+               pprev = &tunnelmpls6_handlers;
+               break;
+       default:
+               goto err;
+       }
+
+       for (; (t = rcu_dereference_protected(*pprev,
                        lockdep_is_held(&tunnel6_mutex))) != NULL;
             pprev = &t->next) {
                if (t->priority > priority)
@@ -62,8 +81,21 @@ int xfrm6_tunnel_deregister(struct xfrm6_tunnel *handler, unsigned short family)
 
        mutex_lock(&tunnel6_mutex);
 
-       for (pprev = (family == AF_INET6) ? &tunnel6_handlers : &tunnel46_handlers;
-            (t = rcu_dereference_protected(*pprev,
+       switch (family) {
+       case AF_INET6:
+               pprev = &tunnel6_handlers;
+               break;
+       case AF_INET:
+               pprev = &tunnel46_handlers;
+               break;
+       case AF_MPLS:
+               pprev = &tunnelmpls6_handlers;
+               break;
+       default:
+               goto err;
+       }
+
+       for (; (t = rcu_dereference_protected(*pprev,
                        lockdep_is_held(&tunnel6_mutex))) != NULL;
             pprev = &t->next) {
                if (t == handler) {
@@ -73,6 +105,7 @@ int xfrm6_tunnel_deregister(struct xfrm6_tunnel *handler, unsigned short family)
                }
        }
 
+err:
        mutex_unlock(&tunnel6_mutex);
 
        synchronize_net();
@@ -86,6 +119,24 @@ EXPORT_SYMBOL(xfrm6_tunnel_deregister);
             handler != NULL;                           \
             handler = rcu_dereference(handler->next))  \
 
+static int tunnelmpls6_rcv(struct sk_buff *skb)
+{
+       struct xfrm6_tunnel *handler;
+
+       if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
+               goto drop;
+
+       for_each_tunnel_rcu(tunnelmpls6_handlers, handler)
+               if (!handler->handler(skb))
+                       return 0;
+
+       icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0);
+
+drop:
+       kfree_skb(skb);
+       return 0;
+}
+
 static int tunnel6_rcv(struct sk_buff *skb)
 {
        struct xfrm6_tunnel *handler;
@@ -146,6 +197,18 @@ static int tunnel46_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
        return -ENOENT;
 }
 
+static int tunnelmpls6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+                          u8 type, u8 code, int offset, __be32 info)
+{
+       struct xfrm6_tunnel *handler;
+
+       for_each_tunnel_rcu(tunnelmpls6_handlers, handler)
+               if (!handler->err_handler(skb, opt, type, code, offset, info))
+                       return 0;
+
+       return -ENOENT;
+}
+
 static const struct inet6_protocol tunnel6_protocol = {
        .handler        = tunnel6_rcv,
        .err_handler    = tunnel6_err,
@@ -158,6 +221,12 @@ static const struct inet6_protocol tunnel46_protocol = {
        .flags          = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL,
 };
 
+static const struct inet6_protocol tunnelmpls6_protocol = {
+       .handler        = tunnelmpls6_rcv,
+       .err_handler    = tunnelmpls6_err,
+       .flags          = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL,
+};
+
 static int __init tunnel6_init(void)
 {
        if (inet6_add_protocol(&tunnel6_protocol, IPPROTO_IPV6)) {
@@ -169,6 +238,13 @@ static int __init tunnel6_init(void)
                inet6_del_protocol(&tunnel6_protocol, IPPROTO_IPV6);
                return -EAGAIN;
        }
+       if (xfrm6_tunnel_mpls_supported() &&
+           inet6_add_protocol(&tunnelmpls6_protocol, IPPROTO_MPLS)) {
+               pr_err("%s: can't add protocol\n", __func__);
+               inet6_del_protocol(&tunnel6_protocol, IPPROTO_IPV6);
+               inet6_del_protocol(&tunnel46_protocol, IPPROTO_IPIP);
+               return -EAGAIN;
+       }
        return 0;
 }
 
@@ -178,6 +254,9 @@ static void __exit tunnel6_fini(void)
                pr_err("%s: can't remove protocol\n", __func__);
        if (inet6_del_protocol(&tunnel6_protocol, IPPROTO_IPV6))
                pr_err("%s: can't remove protocol\n", __func__);
+       if (xfrm6_tunnel_mpls_supported() &&
+           inet6_del_protocol(&tunnelmpls6_protocol, IPPROTO_MPLS))
+               pr_err("%s: can't remove protocol\n", __func__);
 }
 
 module_init(tunnel6_init);
index a42e4ed5ab0e169eb1ab36ee027380e062d4f18b..fd30ea61336ea1a674c62a9d8a0acd5a78bedb22 100644 (file)
@@ -1593,7 +1593,8 @@ static int mpls_dev_notify(struct notifier_block *this, unsigned long event,
                    dev->type == ARPHRD_IPGRE ||
                    dev->type == ARPHRD_IP6GRE ||
                    dev->type == ARPHRD_SIT ||
-                   dev->type == ARPHRD_TUNNEL) {
+                   dev->type == ARPHRD_TUNNEL ||
+                   dev->type == ARPHRD_TUNNEL6) {
                        mdev = mpls_add_dev(dev);
                        if (IS_ERR(mdev))
                                return notifier_from_errno(PTR_ERR(mdev));
index 34a74043840bf71685402bcec035e91c8c089c1a..a042261a45c5cf8598eb46c763aad5bb5fe722b0 100644 (file)
@@ -209,6 +209,7 @@ void psample_group_put(struct psample_group *group)
 }
 EXPORT_SYMBOL_GPL(psample_group_put);
 
+#ifdef CONFIG_INET
 static int __psample_ip_tun_to_nlattr(struct sk_buff *skb,
                              struct ip_tunnel_info *tun_info)
 {
@@ -352,12 +353,15 @@ static int psample_tunnel_meta_len(struct ip_tunnel_info *tun_info)
 
        return sum;
 }
+#endif
 
 void psample_sample_packet(struct psample_group *group, struct sk_buff *skb,
                           u32 trunc_size, int in_ifindex, int out_ifindex,
                           u32 sample_rate)
 {
+#ifdef CONFIG_INET
        struct ip_tunnel_info *tun_info;
+#endif
        struct sk_buff *nl_skb;
        int data_len;
        int meta_len;
@@ -371,9 +375,11 @@ void psample_sample_packet(struct psample_group *group, struct sk_buff *skb,
                   nla_total_size(sizeof(u32)) +        /* group_num */
                   nla_total_size(sizeof(u32));         /* seq */
 
+#ifdef CONFIG_INET
        tun_info = skb_tunnel_info(skb);
        if (tun_info)
                meta_len += psample_tunnel_meta_len(tun_info);
+#endif
 
        data_len = min(skb->len, trunc_size);
        if (meta_len + nla_total_size(data_len) > PSAMPLE_MAX_PACKET_SIZE)
@@ -429,11 +435,13 @@ void psample_sample_packet(struct psample_group *group, struct sk_buff *skb,
                        goto error;
        }
 
+#ifdef CONFIG_INET
        if (tun_info) {
                ret = psample_ip_tun_to_nlattr(nl_skb, tun_info);
                if (unlikely(ret < 0))
                        goto error;
        }
+#endif
 
        genlmsg_end(nl_skb, data);
        genlmsg_multicast_netns(&psample_nl_family, group->net, nl_skb, 0,
index 71e2bdafb2ced98f8032505ad42376507573a15d..30cdc4315f423b9133f34decf3ea9ce268f66398 100644 (file)
@@ -1,3 +1,4 @@
 # SPDX-License-Identifier: GPL-2.0-only
-obj-$(CONFIG_XDP_SOCKETS) += xsk.o xdp_umem.o xsk_queue.o
+obj-$(CONFIG_XDP_SOCKETS) += xsk.o xdp_umem.o xsk_queue.o xskmap.o
+obj-$(CONFIG_XDP_SOCKETS) += xsk_buff_pool.o
 obj-$(CONFIG_XDP_SOCKETS_DIAG) += xsk_diag.o
index 37ace3bc0d4821a2b8fe970ac4d7405bdb9439df..19e59d1a5e9f79ac408012c60ba8d1158837aac0 100644 (file)
@@ -179,37 +179,6 @@ void xdp_umem_clear_dev(struct xdp_umem *umem)
        umem->zc = false;
 }
 
-static void xdp_umem_unmap_pages(struct xdp_umem *umem)
-{
-       unsigned int i;
-
-       for (i = 0; i < umem->npgs; i++)
-               if (PageHighMem(umem->pgs[i]))
-                       vunmap(umem->pages[i].addr);
-}
-
-static int xdp_umem_map_pages(struct xdp_umem *umem)
-{
-       unsigned int i;
-       void *addr;
-
-       for (i = 0; i < umem->npgs; i++) {
-               if (PageHighMem(umem->pgs[i]))
-                       addr = vmap(&umem->pgs[i], 1, VM_MAP, PAGE_KERNEL);
-               else
-                       addr = page_address(umem->pgs[i]);
-
-               if (!addr) {
-                       xdp_umem_unmap_pages(umem);
-                       return -ENOMEM;
-               }
-
-               umem->pages[i].addr = addr;
-       }
-
-       return 0;
-}
-
 static void xdp_umem_unpin_pages(struct xdp_umem *umem)
 {
        unpin_user_pages_dirty_lock(umem->pgs, umem->npgs, true);
@@ -244,14 +213,9 @@ static void xdp_umem_release(struct xdp_umem *umem)
                umem->cq = NULL;
        }
 
-       xsk_reuseq_destroy(umem);
-
-       xdp_umem_unmap_pages(umem);
+       xp_destroy(umem->pool);
        xdp_umem_unpin_pages(umem);
 
-       kvfree(umem->pages);
-       umem->pages = NULL;
-
        xdp_umem_unaccount_pages(umem);
        kfree(umem);
 }
@@ -385,11 +349,9 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
        if (headroom >= chunk_size - XDP_PACKET_HEADROOM)
                return -EINVAL;
 
-       umem->chunk_mask = unaligned_chunks ? XSK_UNALIGNED_BUF_ADDR_MASK
-                                           : ~((u64)chunk_size - 1);
        umem->size = size;
        umem->headroom = headroom;
-       umem->chunk_size_nohr = chunk_size - headroom;
+       umem->chunk_size = chunk_size;
        umem->npgs = size / PAGE_SIZE;
        umem->pgs = NULL;
        umem->user = NULL;
@@ -407,18 +369,13 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
        if (err)
                goto out_account;
 
-       umem->pages = kvcalloc(umem->npgs, sizeof(*umem->pages),
-                              GFP_KERNEL_ACCOUNT);
-       if (!umem->pages) {
+       umem->pool = xp_create(umem->pgs, umem->npgs, chunks, chunk_size,
+                              headroom, size, unaligned_chunks);
+       if (!umem->pool) {
                err = -ENOMEM;
                goto out_pin;
        }
-
-       err = xdp_umem_map_pages(umem);
-       if (!err)
-               return 0;
-
-       kvfree(umem->pages);
+       return 0;
 
 out_pin:
        xdp_umem_unpin_pages(umem);
index a63a9fb251f50dee8598d158b48331da8fb7dc82..32067fe98f6594eb8def1a73a16c084f31176dba 100644 (file)
@@ -6,7 +6,7 @@
 #ifndef XDP_UMEM_H_
 #define XDP_UMEM_H_
 
-#include <net/xdp_sock.h>
+#include <net/xdp_sock_drv.h>
 
 int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev,
                        u16 queue_id, u16 flags);
index 45ffd67b367d81b8a001cc9426ce2defb785428f..b6c0f08bd80d2e54610a4e3d1a0af4ce9abd6988 100644 (file)
@@ -22,7 +22,7 @@
 #include <linux/net.h>
 #include <linux/netdevice.h>
 #include <linux/rculist.h>
-#include <net/xdp_sock.h>
+#include <net/xdp_sock_drv.h>
 #include <net/xdp.h>
 
 #include "xsk_queue.h"
@@ -39,24 +39,6 @@ bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs)
                READ_ONCE(xs->umem->fq);
 }
 
-bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt)
-{
-       return xskq_cons_has_entries(umem->fq, cnt);
-}
-EXPORT_SYMBOL(xsk_umem_has_addrs);
-
-bool xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr)
-{
-       return xskq_cons_peek_addr(umem->fq, addr, umem);
-}
-EXPORT_SYMBOL(xsk_umem_peek_addr);
-
-void xsk_umem_release_addr(struct xdp_umem *umem)
-{
-       xskq_cons_release(umem->fq);
-}
-EXPORT_SYMBOL(xsk_umem_release_addr);
-
 void xsk_set_rx_need_wakeup(struct xdp_umem *umem)
 {
        if (umem->need_wakeup & XDP_WAKEUP_RX)
@@ -117,76 +99,82 @@ bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem)
 }
 EXPORT_SYMBOL(xsk_umem_uses_need_wakeup);
 
-/* If a buffer crosses a page boundary, we need to do 2 memcpy's, one for
- * each page. This is only required in copy mode.
- */
-static void __xsk_rcv_memcpy(struct xdp_umem *umem, u64 addr, void *from_buf,
-                            u32 len, u32 metalen)
+void xp_release(struct xdp_buff_xsk *xskb)
 {
-       void *to_buf = xdp_umem_get_data(umem, addr);
-
-       addr = xsk_umem_add_offset_to_addr(addr);
-       if (xskq_cons_crosses_non_contig_pg(umem, addr, len + metalen)) {
-               void *next_pg_addr = umem->pages[(addr >> PAGE_SHIFT) + 1].addr;
-               u64 page_start = addr & ~(PAGE_SIZE - 1);
-               u64 first_len = PAGE_SIZE - (addr - page_start);
-
-               memcpy(to_buf, from_buf, first_len);
-               memcpy(next_pg_addr, from_buf + first_len,
-                      len + metalen - first_len);
+       xskb->pool->free_heads[xskb->pool->free_heads_cnt++] = xskb;
+}
 
-               return;
-       }
+static u64 xp_get_handle(struct xdp_buff_xsk *xskb)
+{
+       u64 offset = xskb->xdp.data - xskb->xdp.data_hard_start;
 
-       memcpy(to_buf, from_buf, len + metalen);
+       offset += xskb->pool->headroom;
+       if (!xskb->pool->unaligned)
+               return xskb->orig_addr + offset;
+       return xskb->orig_addr + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT);
 }
 
-static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
+static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
 {
-       u64 offset = xs->umem->headroom;
-       u64 addr, memcpy_addr;
-       void *from_buf;
-       u32 metalen;
+       struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp);
+       u64 addr;
        int err;
 
-       if (!xskq_cons_peek_addr(xs->umem->fq, &addr, xs->umem) ||
-           len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) {
+       addr = xp_get_handle(xskb);
+       err = xskq_prod_reserve_desc(xs->rx, addr, len);
+       if (err) {
                xs->rx_dropped++;
-               return -ENOSPC;
+               return err;
        }
 
-       if (unlikely(xdp_data_meta_unsupported(xdp))) {
-               from_buf = xdp->data;
-               metalen = 0;
-       } else {
-               from_buf = xdp->data_meta;
-               metalen = xdp->data - xdp->data_meta;
-       }
+       xp_release(xskb);
+       return 0;
+}
 
-       memcpy_addr = xsk_umem_adjust_offset(xs->umem, addr, offset);
-       __xsk_rcv_memcpy(xs->umem, memcpy_addr, from_buf, len, metalen);
+static void xsk_copy_xdp(struct xdp_buff *to, struct xdp_buff *from, u32 len)
+{
+       void *from_buf, *to_buf;
+       u32 metalen;
 
-       offset += metalen;
-       addr = xsk_umem_adjust_offset(xs->umem, addr, offset);
-       err = xskq_prod_reserve_desc(xs->rx, addr, len);
-       if (!err) {
-               xskq_cons_release(xs->umem->fq);
-               xdp_return_buff(xdp);
-               return 0;
+       if (unlikely(xdp_data_meta_unsupported(from))) {
+               from_buf = from->data;
+               to_buf = to->data;
+               metalen = 0;
+       } else {
+               from_buf = from->data_meta;
+               metalen = from->data - from->data_meta;
+               to_buf = to->data - metalen;
        }
 
-       xs->rx_dropped++;
-       return err;
+       memcpy(to_buf, from_buf, len + metalen);
 }
 
-static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
+static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len,
+                    bool explicit_free)
 {
-       int err = xskq_prod_reserve_desc(xs->rx, xdp->handle, len);
+       struct xdp_buff *xsk_xdp;
+       int err;
 
-       if (err)
+       if (len > xsk_umem_get_rx_frame_size(xs->umem)) {
                xs->rx_dropped++;
+               return -ENOSPC;
+       }
 
-       return err;
+       xsk_xdp = xsk_buff_alloc(xs->umem);
+       if (!xsk_xdp) {
+               xs->rx_dropped++;
+               return -ENOSPC;
+       }
+
+       xsk_copy_xdp(xsk_xdp, xdp, len);
+       err = __xsk_rcv_zc(xs, xsk_xdp, len);
+       if (err) {
+               xsk_buff_free(xsk_xdp);
+               return err;
+       }
+       if (explicit_free)
+               xdp_return_buff(xdp);
+       return 0;
 }
 
 static bool xsk_is_bound(struct xdp_sock *xs)
@@ -199,7 +187,8 @@ static bool xsk_is_bound(struct xdp_sock *xs)
        return false;
 }
 
-static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
+static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp,
+                  bool explicit_free)
 {
        u32 len;
 
@@ -211,8 +200,9 @@ static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
 
        len = xdp->data_end - xdp->data;
 
-       return (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY) ?
-               __xsk_rcv_zc(xs, xdp, len) : __xsk_rcv(xs, xdp, len);
+       return xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL ?
+               __xsk_rcv_zc(xs, xdp, len) :
+               __xsk_rcv(xs, xdp, len, explicit_free);
 }
 
 static void xsk_flush(struct xdp_sock *xs)
@@ -224,46 +214,11 @@ static void xsk_flush(struct xdp_sock *xs)
 
 int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
 {
-       u32 metalen = xdp->data - xdp->data_meta;
-       u32 len = xdp->data_end - xdp->data;
-       u64 offset = xs->umem->headroom;
-       void *buffer;
-       u64 addr;
        int err;
 
        spin_lock_bh(&xs->rx_lock);
-
-       if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index) {
-               err = -EINVAL;
-               goto out_unlock;
-       }
-
-       if (!xskq_cons_peek_addr(xs->umem->fq, &addr, xs->umem) ||
-           len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) {
-               err = -ENOSPC;
-               goto out_drop;
-       }
-
-       addr = xsk_umem_adjust_offset(xs->umem, addr, offset);
-       buffer = xdp_umem_get_data(xs->umem, addr);
-       memcpy(buffer, xdp->data_meta, len + metalen);
-
-       addr = xsk_umem_adjust_offset(xs->umem, addr, metalen);
-       err = xskq_prod_reserve_desc(xs->rx, addr, len);
-       if (err)
-               goto out_drop;
-
-       xskq_cons_release(xs->umem->fq);
-       xskq_prod_submit(xs->rx);
-
-       spin_unlock_bh(&xs->rx_lock);
-
-       xs->sk.sk_data_ready(&xs->sk);
-       return 0;
-
-out_drop:
-       xs->rx_dropped++;
-out_unlock:
+       err = xsk_rcv(xs, xdp, false);
+       xsk_flush(xs);
        spin_unlock_bh(&xs->rx_lock);
        return err;
 }
@@ -273,7 +228,7 @@ int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp)
        struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list);
        int err;
 
-       err = xsk_rcv(xs, xdp);
+       err = xsk_rcv(xs, xdp, true);
        if (err)
                return err;
 
@@ -404,7 +359,7 @@ static int xsk_generic_xmit(struct sock *sk)
 
                skb_put(skb, len);
                addr = desc.addr;
-               buffer = xdp_umem_get_data(xs->umem, addr);
+               buffer = xsk_buff_raw_get_data(xs->umem, addr);
                err = skb_store_bits(skb, 0, buffer, len);
                /* This is the backpressure mechanism for the Tx path.
                 * Reserve space in the completion queue and only proceed
@@ -629,24 +584,6 @@ static struct socket *xsk_lookup_xsk_from_fd(int fd)
        return sock;
 }
 
-/* Check if umem pages are contiguous.
- * If zero-copy mode, use the DMA address to do the page contiguity check
- * For all other modes we use addr (kernel virtual address)
- * Store the result in the low bits of addr.
- */
-static void xsk_check_page_contiguity(struct xdp_umem *umem, u32 flags)
-{
-       struct xdp_umem_page *pgs = umem->pages;
-       int i, is_contig;
-
-       for (i = 0; i < umem->npgs - 1; i++) {
-               is_contig = (flags & XDP_ZEROCOPY) ?
-                       (pgs[i].dma + PAGE_SIZE == pgs[i + 1].dma) :
-                       (pgs[i].addr + PAGE_SIZE == pgs[i + 1].addr);
-               pgs[i].addr += is_contig << XSK_NEXT_PG_CONTIG_SHIFT;
-       }
-}
-
 static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
 {
        struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr;
@@ -729,23 +666,14 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
                goto out_unlock;
        } else {
                /* This xsk has its own umem. */
-               xskq_set_umem(xs->umem->fq, xs->umem->size,
-                             xs->umem->chunk_mask);
-               xskq_set_umem(xs->umem->cq, xs->umem->size,
-                             xs->umem->chunk_mask);
-
                err = xdp_umem_assign_dev(xs->umem, dev, qid, flags);
                if (err)
                        goto out_unlock;
-
-               xsk_check_page_contiguity(xs->umem, flags);
        }
 
        xs->dev = dev;
        xs->zc = xs->umem->zc;
        xs->queue_id = qid;
-       xskq_set_umem(xs->rx, xs->umem->size, xs->umem->chunk_mask);
-       xskq_set_umem(xs->tx, xs->umem->size, xs->umem->chunk_mask);
        xdp_add_sk_umem(xs->umem, xs);
 
 out_unlock:
@@ -860,6 +788,8 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname,
                q = (optname == XDP_UMEM_FILL_RING) ? &xs->umem->fq :
                        &xs->umem->cq;
                err = xsk_init_queue(entries, q, true);
+               if (optname == XDP_UMEM_FILL_RING)
+                       xp_set_fq(xs->umem->pool, *q);
                mutex_unlock(&xs->mutex);
                return err;
        }
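xp_get_handle() above is what makes the addresses posted to the Rx ring self-describing: in aligned mode the headroom/metadata offset is simply added to the chunk address, while in unaligned mode it is packed into the upper bits (XSK_UNALIGNED_BUF_OFFSET_SHIFT is 48 in the AF_XDP UAPI), so user space can recover both the base chunk and the payload offset. A worked example for illustration:

/* orig_addr = 0x3000, offset (headroom + metadata) = 0x100
 *
 *   aligned mode:   handle = 0x3000 + 0x100         = 0x3100
 *   unaligned mode: handle = 0x3000 + (0x100 << 48) = 0x0100000000003000
 *
 * xp_unaligned_extract_addr()/xp_unaligned_add_offset_to_addr() undo this
 * packing on the consumer side, as seen in the xsk_queue.h changes below.
 */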
index 4cfd106bdb5335ce878f936dd61b26f9914ff0b7..455ddd480f3dbed3aaff0da9d1cca899f7df5fa1 100644 (file)
@@ -4,6 +4,20 @@
 #ifndef XSK_H_
 #define XSK_H_
 
+/* Masks for xdp_umem_page flags.
+ * The low 12 bits of the addr will be 0 since this is the page address, so we
+ * can use them for flags.
+ */
+#define XSK_NEXT_PG_CONTIG_SHIFT 0
+#define XSK_NEXT_PG_CONTIG_MASK BIT_ULL(XSK_NEXT_PG_CONTIG_SHIFT)
+
+/* Flags for the umem flags field.
+ *
+ * The NEED_WAKEUP flag is 1 due to the reuse of the flags field for public
+ * flags. See include/uapi/linux/if_xdp.h.
+ */
+#define XDP_UMEM_USES_NEED_WAKEUP BIT(1)
+
 struct xdp_ring_offset_v1 {
        __u64 producer;
        __u64 consumer;
@@ -17,9 +31,25 @@ struct xdp_mmap_offsets_v1 {
        struct xdp_ring_offset_v1 cr;
 };
 
+/* Nodes are linked in the struct xdp_sock map_list field, and used to
+ * track which maps a certain socket resides in.
+ */
+
+struct xsk_map_node {
+       struct list_head node;
+       struct xsk_map *map;
+       struct xdp_sock **map_entry;
+};
+
 static inline struct xdp_sock *xdp_sk(struct sock *sk)
 {
        return (struct xdp_sock *)sk;
 }
 
+bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs);
+void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs,
+                            struct xdp_sock **map_entry);
+int xsk_map_inc(struct xsk_map *map);
+void xsk_map_put(struct xsk_map *map);
+
 #endif /* XSK_H_ */
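The flag comments above exploit the fact that the per-page addresses (kernel virtual or DMA) are page-aligned, so their low bits are always zero and can double as per-page flags such as XSK_NEXT_PG_CONTIG_MASK. A minimal sketch of the idea, with hypothetical helper names that are not part of this patch; the new xsk_buff_pool.c below does the same thing when xp_check_dma_contiguity() sets the bit and xp_raw_get_dma() masks it back out:

/* Stash a "next page is physically contiguous" flag in bit 0 of a
 * page-aligned DMA address and strip it before using the address.
 */
#define PG_CONTIG_MASK	0x1ULL

static inline bool next_pg_is_contig(u64 dma_page)
{
	return dma_page & PG_CONTIG_MASK;
}

static inline u64 pg_clear_flags(u64 dma_page)
{
	return dma_page & ~PG_CONTIG_MASK;
}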
diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c
new file mode 100644 (file)
index 0000000..540ed75
--- /dev/null
@@ -0,0 +1,336 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <net/xsk_buff_pool.h>
+#include <net/xdp_sock.h>
+#include <linux/dma-direct.h>
+#include <linux/dma-noncoherent.h>
+#include <linux/swiotlb.h>
+
+#include "xsk_queue.h"
+
+static void xp_addr_unmap(struct xsk_buff_pool *pool)
+{
+       vunmap(pool->addrs);
+}
+
+static int xp_addr_map(struct xsk_buff_pool *pool,
+                      struct page **pages, u32 nr_pages)
+{
+       pool->addrs = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
+       if (!pool->addrs)
+               return -ENOMEM;
+       return 0;
+}
+
+void xp_destroy(struct xsk_buff_pool *pool)
+{
+       if (!pool)
+               return;
+
+       xp_addr_unmap(pool);
+       kvfree(pool->heads);
+       kvfree(pool);
+}
+
+struct xsk_buff_pool *xp_create(struct page **pages, u32 nr_pages, u32 chunks,
+                               u32 chunk_size, u32 headroom, u64 size,
+                               bool unaligned)
+{
+       struct xsk_buff_pool *pool;
+       struct xdp_buff_xsk *xskb;
+       int err;
+       u32 i;
+
+       pool = kvzalloc(struct_size(pool, free_heads, chunks), GFP_KERNEL);
+       if (!pool)
+               goto out;
+
+       pool->heads = kvcalloc(chunks, sizeof(*pool->heads), GFP_KERNEL);
+       if (!pool->heads)
+               goto out;
+
+       pool->chunk_mask = ~((u64)chunk_size - 1);
+       pool->addrs_cnt = size;
+       pool->heads_cnt = chunks;
+       pool->free_heads_cnt = chunks;
+       pool->headroom = headroom;
+       pool->chunk_size = chunk_size;
+       pool->cheap_dma = true;
+       pool->unaligned = unaligned;
+       pool->frame_len = chunk_size - headroom - XDP_PACKET_HEADROOM;
+       INIT_LIST_HEAD(&pool->free_list);
+
+       for (i = 0; i < pool->free_heads_cnt; i++) {
+               xskb = &pool->heads[i];
+               xskb->pool = pool;
+               xskb->xdp.frame_sz = chunk_size - headroom;
+               pool->free_heads[i] = xskb;
+       }
+
+       err = xp_addr_map(pool, pages, nr_pages);
+       if (!err)
+               return pool;
+
+out:
+       xp_destroy(pool);
+       return NULL;
+}
+
+void xp_set_fq(struct xsk_buff_pool *pool, struct xsk_queue *fq)
+{
+       pool->fq = fq;
+}
+
+void xp_set_rxq_info(struct xsk_buff_pool *pool, struct xdp_rxq_info *rxq)
+{
+       u32 i;
+
+       for (i = 0; i < pool->heads_cnt; i++)
+               pool->heads[i].xdp.rxq = rxq;
+}
+EXPORT_SYMBOL(xp_set_rxq_info);
+
+void xp_dma_unmap(struct xsk_buff_pool *pool, unsigned long attrs)
+{
+       dma_addr_t *dma;
+       u32 i;
+
+       if (pool->dma_pages_cnt == 0)
+               return;
+
+       for (i = 0; i < pool->dma_pages_cnt; i++) {
+               dma = &pool->dma_pages[i];
+               if (*dma) {
+                       dma_unmap_page_attrs(pool->dev, *dma, PAGE_SIZE,
+                                            DMA_BIDIRECTIONAL, attrs);
+                       *dma = 0;
+               }
+       }
+
+       kvfree(pool->dma_pages);
+       pool->dma_pages_cnt = 0;
+       pool->dev = NULL;
+}
+EXPORT_SYMBOL(xp_dma_unmap);
+
+static void xp_check_dma_contiguity(struct xsk_buff_pool *pool)
+{
+       u32 i;
+
+       for (i = 0; i < pool->dma_pages_cnt - 1; i++) {
+               if (pool->dma_pages[i] + PAGE_SIZE == pool->dma_pages[i + 1])
+                       pool->dma_pages[i] |= XSK_NEXT_PG_CONTIG_MASK;
+               else
+                       pool->dma_pages[i] &= ~XSK_NEXT_PG_CONTIG_MASK;
+       }
+}
+
+static bool __maybe_unused xp_check_swiotlb_dma(struct xsk_buff_pool *pool)
+{
+#if defined(CONFIG_SWIOTLB)
+       phys_addr_t paddr;
+       u32 i;
+
+       for (i = 0; i < pool->dma_pages_cnt; i++) {
+               paddr = dma_to_phys(pool->dev, pool->dma_pages[i]);
+               if (is_swiotlb_buffer(paddr))
+                       return false;
+       }
+#endif
+       return true;
+}
+
+static bool xp_check_cheap_dma(struct xsk_buff_pool *pool)
+{
+#if defined(CONFIG_HAS_DMA)
+       const struct dma_map_ops *ops = get_dma_ops(pool->dev);
+
+       if (ops) {
+               return !ops->sync_single_for_cpu &&
+                       !ops->sync_single_for_device;
+       }
+
+       if (!dma_is_direct(ops))
+               return false;
+
+       if (!xp_check_swiotlb_dma(pool))
+               return false;
+
+       if (!dev_is_dma_coherent(pool->dev)) {
+#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) ||               \
+       defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) ||        \
+       defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE)
+               return false;
+#endif
+       }
+#endif
+       return true;
+}
+
+int xp_dma_map(struct xsk_buff_pool *pool, struct device *dev,
+              unsigned long attrs, struct page **pages, u32 nr_pages)
+{
+       dma_addr_t dma;
+       u32 i;
+
+       pool->dma_pages = kvcalloc(nr_pages, sizeof(*pool->dma_pages),
+                                  GFP_KERNEL);
+       if (!pool->dma_pages)
+               return -ENOMEM;
+
+       pool->dev = dev;
+       pool->dma_pages_cnt = nr_pages;
+
+       for (i = 0; i < pool->dma_pages_cnt; i++) {
+               dma = dma_map_page_attrs(dev, pages[i], 0, PAGE_SIZE,
+                                        DMA_BIDIRECTIONAL, attrs);
+               if (dma_mapping_error(dev, dma)) {
+                       xp_dma_unmap(pool, attrs);
+                       return -ENOMEM;
+               }
+               pool->dma_pages[i] = dma;
+       }
+
+       if (pool->unaligned)
+               xp_check_dma_contiguity(pool);
+
+       pool->dev = dev;
+       pool->cheap_dma = xp_check_cheap_dma(pool);
+       return 0;
+}
+EXPORT_SYMBOL(xp_dma_map);
+
+static bool xp_addr_crosses_non_contig_pg(struct xsk_buff_pool *pool,
+                                         u64 addr)
+{
+       return xp_desc_crosses_non_contig_pg(pool, addr, pool->chunk_size);
+}
+
+static bool xp_check_unaligned(struct xsk_buff_pool *pool, u64 *addr)
+{
+       *addr = xp_unaligned_extract_addr(*addr);
+       if (*addr >= pool->addrs_cnt ||
+           *addr + pool->chunk_size > pool->addrs_cnt ||
+           xp_addr_crosses_non_contig_pg(pool, *addr))
+               return false;
+       return true;
+}
+
+static bool xp_check_aligned(struct xsk_buff_pool *pool, u64 *addr)
+{
+       *addr = xp_aligned_extract_addr(pool, *addr);
+       return *addr < pool->addrs_cnt;
+}
+
+static struct xdp_buff_xsk *__xp_alloc(struct xsk_buff_pool *pool)
+{
+       struct xdp_buff_xsk *xskb;
+       u64 addr;
+       bool ok;
+
+       if (pool->free_heads_cnt == 0)
+               return NULL;
+
+       xskb = pool->free_heads[--pool->free_heads_cnt];
+
+       for (;;) {
+               if (!xskq_cons_peek_addr_unchecked(pool->fq, &addr)) {
+                       xp_release(xskb);
+                       return NULL;
+               }
+
+               ok = pool->unaligned ? xp_check_unaligned(pool, &addr) :
+                    xp_check_aligned(pool, &addr);
+               if (!ok) {
+                       pool->fq->invalid_descs++;
+                       xskq_cons_release(pool->fq);
+                       continue;
+               }
+               break;
+       }
+       xskq_cons_release(pool->fq);
+
+       xskb->orig_addr = addr;
+       xskb->xdp.data_hard_start = pool->addrs + addr + pool->headroom;
+       if (pool->dma_pages_cnt) {
+               xskb->frame_dma = (pool->dma_pages[addr >> PAGE_SHIFT] &
+                                  ~XSK_NEXT_PG_CONTIG_MASK) +
+                                 (addr & ~PAGE_MASK);
+               xskb->dma = xskb->frame_dma + pool->headroom +
+                           XDP_PACKET_HEADROOM;
+       }
+       return xskb;
+}
+
+struct xdp_buff *xp_alloc(struct xsk_buff_pool *pool)
+{
+       struct xdp_buff_xsk *xskb;
+
+       if (!pool->free_list_cnt) {
+               xskb = __xp_alloc(pool);
+               if (!xskb)
+                       return NULL;
+       } else {
+               pool->free_list_cnt--;
+               xskb = list_first_entry(&pool->free_list, struct xdp_buff_xsk,
+                                       free_list_node);
+               list_del(&xskb->free_list_node);
+       }
+
+       xskb->xdp.data = xskb->xdp.data_hard_start + XDP_PACKET_HEADROOM;
+       xskb->xdp.data_meta = xskb->xdp.data;
+
+       if (!pool->cheap_dma) {
+               dma_sync_single_range_for_device(pool->dev, xskb->dma, 0,
+                                                pool->frame_len,
+                                                DMA_BIDIRECTIONAL);
+       }
+       return &xskb->xdp;
+}
+EXPORT_SYMBOL(xp_alloc);
+
+bool xp_can_alloc(struct xsk_buff_pool *pool, u32 count)
+{
+       if (pool->free_list_cnt >= count)
+               return true;
+       return xskq_cons_has_entries(pool->fq, count - pool->free_list_cnt);
+}
+EXPORT_SYMBOL(xp_can_alloc);
+
+void xp_free(struct xdp_buff_xsk *xskb)
+{
+       xskb->pool->free_list_cnt++;
+       list_add(&xskb->free_list_node, &xskb->pool->free_list);
+}
+EXPORT_SYMBOL(xp_free);
+
+void *xp_raw_get_data(struct xsk_buff_pool *pool, u64 addr)
+{
+       addr = pool->unaligned ? xp_unaligned_add_offset_to_addr(addr) : addr;
+       return pool->addrs + addr;
+}
+EXPORT_SYMBOL(xp_raw_get_data);
+
+dma_addr_t xp_raw_get_dma(struct xsk_buff_pool *pool, u64 addr)
+{
+       addr = pool->unaligned ? xp_unaligned_add_offset_to_addr(addr) : addr;
+       return (pool->dma_pages[addr >> PAGE_SHIFT] &
+               ~XSK_NEXT_PG_CONTIG_MASK) +
+               (addr & ~PAGE_MASK);
+}
+EXPORT_SYMBOL(xp_raw_get_dma);
+
+void xp_dma_sync_for_cpu_slow(struct xdp_buff_xsk *xskb)
+{
+       dma_sync_single_range_for_cpu(xskb->pool->dev, xskb->dma, 0,
+                                     xskb->pool->frame_len, DMA_BIDIRECTIONAL);
+}
+EXPORT_SYMBOL(xp_dma_sync_for_cpu_slow);
+
+void xp_dma_sync_for_device_slow(struct xsk_buff_pool *pool, dma_addr_t dma,
+                                size_t size)
+{
+       dma_sync_single_range_for_device(pool->dev, dma, 0,
+                                        size, DMA_BIDIRECTIONAL);
+}
+EXPORT_SYMBOL(xp_dma_sync_for_device_slow);
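For orientation, this is roughly how a zero-copy driver consumes the pool added above: xsk_buff_alloc() (the driver-facing wrapper around xp_alloc(), as used by __xsk_rcv() earlier in this diff) pops one chunk off the fill ring and returns an xdp_buff with data_hard_start and the DMA address already set up, while xsk_buff_free() puts it back on the pool's free list. A schematic sketch only; hw_post_rx_buffer() is a made-up placeholder, not a real API:

/* Schematic Rx refill loop for an AF_XDP zero-copy driver. */
static void example_rx_refill(struct xdp_umem *umem, unsigned int budget)
{
	struct xdp_buff *xdp;

	while (budget--) {
		xdp = xsk_buff_alloc(umem);	/* consumes one fill ring entry */
		if (!xdp)
			break;			/* fill ring and free list empty */

		if (hw_post_rx_buffer(xdp) < 0) {	/* placeholder */
			xsk_buff_free(xdp);	/* back onto the free list */
			break;
		}
	}
}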
index f59791ba43a04c08e6d486edf418c63e267a0220..0163b26aaf63c74d49c915e88ff15c86b3906e55 100644 (file)
@@ -56,7 +56,7 @@ static int xsk_diag_put_umem(const struct xdp_sock *xs, struct sk_buff *nlskb)
        du.id = umem->id;
        du.size = umem->size;
        du.num_pages = umem->npgs;
-       du.chunk_size = umem->chunk_size_nohr + umem->headroom;
+       du.chunk_size = umem->chunk_size;
        du.headroom = umem->headroom;
        du.ifindex = umem->dev ? umem->dev->ifindex : 0;
        du.queue_id = umem->queue_id;
index 57fb81bd593c178fd9ae94adf23beee8d7486956..6cf9586e5027a64c55dde9aa561e1cc662db51c6 100644 (file)
@@ -6,18 +6,10 @@
 #include <linux/log2.h>
 #include <linux/slab.h>
 #include <linux/overflow.h>
+#include <net/xdp_sock_drv.h>
 
 #include "xsk_queue.h"
 
-void xskq_set_umem(struct xsk_queue *q, u64 umem_size, u64 chunk_mask)
-{
-       if (!q)
-               return;
-
-       q->umem_size = umem_size;
-       q->chunk_mask = chunk_mask;
-}
-
 static size_t xskq_get_ring_size(struct xsk_queue *q, bool umem_queue)
 {
        struct xdp_umem_ring *umem_ring;
@@ -63,56 +55,3 @@ void xskq_destroy(struct xsk_queue *q)
        page_frag_free(q->ring);
        kfree(q);
 }
-
-struct xdp_umem_fq_reuse *xsk_reuseq_prepare(u32 nentries)
-{
-       struct xdp_umem_fq_reuse *newq;
-
-       /* Check for overflow */
-       if (nentries > (u32)roundup_pow_of_two(nentries))
-               return NULL;
-       nentries = roundup_pow_of_two(nentries);
-
-       newq = kvmalloc(struct_size(newq, handles, nentries), GFP_KERNEL);
-       if (!newq)
-               return NULL;
-       memset(newq, 0, offsetof(typeof(*newq), handles));
-
-       newq->nentries = nentries;
-       return newq;
-}
-EXPORT_SYMBOL_GPL(xsk_reuseq_prepare);
-
-struct xdp_umem_fq_reuse *xsk_reuseq_swap(struct xdp_umem *umem,
-                                         struct xdp_umem_fq_reuse *newq)
-{
-       struct xdp_umem_fq_reuse *oldq = umem->fq_reuse;
-
-       if (!oldq) {
-               umem->fq_reuse = newq;
-               return NULL;
-       }
-
-       if (newq->nentries < oldq->length)
-               return newq;
-
-       memcpy(newq->handles, oldq->handles,
-              array_size(oldq->length, sizeof(u64)));
-       newq->length = oldq->length;
-
-       umem->fq_reuse = newq;
-       return oldq;
-}
-EXPORT_SYMBOL_GPL(xsk_reuseq_swap);
-
-void xsk_reuseq_free(struct xdp_umem_fq_reuse *rq)
-{
-       kvfree(rq);
-}
-EXPORT_SYMBOL_GPL(xsk_reuseq_free);
-
-void xsk_reuseq_destroy(struct xdp_umem *umem)
-{
-       xsk_reuseq_free(umem->fq_reuse);
-       umem->fq_reuse = NULL;
-}
index 648733ec24ac2ef76bf0673e9bb0441cbe4afe3e..5b5d24d2dd379af42037b15a37f523e855a94f06 100644 (file)
@@ -9,6 +9,9 @@
 #include <linux/types.h>
 #include <linux/if_xdp.h>
 #include <net/xdp_sock.h>
+#include <net/xsk_buff_pool.h>
+
+#include "xsk.h"
 
 struct xdp_ring {
        u32 producer ____cacheline_aligned_in_smp;
@@ -29,8 +32,6 @@ struct xdp_umem_ring {
 };
 
 struct xsk_queue {
-       u64 chunk_mask;
-       u64 umem_size;
        u32 ring_mask;
        u32 nentries;
        u32 cached_prod;
@@ -103,98 +104,73 @@ struct xsk_queue {
 
 /* Functions that read and validate content from consumer rings. */
 
-static inline bool xskq_cons_crosses_non_contig_pg(struct xdp_umem *umem,
-                                                  u64 addr,
-                                                  u64 length)
+static inline bool xskq_cons_read_addr_unchecked(struct xsk_queue *q, u64 *addr)
 {
-       bool cross_pg = (addr & (PAGE_SIZE - 1)) + length > PAGE_SIZE;
-       bool next_pg_contig =
-               (unsigned long)umem->pages[(addr >> PAGE_SHIFT)].addr &
-                       XSK_NEXT_PG_CONTIG_MASK;
-
-       return cross_pg && !next_pg_contig;
-}
+       struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring;
 
-static inline bool xskq_cons_is_valid_unaligned(struct xsk_queue *q,
-                                               u64 addr,
-                                               u64 length,
-                                               struct xdp_umem *umem)
-{
-       u64 base_addr = xsk_umem_extract_addr(addr);
+       if (q->cached_cons != q->cached_prod) {
+               u32 idx = q->cached_cons & q->ring_mask;
 
-       addr = xsk_umem_add_offset_to_addr(addr);
-       if (base_addr >= q->umem_size || addr >= q->umem_size ||
-           xskq_cons_crosses_non_contig_pg(umem, addr, length)) {
-               q->invalid_descs++;
-               return false;
+               *addr = ring->desc[idx];
+               return true;
        }
 
-       return true;
+       return false;
 }
 
-static inline bool xskq_cons_is_valid_addr(struct xsk_queue *q, u64 addr)
+static inline bool xp_aligned_validate_desc(struct xsk_buff_pool *pool,
+                                           struct xdp_desc *desc)
 {
-       if (addr >= q->umem_size) {
-               q->invalid_descs++;
+       u64 chunk, chunk_end;
+
+       chunk = xp_aligned_extract_addr(pool, desc->addr);
+       chunk_end = xp_aligned_extract_addr(pool, desc->addr + desc->len);
+       if (chunk != chunk_end)
+               return false;
+
+       if (chunk >= pool->addrs_cnt)
                return false;
-       }
 
+       if (desc->options)
+               return false;
        return true;
 }
 
-static inline bool xskq_cons_read_addr(struct xsk_queue *q, u64 *addr,
-                                      struct xdp_umem *umem)
+static inline bool xp_unaligned_validate_desc(struct xsk_buff_pool *pool,
+                                             struct xdp_desc *desc)
 {
-       struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring;
-
-       while (q->cached_cons != q->cached_prod) {
-               u32 idx = q->cached_cons & q->ring_mask;
+       u64 addr, base_addr;
 
-               *addr = ring->desc[idx] & q->chunk_mask;
+       base_addr = xp_unaligned_extract_addr(desc->addr);
+       addr = xp_unaligned_add_offset_to_addr(desc->addr);
 
-               if (umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG) {
-                       if (xskq_cons_is_valid_unaligned(q, *addr,
-                                                        umem->chunk_size_nohr,
-                                                        umem))
-                               return true;
-                       goto out;
-               }
+       if (desc->len > pool->chunk_size)
+               return false;
 
-               if (xskq_cons_is_valid_addr(q, *addr))
-                       return true;
+       if (base_addr >= pool->addrs_cnt || addr >= pool->addrs_cnt ||
+           xp_desc_crosses_non_contig_pg(pool, addr, desc->len))
+               return false;
 
-out:
-               q->cached_cons++;
-       }
+       if (desc->options)
+               return false;
+       return true;
+}
 
-       return false;
+static inline bool xp_validate_desc(struct xsk_buff_pool *pool,
+                                   struct xdp_desc *desc)
+{
+       return pool->unaligned ? xp_unaligned_validate_desc(pool, desc) :
+               xp_aligned_validate_desc(pool, desc);
 }
 
 static inline bool xskq_cons_is_valid_desc(struct xsk_queue *q,
                                           struct xdp_desc *d,
                                           struct xdp_umem *umem)
 {
-       if (umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG) {
-               if (!xskq_cons_is_valid_unaligned(q, d->addr, d->len, umem))
-                       return false;
-
-               if (d->len > umem->chunk_size_nohr || d->options) {
-                       q->invalid_descs++;
-                       return false;
-               }
-
-               return true;
-       }
-
-       if (!xskq_cons_is_valid_addr(q, d->addr))
-               return false;
-
-       if (((d->addr + d->len) & q->chunk_mask) != (d->addr & q->chunk_mask) ||
-           d->options) {
+       if (!xp_validate_desc(umem->pool, d)) {
                q->invalid_descs++;
                return false;
        }
-
        return true;
 }
 
@@ -250,12 +226,11 @@ static inline bool xskq_cons_has_entries(struct xsk_queue *q, u32 cnt)
        return entries >= cnt;
 }
 
-static inline bool xskq_cons_peek_addr(struct xsk_queue *q, u64 *addr,
-                                      struct xdp_umem *umem)
+static inline bool xskq_cons_peek_addr_unchecked(struct xsk_queue *q, u64 *addr)
 {
        if (q->cached_prod == q->cached_cons)
                xskq_cons_get_entries(q);
-       return xskq_cons_read_addr(q, addr, umem);
+       return xskq_cons_read_addr_unchecked(q, addr);
 }
 
 static inline bool xskq_cons_peek_desc(struct xsk_queue *q,
@@ -379,11 +354,7 @@ static inline u64 xskq_nb_invalid_descs(struct xsk_queue *q)
        return q ? q->invalid_descs : 0;
 }
 
-void xskq_set_umem(struct xsk_queue *q, u64 umem_size, u64 chunk_mask);
 struct xsk_queue *xskq_create(u32 nentries, bool umem_queue);
 void xskq_destroy(struct xsk_queue *q_ops);
 
-/* Executed by the core when the entire UMEM gets freed */
-void xsk_reuseq_destroy(struct xdp_umem *umem);
-
 #endif /* _LINUX_XSK_QUEUE_H */
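A worked example of the new aligned-mode descriptor check, assuming xp_aligned_extract_addr() masks the address with pool->chunk_mask (i.e. ~(chunk_size - 1), as set up in xp_create()):

/* chunk_size = 2048 (0x800), so chunk = addr & ~0x7ff:
 *
 *   desc.addr = 0x1800, desc.len = 0x300
 *     start chunk = 0x1800, end chunk = 0x1b00 & ~0x7ff = 0x1800  -> accepted
 *
 *   desc.addr = 0x1800, desc.len = 0x900
 *     start chunk = 0x1800, end chunk = 0x2100 & ~0x7ff = 0x2000  -> rejected
 *     (the buffer would spill into the next chunk)
 */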
diff --git a/net/xdp/xskmap.c b/net/xdp/xskmap.c
new file mode 100644 (file)
index 0000000..1dc7208
--- /dev/null
@@ -0,0 +1,267 @@
+// SPDX-License-Identifier: GPL-2.0
+/* XSKMAP used for AF_XDP sockets
+ * Copyright(c) 2018 Intel Corporation.
+ */
+
+#include <linux/bpf.h>
+#include <linux/capability.h>
+#include <net/xdp_sock.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+
+#include "xsk.h"
+
+int xsk_map_inc(struct xsk_map *map)
+{
+       bpf_map_inc(&map->map);
+       return 0;
+}
+
+void xsk_map_put(struct xsk_map *map)
+{
+       bpf_map_put(&map->map);
+}
+
+static struct xsk_map_node *xsk_map_node_alloc(struct xsk_map *map,
+                                              struct xdp_sock **map_entry)
+{
+       struct xsk_map_node *node;
+       int err;
+
+       node = kzalloc(sizeof(*node), GFP_ATOMIC | __GFP_NOWARN);
+       if (!node)
+               return ERR_PTR(-ENOMEM);
+
+       err = xsk_map_inc(map);
+       if (err) {
+               kfree(node);
+               return ERR_PTR(err);
+       }
+
+       node->map = map;
+       node->map_entry = map_entry;
+       return node;
+}
+
+static void xsk_map_node_free(struct xsk_map_node *node)
+{
+       xsk_map_put(node->map);
+       kfree(node);
+}
+
+static void xsk_map_sock_add(struct xdp_sock *xs, struct xsk_map_node *node)
+{
+       spin_lock_bh(&xs->map_list_lock);
+       list_add_tail(&node->node, &xs->map_list);
+       spin_unlock_bh(&xs->map_list_lock);
+}
+
+static void xsk_map_sock_delete(struct xdp_sock *xs,
+                               struct xdp_sock **map_entry)
+{
+       struct xsk_map_node *n, *tmp;
+
+       spin_lock_bh(&xs->map_list_lock);
+       list_for_each_entry_safe(n, tmp, &xs->map_list, node) {
+               if (map_entry == n->map_entry) {
+                       list_del(&n->node);
+                       xsk_map_node_free(n);
+               }
+       }
+       spin_unlock_bh(&xs->map_list_lock);
+}
+
+static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
+{
+       struct bpf_map_memory mem;
+       int err, numa_node;
+       struct xsk_map *m;
+       u64 size;
+
+       if (!capable(CAP_NET_ADMIN))
+               return ERR_PTR(-EPERM);
+
+       if (attr->max_entries == 0 || attr->key_size != 4 ||
+           attr->value_size != 4 ||
+           attr->map_flags & ~(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY))
+               return ERR_PTR(-EINVAL);
+
+       numa_node = bpf_map_attr_numa_node(attr);
+       size = struct_size(m, xsk_map, attr->max_entries);
+
+       err = bpf_map_charge_init(&mem, size);
+       if (err < 0)
+               return ERR_PTR(err);
+
+       m = bpf_map_area_alloc(size, numa_node);
+       if (!m) {
+               bpf_map_charge_finish(&mem);
+               return ERR_PTR(-ENOMEM);
+       }
+
+       bpf_map_init_from_attr(&m->map, attr);
+       bpf_map_charge_move(&m->map.memory, &mem);
+       spin_lock_init(&m->lock);
+
+       return &m->map;
+}
+
+static void xsk_map_free(struct bpf_map *map)
+{
+       struct xsk_map *m = container_of(map, struct xsk_map, map);
+
+       bpf_clear_redirect_map(map);
+       synchronize_net();
+       bpf_map_area_free(m);
+}
+
+static int xsk_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+       struct xsk_map *m = container_of(map, struct xsk_map, map);
+       u32 index = key ? *(u32 *)key : U32_MAX;
+       u32 *next = next_key;
+
+       if (index >= m->map.max_entries) {
+               *next = 0;
+               return 0;
+       }
+
+       if (index == m->map.max_entries - 1)
+               return -ENOENT;
+       *next = index + 1;
+       return 0;
+}
+
+static u32 xsk_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
+{
+       const int ret = BPF_REG_0, mp = BPF_REG_1, index = BPF_REG_2;
+       struct bpf_insn *insn = insn_buf;
+
+       *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0);
+       *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5);
+       *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(sizeof(struct xsk_sock *)));
+       *insn++ = BPF_ALU64_IMM(BPF_ADD, mp, offsetof(struct xsk_map, xsk_map));
+       *insn++ = BPF_ALU64_REG(BPF_ADD, ret, mp);
+       *insn++ = BPF_LDX_MEM(BPF_SIZEOF(struct xsk_sock *), ret, ret, 0);
+       *insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
+       *insn++ = BPF_MOV64_IMM(ret, 0);
+       return insn - insn_buf;
+}
+
+static void *xsk_map_lookup_elem(struct bpf_map *map, void *key)
+{
+       WARN_ON_ONCE(!rcu_read_lock_held());
+       return __xsk_map_lookup_elem(map, *(u32 *)key);
+}
+
+static void *xsk_map_lookup_elem_sys_only(struct bpf_map *map, void *key)
+{
+       return ERR_PTR(-EOPNOTSUPP);
+}
+
+static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value,
+                              u64 map_flags)
+{
+       struct xsk_map *m = container_of(map, struct xsk_map, map);
+       struct xdp_sock *xs, *old_xs, **map_entry;
+       u32 i = *(u32 *)key, fd = *(u32 *)value;
+       struct xsk_map_node *node;
+       struct socket *sock;
+       int err;
+
+       if (unlikely(map_flags > BPF_EXIST))
+               return -EINVAL;
+       if (unlikely(i >= m->map.max_entries))
+               return -E2BIG;
+
+       sock = sockfd_lookup(fd, &err);
+       if (!sock)
+               return err;
+
+       if (sock->sk->sk_family != PF_XDP) {
+               sockfd_put(sock);
+               return -EOPNOTSUPP;
+       }
+
+       xs = (struct xdp_sock *)sock->sk;
+
+       if (!xsk_is_setup_for_bpf_map(xs)) {
+               sockfd_put(sock);
+               return -EOPNOTSUPP;
+       }
+
+       map_entry = &m->xsk_map[i];
+       node = xsk_map_node_alloc(m, map_entry);
+       if (IS_ERR(node)) {
+               sockfd_put(sock);
+               return PTR_ERR(node);
+       }
+
+       spin_lock_bh(&m->lock);
+       old_xs = READ_ONCE(*map_entry);
+       if (old_xs == xs) {
+               err = 0;
+               goto out;
+       } else if (old_xs && map_flags == BPF_NOEXIST) {
+               err = -EEXIST;
+               goto out;
+       } else if (!old_xs && map_flags == BPF_EXIST) {
+               err = -ENOENT;
+               goto out;
+       }
+       xsk_map_sock_add(xs, node);
+       WRITE_ONCE(*map_entry, xs);
+       if (old_xs)
+               xsk_map_sock_delete(old_xs, map_entry);
+       spin_unlock_bh(&m->lock);
+       sockfd_put(sock);
+       return 0;
+
+out:
+       spin_unlock_bh(&m->lock);
+       sockfd_put(sock);
+       xsk_map_node_free(node);
+       return err;
+}
+
+static int xsk_map_delete_elem(struct bpf_map *map, void *key)
+{
+       struct xsk_map *m = container_of(map, struct xsk_map, map);
+       struct xdp_sock *old_xs, **map_entry;
+       int k = *(u32 *)key;
+
+       if (k >= map->max_entries)
+               return -EINVAL;
+
+       spin_lock_bh(&m->lock);
+       map_entry = &m->xsk_map[k];
+       old_xs = xchg(map_entry, NULL);
+       if (old_xs)
+               xsk_map_sock_delete(old_xs, map_entry);
+       spin_unlock_bh(&m->lock);
+
+       return 0;
+}
+
+void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs,
+                            struct xdp_sock **map_entry)
+{
+       spin_lock_bh(&map->lock);
+       if (READ_ONCE(*map_entry) == xs) {
+               WRITE_ONCE(*map_entry, NULL);
+               xsk_map_sock_delete(xs, map_entry);
+       }
+       spin_unlock_bh(&map->lock);
+}
+
+const struct bpf_map_ops xsk_map_ops = {
+       .map_alloc = xsk_map_alloc,
+       .map_free = xsk_map_free,
+       .map_get_next_key = xsk_map_get_next_key,
+       .map_lookup_elem = xsk_map_lookup_elem,
+       .map_gen_lookup = xsk_map_gen_lookup,
+       .map_lookup_elem_sys_only = xsk_map_lookup_elem_sys_only,
+       .map_update_elem = xsk_map_update_elem,
+       .map_delete_elem = xsk_map_delete_elem,
+       .map_check_btf = map_check_no_btf,
+};
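From user space, an entry in this map is written with an ordinary bpf_map_update_elem() call whose value is the file descriptor of an AF_XDP socket; xsk_map_update_elem() above resolves the fd and stores the struct xdp_sock pointer. A hedged sketch (map_fd, queue_id and xsk_fd are assumed to exist already; the socket must be bound first, or the update fails with -EOPNOTSUPP per xsk_is_setup_for_bpf_map()):

#include <linux/bpf.h>
#include <bpf/bpf.h>

/* Illustrative helper, not part of this patch. */
static int add_socket_to_xskmap(int map_fd, __u32 queue_id, int xsk_fd)
{
	/* key: usually the Rx queue index; value: the AF_XDP socket fd */
	return bpf_map_update_elem(map_fd, &queue_id, &xsk_fd, BPF_ANY);
}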
index 23837f2ed45899bcdc892eaf35a45382aec9296d..034800c4d1e6208324ce30f9cadb1147ec6eea16 100644 (file)
@@ -50,3 +50,4 @@ xdp_rxq_info
 xdp_sample_pkts
 xdp_tx_iptunnel
 xdpsock
+testfile.img
index 424f6fe7ce382871f704ebec239a3bd5745c0a5d..8403e47623062997474c0d829126a8930312b87f 100644 (file)
@@ -63,14 +63,14 @@ TRACE_HELPERS := ../../tools/testing/selftests/bpf/trace_helpers.o
 fds_example-objs := fds_example.o
 sockex1-objs := sockex1_user.o
 sockex2-objs := sockex2_user.o
-sockex3-objs := bpf_load.o sockex3_user.o
-tracex1-objs := bpf_load.o tracex1_user.o $(TRACE_HELPERS)
-tracex2-objs := bpf_load.o tracex2_user.o
-tracex3-objs := bpf_load.o tracex3_user.o
-tracex4-objs := bpf_load.o tracex4_user.o
-tracex5-objs := bpf_load.o tracex5_user.o $(TRACE_HELPERS)
-tracex6-objs := bpf_load.o tracex6_user.o
-tracex7-objs := bpf_load.o tracex7_user.o
+sockex3-objs := sockex3_user.o
+tracex1-objs := tracex1_user.o $(TRACE_HELPERS)
+tracex2-objs := tracex2_user.o
+tracex3-objs := tracex3_user.o
+tracex4-objs := tracex4_user.o
+tracex5-objs := tracex5_user.o $(TRACE_HELPERS)
+tracex6-objs := tracex6_user.o
+tracex7-objs := tracex7_user.o
 test_probe_write_user-objs := bpf_load.o test_probe_write_user_user.o
 trace_output-objs := bpf_load.o trace_output_user.o $(TRACE_HELPERS)
 lathist-objs := bpf_load.o lathist_user.o
index e504dc308371b7ce5ca42b171330b7609f41a59d..f24806ac24e749690684afaccd3a46fa8bed28ed 100644 (file)
 
 #define MAX_IPS                8192
 
-struct bpf_map_def SEC("maps") ip_map = {
-       .type = BPF_MAP_TYPE_HASH,
-       .key_size = sizeof(u64),
-       .value_size = sizeof(u32),
-       .max_entries = MAX_IPS,
-};
+struct {
+       __uint(type, BPF_MAP_TYPE_HASH);
+       __type(key, u64);
+       __type(value, u32);
+       __uint(max_entries, MAX_IPS);
+} ip_map SEC(".maps");
 
 SEC("perf_event")
 int do_sample(struct bpf_perf_event_data *ctx)
index 4372d2da2f9e48bf2aab12f5a788bf5434501d40..921c505bb5678fb6b3433eaef420887621025a97 100644 (file)
@@ -18,9 +18,6 @@
 #include "perf-sys.h"
 #include "trace_helpers.h"
 
-#define __must_check
-#include <linux/err.h>
-
 #define DEFAULT_FREQ   99
 #define DEFAULT_SECS   5
 #define MAX_IPS                8192
@@ -57,7 +54,7 @@ static int sampling_start(int freq, struct bpf_program *prog,
                        return 1;
                }
                links[i] = bpf_program__attach_perf_event(prog, pmu_fd);
-               if (IS_ERR(links[i])) {
+               if (libbpf_get_error(links[i])) {
                        fprintf(stderr, "ERROR: Attach perf event\n");
                        links[i] = NULL;
                        close(pmu_fd);
@@ -182,7 +179,7 @@ int main(int argc, char **argv)
 
        snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
        obj = bpf_object__open_file(filename, NULL);
-       if (IS_ERR(obj)) {
+       if (libbpf_get_error(obj)) {
                fprintf(stderr, "ERROR: opening BPF object file failed\n");
                obj = NULL;
                goto cleanup;
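The IS_ERR() checks go away because libbpf already encodes failures in the returned pointer and exposes them through libbpf_get_error(), so the samples no longer need the "#define __must_check" / <linux/err.h> workaround removed above. The idiom, sketched outside any particular sample:

#include <stdio.h>
#include <bpf/libbpf.h>

/* Hypothetical helper: open an object and report the error libbpf encoded
 * in the returned pointer (libbpf_get_error() yields 0 on success).
 */
static struct bpf_object *open_object(const char *path)
{
	struct bpf_object *obj = bpf_object__open_file(path, NULL);

	if (libbpf_get_error(obj)) {
		fprintf(stderr, "opening %s failed\n", path);
		return NULL;
	}
	return obj;
}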
index 779a5249c418c060567b2638a8cc7e73c753f3ff..cab9cca0b8ebf69f570639824ae6d688d86f2758 100644 (file)
 
 #define PROG(F) SEC("socket/"__stringify(F)) int bpf_func_##F
 
-struct bpf_map_def SEC("maps") jmp_table = {
-       .type = BPF_MAP_TYPE_PROG_ARRAY,
-       .key_size = sizeof(u32),
-       .value_size = sizeof(u32),
-       .max_entries = 8,
-};
+struct {
+       __uint(type, BPF_MAP_TYPE_PROG_ARRAY);
+       __uint(key_size, sizeof(u32));
+       __uint(value_size, sizeof(u32));
+       __uint(max_entries, 8);
+} jmp_table SEC(".maps");
 
 #define PARSE_VLAN 1
 #define PARSE_MPLS 2
@@ -92,12 +92,12 @@ struct globals {
        struct flow_key_record flow;
 };
 
-struct bpf_map_def SEC("maps") percpu_map = {
-       .type = BPF_MAP_TYPE_ARRAY,
-       .key_size = sizeof(__u32),
-       .value_size = sizeof(struct globals),
-       .max_entries = 32,
-};
+struct {
+       __uint(type, BPF_MAP_TYPE_ARRAY);
+       __type(key, __u32);
+       __type(value, struct globals);
+       __uint(max_entries, 32);
+} percpu_map SEC(".maps");
 
 /* user poor man's per_cpu until native support is ready */
 static struct globals *this_cpu_globals(void)
@@ -113,12 +113,12 @@ struct pair {
        __u64 bytes;
 };
 
-struct bpf_map_def SEC("maps") hash_map = {
-       .type = BPF_MAP_TYPE_HASH,
-       .key_size = sizeof(struct flow_key_record),
-       .value_size = sizeof(struct pair),
-       .max_entries = 1024,
-};
+struct {
+       __uint(type, BPF_MAP_TYPE_HASH);
+       __type(key, struct flow_key_record);
+       __type(value, struct pair);
+       __uint(max_entries, 1024);
+} hash_map SEC(".maps");
 
 static void update_stats(struct __sk_buff *skb, struct globals *g)
 {
index bbb1cd0666a9bf8e50f0d416afbdbaddf6da814c..4dbee7427d470706b4284d9e6bae0a698cfbfde8 100644 (file)
@@ -1,18 +1,13 @@
 // SPDX-License-Identifier: GPL-2.0
 #include <stdio.h>
 #include <assert.h>
-#include <linux/bpf.h>
 #include <bpf/bpf.h>
-#include "bpf_load.h"
+#include <bpf/libbpf.h>
 #include "sock_example.h"
 #include <unistd.h>
 #include <arpa/inet.h>
 #include <sys/resource.h>
 
-#define PARSE_IP 3
-#define PARSE_IP_PROG_FD (prog_fd[0])
-#define PROG_ARRAY_FD (map_fd[0])
-
 struct flow_key_record {
        __be32 src;
        __be32 dst;
@@ -30,31 +25,55 @@ struct pair {
 
 int main(int argc, char **argv)
 {
+       int i, sock, key, fd, main_prog_fd, jmp_table_fd, hash_map_fd;
        struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
+       struct bpf_program *prog;
+       struct bpf_object *obj;
        char filename[256];
+       const char *title;
        FILE *f;
-       int i, sock, err, id, key = PARSE_IP;
-       struct bpf_prog_info info = {};
-       uint32_t info_len = sizeof(info);
 
        snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
        setrlimit(RLIMIT_MEMLOCK, &r);
 
-       if (load_bpf_file(filename)) {
-               printf("%s", bpf_log_buf);
-               return 1;
+       obj = bpf_object__open_file(filename, NULL);
+       if (libbpf_get_error(obj)) {
+               fprintf(stderr, "ERROR: opening BPF object file failed\n");
+               return 0;
+       }
+
+       /* load BPF program */
+       if (bpf_object__load(obj)) {
+               fprintf(stderr, "ERROR: loading BPF object file failed\n");
+               goto cleanup;
+       }
+
+       jmp_table_fd = bpf_object__find_map_fd_by_name(obj, "jmp_table");
+       hash_map_fd = bpf_object__find_map_fd_by_name(obj, "hash_map");
+       if (jmp_table_fd < 0 || hash_map_fd < 0) {
+               fprintf(stderr, "ERROR: finding a map in obj file failed\n");
+               goto cleanup;
        }
 
-       /* Test fd array lookup which returns the id of the bpf_prog */
-       err = bpf_obj_get_info_by_fd(PARSE_IP_PROG_FD, &info, &info_len);
-       assert(!err);
-       err = bpf_map_lookup_elem(PROG_ARRAY_FD, &key, &id);
-       assert(!err);
-       assert(id == info.id);
+       bpf_object__for_each_program(prog, obj) {
+               fd = bpf_program__fd(prog);
+
+               title = bpf_program__title(prog, false);
+               if (sscanf(title, "socket/%d", &key) != 1) {
+                       fprintf(stderr, "ERROR: finding prog failed\n");
+                       goto cleanup;
+               }
+
+               if (key == 0)
+                       main_prog_fd = fd;
+               else
+                       bpf_map_update_elem(jmp_table_fd, &key, &fd, BPF_ANY);
+       }
 
        sock = open_raw_sock("lo");
 
-       assert(setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &prog_fd[4],
+       /* attach BPF program to socket */
+       assert(setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &main_prog_fd,
                          sizeof(__u32)) == 0);
 
        if (argc > 1)
@@ -69,8 +88,8 @@ int main(int argc, char **argv)
 
                sleep(1);
                printf("IP     src.port -> dst.port               bytes      packets\n");
-               while (bpf_map_get_next_key(map_fd[2], &key, &next_key) == 0) {
-                       bpf_map_lookup_elem(map_fd[2], &next_key, &value);
+               while (bpf_map_get_next_key(hash_map_fd, &key, &next_key) == 0) {
+                       bpf_map_lookup_elem(hash_map_fd, &next_key, &value);
                        printf("%s.%05d -> %s.%05d %12lld %12lld\n",
                               inet_ntoa((struct in_addr){htonl(next_key.src)}),
                               next_key.port16[0],
@@ -80,5 +99,8 @@ int main(int argc, char **argv)
                        key = next_key;
                }
        }
+
+cleanup:
+       bpf_object__close(obj);
        return 0;
 }
diff --git a/samples/bpf/trace_common.h b/samples/bpf/trace_common.h
new file mode 100644 (file)
index 0000000..8cb5400
--- /dev/null
@@ -0,0 +1,13 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef __TRACE_COMMON_H
+#define __TRACE_COMMON_H
+
+#ifdef __x86_64__
+#define SYSCALL(SYS) "__x64_" __stringify(SYS)
+#elif defined(__s390x__)
+#define SYSCALL(SYS) "__s390x_" __stringify(SYS)
+#else
+#define SYSCALL(SYS)  __stringify(SYS)
+#endif
+
+#endif
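The new SYSCALL() helper exists because syscall entry points carry an architecture prefix on some targets (__x64_sys_write on x86_64, __s390x_sys_write on s390x), so kprobe section names have to be built per architecture. A minimal usage sketch (bpf_prog_example is a hypothetical name; on x86_64 the section expands to "kprobe/__x64_sys_write", matching the later tracex2 change):

#include <linux/ptrace.h>
#include <uapi/linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include "trace_common.h"

SEC("kprobe/" SYSCALL(sys_write))
int bpf_prog_example(struct pt_regs *ctx)
{
	/* probe body goes here */
	return 0;
}

char _license[] SEC("license") = "GPL";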
index da1d69e206452731f613665e971b6cc1da65a3dc..7d3c66fb3f88b15980039d7c18d50f2690c2db39 100644 (file)
@@ -18,19 +18,19 @@ struct key_t {
        u32 userstack;
 };
 
-struct bpf_map_def SEC("maps") counts = {
-       .type = BPF_MAP_TYPE_HASH,
-       .key_size = sizeof(struct key_t),
-       .value_size = sizeof(u64),
-       .max_entries = 10000,
-};
+struct {
+       __uint(type, BPF_MAP_TYPE_HASH);
+       __type(key, struct key_t);
+       __type(value, u64);
+       __uint(max_entries, 10000);
+} counts SEC(".maps");
 
-struct bpf_map_def SEC("maps") stackmap = {
-       .type = BPF_MAP_TYPE_STACK_TRACE,
-       .key_size = sizeof(u32),
-       .value_size = PERF_MAX_STACK_DEPTH * sizeof(u64),
-       .max_entries = 10000,
-};
+struct {
+       __uint(type, BPF_MAP_TYPE_STACK_TRACE);
+       __uint(key_size, sizeof(u32));
+       __uint(value_size, PERF_MAX_STACK_DEPTH * sizeof(u64));
+       __uint(max_entries, 10000);
+} stackmap SEC(".maps");
 
 #define KERN_STACKID_FLAGS (0 | BPF_F_FAST_STACK_CMP)
 #define USER_STACKID_FLAGS (0 | BPF_F_FAST_STACK_CMP | BPF_F_USER_STACK)
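Note the two flavors used in the conversion above: counts describes its key and value with __type(), which lets libbpf record their BTF, while stackmap keeps __uint(key_size/value_size) because its value is a raw array of PERF_MAX_STACK_DEPTH u64s rather than a named C type. A minimal illustration (map names are hypothetical):

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__type(key, __u64);			/* BTF-typed key and value */
	__type(value, __u32);
	__uint(max_entries, 8192);
} typed_map SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_STACK_TRACE);
	__uint(key_size, sizeof(__u32));		/* sizes only, no BTF */
	__uint(value_size, 127 * sizeof(__u64));	/* value is a raw u64 array */
	__uint(max_entries, 8192);
} sized_map SEC(".maps");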
index b6cd358d0418e8b47342d7d72d8a84c467fcd130..ac1ba368195cb42474859d60c386366c9c812b45 100644 (file)
@@ -16,9 +16,6 @@
 #include "perf-sys.h"
 #include "trace_helpers.h"
 
-#define __must_check
-#include <linux/err.h>
-
 #define SAMPLE_FREQ 50
 
 static int pid;
@@ -159,7 +156,7 @@ static void test_perf_event_all_cpu(struct perf_event_attr *attr)
                        goto all_cpu_err;
                }
                links[i] = bpf_program__attach_perf_event(prog, pmu_fd);
-               if (IS_ERR(links[i])) {
+               if (libbpf_get_error(links[i])) {
                        printf("bpf_program__attach_perf_event failed\n");
                        links[i] = NULL;
                        close(pmu_fd);
@@ -198,7 +195,7 @@ static void test_perf_event_task(struct perf_event_attr *attr)
                goto err;
        }
        link = bpf_program__attach_perf_event(prog, pmu_fd);
-       if (IS_ERR(link)) {
+       if (libbpf_get_error(link)) {
                printf("bpf_program__attach_perf_event failed\n");
                link = NULL;
                close(pmu_fd);
@@ -314,7 +311,7 @@ int main(int argc, char **argv)
        }
 
        obj = bpf_object__open_file(filename, NULL);
-       if (IS_ERR(obj)) {
+       if (libbpf_get_error(obj)) {
                printf("opening BPF object file failed\n");
                obj = NULL;
                goto cleanup;
index 55fddbd0870292390dc8f3ae9e60387e3ba38cda..9d4adb7fd8341e702a710e6904f53b5f9a970454 100644 (file)
@@ -1,21 +1,41 @@
 // SPDX-License-Identifier: GPL-2.0
 #include <stdio.h>
-#include <linux/bpf.h>
 #include <unistd.h>
-#include <bpf/bpf.h>
-#include "bpf_load.h"
+#include <bpf/libbpf.h>
 #include "trace_helpers.h"
 
 int main(int ac, char **argv)
 {
-       FILE *f;
+       struct bpf_link *link = NULL;
+       struct bpf_program *prog;
+       struct bpf_object *obj;
        char filename[256];
+       FILE *f;
 
        snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+       obj = bpf_object__open_file(filename, NULL);
+       if (libbpf_get_error(obj)) {
+               fprintf(stderr, "ERROR: opening BPF object file failed\n");
+               return 0;
+       }
+
+       prog = bpf_object__find_program_by_name(obj, "bpf_prog1");
+       if (!prog) {
+               fprintf(stderr, "ERROR: finding a prog in obj file failed\n");
+               goto cleanup;
+       }
+
+       /* load BPF program */
+       if (bpf_object__load(obj)) {
+               fprintf(stderr, "ERROR: loading BPF object file failed\n");
+               goto cleanup;
+       }
 
-       if (load_bpf_file(filename)) {
-               printf("%s", bpf_log_buf);
-               return 1;
+       link = bpf_program__attach(prog);
+       if (libbpf_get_error(link)) {
+               fprintf(stderr, "ERROR: bpf_program__attach failed\n");
+               link = NULL;
+               goto cleanup;
        }
 
        f = popen("taskset 1 ping -c5 localhost", "r");
@@ -23,5 +43,8 @@ int main(int ac, char **argv)
 
        read_trace_pipe();
 
+cleanup:
+       bpf_link__destroy(link);
+       bpf_object__close(obj);
        return 0;
 }
index d865bb309bcb536ed9a34c66988e0a5a35f2cf43..5bc696bac27d767ee12dad11874c9d97b2880e88 100644 (file)
 #include <uapi/linux/bpf.h>
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_tracing.h>
+#include "trace_common.h"
 
-struct bpf_map_def SEC("maps") my_map = {
-       .type = BPF_MAP_TYPE_HASH,
-       .key_size = sizeof(long),
-       .value_size = sizeof(long),
-       .max_entries = 1024,
-};
+struct {
+       __uint(type, BPF_MAP_TYPE_HASH);
+       __type(key, long);
+       __type(value, long);
+       __uint(max_entries, 1024);
+} my_map SEC(".maps");
 
 /* kprobe is NOT a stable ABI. If kernel internals change this bpf+kprobe
  * example will no longer be meaningful
@@ -70,14 +71,14 @@ struct hist_key {
        u64 index;
 };
 
-struct bpf_map_def SEC("maps") my_hist_map = {
-       .type = BPF_MAP_TYPE_PERCPU_HASH,
-       .key_size = sizeof(struct hist_key),
-       .value_size = sizeof(long),
-       .max_entries = 1024,
-};
+struct {
+       __uint(type, BPF_MAP_TYPE_PERCPU_HASH);
+       __uint(key_size, sizeof(struct hist_key));
+       __uint(value_size, sizeof(long));
+       __uint(max_entries, 1024);
+} my_hist_map SEC(".maps");
 
-SEC("kprobe/sys_write")
+SEC("kprobe/" SYSCALL(sys_write))
 int bpf_prog3(struct pt_regs *ctx)
 {
        long write_size = PT_REGS_PARM3(ctx);
index c9544a4ce61af2c6b1e988ef92811907978e16c3..3e36b3e4e3efdb282abb08262ce1a86a880fce19 100644 (file)
@@ -3,17 +3,19 @@
 #include <unistd.h>
 #include <stdlib.h>
 #include <signal.h>
-#include <linux/bpf.h>
 #include <string.h>
 #include <sys/resource.h>
 
 #include <bpf/bpf.h>
-#include "bpf_load.h"
+#include <bpf/libbpf.h>
 #include "bpf_util.h"
 
 #define MAX_INDEX      64
 #define MAX_STARS      38
 
+/* my_map, my_hist_map */
+static int map_fd[2];
+
 static void stars(char *str, long val, long max, int width)
 {
        int i;
@@ -115,18 +117,39 @@ static void int_exit(int sig)
 int main(int ac, char **argv)
 {
        struct rlimit r = {1024*1024, RLIM_INFINITY};
-       char filename[256];
        long key, next_key, value;
+       struct bpf_link *links[2];
+       struct bpf_program *prog;
+       struct bpf_object *obj;
+       char filename[256];
+       int i, j = 0;
        FILE *f;
-       int i;
-
-       snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
 
        if (setrlimit(RLIMIT_MEMLOCK, &r)) {
                perror("setrlimit(RLIMIT_MEMLOCK)");
                return 1;
        }
 
+       snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+       obj = bpf_object__open_file(filename, NULL);
+       if (libbpf_get_error(obj)) {
+               fprintf(stderr, "ERROR: opening BPF object file failed\n");
+               return 0;
+       }
+
+       /* load BPF program */
+       if (bpf_object__load(obj)) {
+               fprintf(stderr, "ERROR: loading BPF object file failed\n");
+               goto cleanup;
+       }
+
+       map_fd[0] = bpf_object__find_map_fd_by_name(obj, "my_map");
+       map_fd[1] = bpf_object__find_map_fd_by_name(obj, "my_hist_map");
+       if (map_fd[0] < 0 || map_fd[1] < 0) {
+               fprintf(stderr, "ERROR: finding a map in obj file failed\n");
+               goto cleanup;
+       }
+
        signal(SIGINT, int_exit);
        signal(SIGTERM, int_exit);
 
@@ -138,9 +161,14 @@ int main(int ac, char **argv)
        f = popen("dd if=/dev/zero of=/dev/null count=5000000", "r");
        (void) f;
 
-       if (load_bpf_file(filename)) {
-               printf("%s", bpf_log_buf);
-               return 1;
+       bpf_object__for_each_program(prog, obj) {
+               links[j] = bpf_program__attach(prog);
+               if (libbpf_get_error(links[j])) {
+                       fprintf(stderr, "ERROR: bpf_program__attach failed\n");
+                       links[j] = NULL;
+                       goto cleanup;
+               }
+               j++;
        }
 
        for (i = 0; i < 5; i++) {
@@ -156,5 +184,10 @@ int main(int ac, char **argv)
        }
        print_hist(map_fd[1]);
 
+cleanup:
+       for (j--; j >= 0; j--)
+               bpf_link__destroy(links[j]);
+
+       bpf_object__close(obj);
        return 0;
 }
index fe21c14feb8da583a3656a81f4d77d2593b0b3b3..659613c19a82ae5e07ee0e90f156165fba3ed32c 100644 (file)
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_tracing.h>
 
-struct bpf_map_def SEC("maps") my_map = {
-       .type = BPF_MAP_TYPE_HASH,
-       .key_size = sizeof(long),
-       .value_size = sizeof(u64),
-       .max_entries = 4096,
-};
+struct {
+       __uint(type, BPF_MAP_TYPE_HASH);
+       __type(key, long);
+       __type(value, u64);
+       __uint(max_entries, 4096);
+} my_map SEC(".maps");
 
 /* kprobe is NOT a stable ABI. If kernel internals change this bpf+kprobe
  * example will no longer be meaningful
@@ -42,12 +42,12 @@ static unsigned int log2l(unsigned long long n)
 
 #define SLOTS 100
 
-struct bpf_map_def SEC("maps") lat_map = {
-       .type = BPF_MAP_TYPE_PERCPU_ARRAY,
-       .key_size = sizeof(u32),
-       .value_size = sizeof(u64),
-       .max_entries = SLOTS,
-};
+struct {
+       __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+       __uint(key_size, sizeof(u32));
+       __uint(value_size, sizeof(u64));
+       __uint(max_entries, SLOTS);
+} lat_map SEC(".maps");
 
 SEC("kprobe/blk_account_io_completion")
 int bpf_prog2(struct pt_regs *ctx)
index cf8fedc773f28eccbf9b6f2b65e19a35fe28692d..70e987775c156f90e9c9a383fe1252a6c37b249f 100644 (file)
@@ -7,11 +7,10 @@
 #include <unistd.h>
 #include <stdbool.h>
 #include <string.h>
-#include <linux/bpf.h>
 #include <sys/resource.h>
 
 #include <bpf/bpf.h>
-#include "bpf_load.h"
+#include <bpf/libbpf.h>
 #include "bpf_util.h"
 
 #define SLOTS 100
@@ -109,20 +108,11 @@ static void print_hist(int fd)
 int main(int ac, char **argv)
 {
        struct rlimit r = {1024*1024, RLIM_INFINITY};
+       struct bpf_link *links[2];
+       struct bpf_program *prog;
+       struct bpf_object *obj;
        char filename[256];
-       int i;
-
-       snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
-
-       if (setrlimit(RLIMIT_MEMLOCK, &r)) {
-               perror("setrlimit(RLIMIT_MEMLOCK)");
-               return 1;
-       }
-
-       if (load_bpf_file(filename)) {
-               printf("%s", bpf_log_buf);
-               return 1;
-       }
+       int map_fd, i, j = 0;
 
        for (i = 1; i < ac; i++) {
                if (strcmp(argv[i], "-a") == 0) {
@@ -137,6 +127,40 @@ int main(int ac, char **argv)
                }
        }
 
+       if (setrlimit(RLIMIT_MEMLOCK, &r)) {
+               perror("setrlimit(RLIMIT_MEMLOCK)");
+               return 1;
+       }
+
+       snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+       obj = bpf_object__open_file(filename, NULL);
+       if (libbpf_get_error(obj)) {
+               fprintf(stderr, "ERROR: opening BPF object file failed\n");
+               return 0;
+       }
+
+       /* load BPF program */
+       if (bpf_object__load(obj)) {
+               fprintf(stderr, "ERROR: loading BPF object file failed\n");
+               goto cleanup;
+       }
+
+       map_fd = bpf_object__find_map_fd_by_name(obj, "lat_map");
+       if (map_fd < 0) {
+               fprintf(stderr, "ERROR: finding a map in obj file failed\n");
+               goto cleanup;
+       }
+
+       bpf_object__for_each_program(prog, obj) {
+               links[j] = bpf_program__attach(prog);
+               if (libbpf_get_error(links[j])) {
+                       fprintf(stderr, "ERROR: bpf_program__attach failed\n");
+                       links[j] = NULL;
+                       goto cleanup;
+               }
+               j++;
+       }
+
        printf("  heatmap of IO latency\n");
        if (text_only)
                printf("  %s", sym[num_colors - 1]);
@@ -153,9 +177,14 @@ int main(int ac, char **argv)
        for (i = 0; ; i++) {
                if (i % 20 == 0)
                        print_banner();
-               print_hist(map_fd[1]);
+               print_hist(map_fd);
                sleep(2);
        }
 
+cleanup:
+       for (j--; j >= 0; j--)
+               bpf_link__destroy(links[j]);
+
+       bpf_object__close(obj);
        return 0;
 }
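
For context, print_hist() now reads lat_map through the single map_fd looked up by name. A hedged sketch of how one slot of a BPF_MAP_TYPE_PERCPU_ARRAY can be read and summed across CPUs from user space (the helper name read_slot_total is hypothetical, not from the patch):

#include <bpf/bpf.h>
#include "bpf_util.h"	/* bpf_num_possible_cpus() */

static unsigned long long read_slot_total(int map_fd, unsigned int slot)
{
	unsigned int nr_cpus = bpf_num_possible_cpus();
	unsigned long long values[nr_cpus], sum = 0;
	unsigned int cpu;

	/* a per-CPU array lookup fills one value per possible CPU */
	if (bpf_map_lookup_elem(map_fd, &slot, values))
		return 0;
	for (cpu = 0; cpu < nr_cpus; cpu++)
		sum += values[cpu];
	return sum;
}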
index b1bb9df88f8e1609f56719e0bfa6ddb484b51496..eb0f8fdd14bf5b351daaca53359df335abd06421 100644 (file)
@@ -15,12 +15,12 @@ struct pair {
        u64 ip;
 };
 
-struct bpf_map_def SEC("maps") my_map = {
-       .type = BPF_MAP_TYPE_HASH,
-       .key_size = sizeof(long),
-       .value_size = sizeof(struct pair),
-       .max_entries = 1000000,
-};
+struct {
+       __uint(type, BPF_MAP_TYPE_HASH);
+       __type(key, long);
+       __type(value, struct pair);
+       __uint(max_entries, 1000000);
+} my_map SEC(".maps");
 
 /* kprobe is NOT a stable ABI. If kernel internals change this bpf+kprobe
  * example will no longer be meaningful
index ec52203fce39a7bab3b52eec5847f037b77d296e..e8faf8f184ae1a64f5d7fb39139079934fcf976d 100644 (file)
@@ -8,11 +8,10 @@
 #include <stdbool.h>
 #include <string.h>
 #include <time.h>
-#include <linux/bpf.h>
 #include <sys/resource.h>
 
 #include <bpf/bpf.h>
-#include "bpf_load.h"
+#include <bpf/libbpf.h>
 
 struct pair {
        long long val;
@@ -36,8 +35,8 @@ static void print_old_objects(int fd)
        key = write(1, "\e[1;1H\e[2J", 12); /* clear screen */
 
        key = -1;
-       while (bpf_map_get_next_key(map_fd[0], &key, &next_key) == 0) {
-               bpf_map_lookup_elem(map_fd[0], &next_key, &v);
+       while (bpf_map_get_next_key(fd, &key, &next_key) == 0) {
+               bpf_map_lookup_elem(fd, &next_key, &v);
                key = next_key;
                if (val - v.val < 1000000000ll)
                        /* object was allocated more than 1 sec ago */
@@ -50,25 +49,55 @@ static void print_old_objects(int fd)
 int main(int ac, char **argv)
 {
        struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
+       struct bpf_link *links[2];
+       struct bpf_program *prog;
+       struct bpf_object *obj;
        char filename[256];
-       int i;
-
-       snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+       int map_fd, i, j = 0;
 
        if (setrlimit(RLIMIT_MEMLOCK, &r)) {
                perror("setrlimit(RLIMIT_MEMLOCK, RLIM_INFINITY)");
                return 1;
        }
 
-       if (load_bpf_file(filename)) {
-               printf("%s", bpf_log_buf);
-               return 1;
+       snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+       obj = bpf_object__open_file(filename, NULL);
+       if (libbpf_get_error(obj)) {
+               fprintf(stderr, "ERROR: opening BPF object file failed\n");
+               return 0;
+       }
+
+       /* load BPF program */
+       if (bpf_object__load(obj)) {
+               fprintf(stderr, "ERROR: loading BPF object file failed\n");
+               goto cleanup;
+       }
+
+       map_fd = bpf_object__find_map_fd_by_name(obj, "my_map");
+       if (map_fd < 0) {
+               fprintf(stderr, "ERROR: finding a map in obj file failed\n");
+               goto cleanup;
+       }
+
+       bpf_object__for_each_program(prog, obj) {
+               links[j] = bpf_program__attach(prog);
+               if (libbpf_get_error(links[j])) {
+                       fprintf(stderr, "ERROR: bpf_program__attach failed\n");
+                       links[j] = NULL;
+                       goto cleanup;
+               }
+               j++;
        }
 
        for (i = 0; ; i++) {
-               print_old_objects(map_fd[1]);
+               print_old_objects(map_fd);
                sleep(1);
        }
 
+cleanup:
+       for (j--; j >= 0; j--)
+               bpf_link__destroy(links[j]);
+
+       bpf_object__close(obj);
        return 0;
 }
index 481790fde8645510d10379e95e47ffefde1be7b2..32b49e8ab6bd0d23f8408426ff823e6a2f321894 100644 (file)
 
 #define PROG(F) SEC("kprobe/"__stringify(F)) int bpf_func_##F
 
-struct bpf_map_def SEC("maps") progs = {
-       .type = BPF_MAP_TYPE_PROG_ARRAY,
-       .key_size = sizeof(u32),
-       .value_size = sizeof(u32),
+struct {
+       __uint(type, BPF_MAP_TYPE_PROG_ARRAY);
+       __uint(key_size, sizeof(u32));
+       __uint(value_size, sizeof(u32));
 #ifdef __mips__
-       .max_entries = 6000, /* MIPS n64 syscalls start at 5000 */
+       __uint(max_entries, 6000); /* MIPS n64 syscalls start at 5000 */
 #else
-       .max_entries = 1024,
+       __uint(max_entries, 1024);
 #endif
-};
+} progs SEC(".maps");
 
 SEC("kprobe/__seccomp_filter")
 int bpf_prog1(struct pt_regs *ctx)
index c2317b39e0d25d5812ff6800553cd6bac9044b4c..98dad57a96c4e1b77891b0a699363beb90fe55ee 100644 (file)
@@ -1,15 +1,21 @@
 // SPDX-License-Identifier: GPL-2.0
 #include <stdio.h>
-#include <linux/bpf.h>
+#include <stdlib.h>
 #include <unistd.h>
 #include <linux/filter.h>
 #include <linux/seccomp.h>
 #include <sys/prctl.h>
 #include <bpf/bpf.h>
-#include "bpf_load.h"
+#include <bpf/libbpf.h>
 #include <sys/resource.h>
 #include "trace_helpers.h"
 
+#ifdef __mips__
+#define        MAX_ENTRIES  6000 /* MIPS n64 syscalls start at 5000 */
+#else
+#define        MAX_ENTRIES  1024
+#endif
+
 /* install fake seccomp program to enable seccomp code path inside the kernel,
  * so that our kprobe attached to seccomp_phase1() can be triggered
  */
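
One way to install such an allow-everything filter is a classic BPF program that unconditionally returns SECCOMP_RET_ALLOW. The sketch below mirrors what the sample's helper does, but is written here purely for illustration:

#include <stdio.h>
#include <sys/prctl.h>
#include <linux/filter.h>
#include <linux/seccomp.h>

static void install_allow_all_filter(void)
{
	struct sock_filter filter[] = {
		BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_ALLOW),
	};
	struct sock_fprog prog = {
		.len = (unsigned short)(sizeof(filter) / sizeof(filter[0])),
		.filter = filter,
	};

	/* required before installing a filter without CAP_SYS_ADMIN */
	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0))
		perror("prctl(PR_SET_NO_NEW_PRIVS)");
	if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog))
		perror("prctl(PR_SET_SECCOMP)");
}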
@@ -28,16 +34,57 @@ static void install_accept_all_seccomp(void)
 
 int main(int ac, char **argv)
 {
-       FILE *f;
-       char filename[256];
        struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
+       struct bpf_link *link = NULL;
+       struct bpf_program *prog;
+       struct bpf_object *obj;
+       int key, fd, progs_fd;
+       char filename[256];
+       const char *title;
+       FILE *f;
 
-       snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
        setrlimit(RLIMIT_MEMLOCK, &r);
 
-       if (load_bpf_file(filename)) {
-               printf("%s", bpf_log_buf);
-               return 1;
+       snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+       obj = bpf_object__open_file(filename, NULL);
+       if (libbpf_get_error(obj)) {
+               fprintf(stderr, "ERROR: opening BPF object file failed\n");
+               return 0;
+       }
+
+       prog = bpf_object__find_program_by_name(obj, "bpf_prog1");
+       if (!prog) {
+               printf("finding a prog in obj file failed\n");
+               goto cleanup;
+       }
+
+       /* load BPF program */
+       if (bpf_object__load(obj)) {
+               fprintf(stderr, "ERROR: loading BPF object file failed\n");
+               goto cleanup;
+       }
+
+       link = bpf_program__attach(prog);
+       if (libbpf_get_error(link)) {
+               fprintf(stderr, "ERROR: bpf_program__attach failed\n");
+               link = NULL;
+               goto cleanup;
+       }
+
+       progs_fd = bpf_object__find_map_fd_by_name(obj, "progs");
+       if (progs_fd < 0) {
+               fprintf(stderr, "ERROR: finding a map in obj file failed\n");
+               goto cleanup;
+       }
+
+       bpf_object__for_each_program(prog, obj) {
+               title = bpf_program__title(prog, false);
+               /* register only syscalls to PROG_ARRAY */
+               if (sscanf(title, "kprobe/%d", &key) != 1)
+                       continue;
+
+               fd = bpf_program__fd(prog);
+               bpf_map_update_elem(progs_fd, &key, &fd, BPF_ANY);
        }
 
        install_accept_all_seccomp();
@@ -47,5 +94,8 @@ int main(int ac, char **argv)
 
        read_trace_pipe();
 
+cleanup:
+       bpf_link__destroy(link);
+       bpf_object__close(obj);
        return 0;
 }
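
On the kernel side, each program registered into the "progs" PROG_ARRAY by the loop above is reached through a tail call keyed by syscall number. A hedged sketch of that dispatch pattern follows (the sample's own kernel program is not shown in this diff; the body here is illustrative):

#include <linux/ptrace.h>
#include <uapi/linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

/* same shape as the PROG_ARRAY converted above */
struct {
	__uint(type, BPF_MAP_TYPE_PROG_ARRAY);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u32));
	__uint(max_entries, 1024);
} progs SEC(".maps");

SEC("kprobe/__seccomp_filter")
int dispatch(struct pt_regs *ctx)
{
	int sc_nr = (int)PT_REGS_PARM1(ctx);

	/* jump to the program registered under key sc_nr, if any */
	bpf_tail_call(ctx, &progs, sc_nr);

	/* falls through when no entry exists for this syscall number */
	return 0;
}

char _license[] SEC("license") = "GPL";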
index 96c234efa852fc189cd4c3a3973f8c398547140f..acad5712d8b4f564d6a61ea9ca02ad19f85df424 100644 (file)
@@ -3,24 +3,26 @@
 #include <uapi/linux/bpf.h>
 #include <bpf/bpf_helpers.h>
 
-struct bpf_map_def SEC("maps") counters = {
-       .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
-       .key_size = sizeof(int),
-       .value_size = sizeof(u32),
-       .max_entries = 64,
-};
-struct bpf_map_def SEC("maps") values = {
-       .type = BPF_MAP_TYPE_HASH,
-       .key_size = sizeof(int),
-       .value_size = sizeof(u64),
-       .max_entries = 64,
-};
-struct bpf_map_def SEC("maps") values2 = {
-       .type = BPF_MAP_TYPE_HASH,
-       .key_size = sizeof(int),
-       .value_size = sizeof(struct bpf_perf_event_value),
-       .max_entries = 64,
-};
+struct {
+       __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
+       __uint(key_size, sizeof(int));
+       __uint(value_size, sizeof(u32));
+       __uint(max_entries, 64);
+} counters SEC(".maps");
+
+struct {
+       __uint(type, BPF_MAP_TYPE_HASH);
+       __type(key, int);
+       __type(value, u64);
+       __uint(max_entries, 64);
+} values SEC(".maps");
+
+struct {
+       __uint(type, BPF_MAP_TYPE_HASH);
+       __type(key, int);
+       __type(value, struct bpf_perf_event_value);
+       __uint(max_entries, 64);
+} values2 SEC(".maps");
 
 SEC("kprobe/htab_map_get_next_key")
 int bpf_prog1(struct pt_regs *ctx)
index 4bb3c830adb283783dac07db75e2391c8eddde72..33df9784775db0c82e050f66dd3b3dac2291d170 100644 (file)
@@ -4,7 +4,6 @@
 #include <assert.h>
 #include <fcntl.h>
 #include <linux/perf_event.h>
-#include <linux/bpf.h>
 #include <sched.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <sys/wait.h>
 #include <unistd.h>
 
-#include "bpf_load.h"
 #include <bpf/bpf.h>
+#include <bpf/libbpf.h>
 #include "perf-sys.h"
 
 #define SAMPLE_PERIOD  0x7fffffffffffffffULL
 
+/* counters, values, values2 */
+static int map_fd[3];
+
 static void check_on_cpu(int cpu, struct perf_event_attr *attr)
 {
        struct bpf_perf_event_value value2;
@@ -174,16 +176,51 @@ static void test_bpf_perf_event(void)
 int main(int argc, char **argv)
 {
        struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
+       struct bpf_link *links[2];
+       struct bpf_program *prog;
+       struct bpf_object *obj;
        char filename[256];
+       int i = 0;
+
+       setrlimit(RLIMIT_MEMLOCK, &r);
 
        snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+       obj = bpf_object__open_file(filename, NULL);
+       if (libbpf_get_error(obj)) {
+               fprintf(stderr, "ERROR: opening BPF object file failed\n");
+               return 0;
+       }
 
-       setrlimit(RLIMIT_MEMLOCK, &r);
-       if (load_bpf_file(filename)) {
-               printf("%s", bpf_log_buf);
-               return 1;
+       /* load BPF program */
+       if (bpf_object__load(obj)) {
+               fprintf(stderr, "ERROR: loading BPF object file failed\n");
+               goto cleanup;
+       }
+
+       map_fd[0] = bpf_object__find_map_fd_by_name(obj, "counters");
+       map_fd[1] = bpf_object__find_map_fd_by_name(obj, "values");
+       map_fd[2] = bpf_object__find_map_fd_by_name(obj, "values2");
+       if (map_fd[0] < 0 || map_fd[1] < 0 || map_fd[2] < 0) {
+               fprintf(stderr, "ERROR: finding a map in obj file failed\n");
+               goto cleanup;
+       }
+
+       bpf_object__for_each_program(prog, obj) {
+               links[i] = bpf_program__attach(prog);
+               if (libbpf_get_error(links[i])) {
+                       fprintf(stderr, "ERROR: bpf_program__attach failed\n");
+                       links[i] = NULL;
+                       goto cleanup;
+               }
+               i++;
        }
 
        test_bpf_perf_event();
+
+cleanup:
+       for (i--; i >= 0; i--)
+               bpf_link__destroy(links[i]);
+
+       bpf_object__close(obj);
        return 0;
 }
index ea6dae78f0dff11e16a73105f830dd6cdb8ba469..fdcd6580dd736a23639e385f7da363a442064db9 100644 (file)
@@ -1,28 +1,51 @@
 #define _GNU_SOURCE
 
 #include <stdio.h>
-#include <linux/bpf.h>
 #include <unistd.h>
-#include <bpf/bpf.h>
-#include "bpf_load.h"
+#include <bpf/libbpf.h>
 
 int main(int argc, char **argv)
 {
-       FILE *f;
+       struct bpf_link *link = NULL;
+       struct bpf_program *prog;
+       struct bpf_object *obj;
        char filename[256];
        char command[256];
-       int ret;
+       int ret = 0;
+       FILE *f;
 
        snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+       obj = bpf_object__open_file(filename, NULL);
+       if (libbpf_get_error(obj)) {
+               fprintf(stderr, "ERROR: opening BPF object file failed\n");
+               return 0;
+       }
+
+       prog = bpf_object__find_program_by_name(obj, "bpf_prog1");
+       if (!prog) {
+               fprintf(stderr, "ERROR: finding a prog in obj file failed\n");
+               goto cleanup;
+       }
+
+       /* load BPF program */
+       if (bpf_object__load(obj)) {
+               fprintf(stderr, "ERROR: loading BPF object file failed\n");
+               goto cleanup;
+       }
 
-       if (load_bpf_file(filename)) {
-               printf("%s", bpf_log_buf);
-               return 1;
+       link = bpf_program__attach(prog);
+       if (libbpf_get_error(link)) {
+               fprintf(stderr, "ERROR: bpf_program__attach failed\n");
+               link = NULL;
+               goto cleanup;
        }
 
        snprintf(command, 256, "mount %s tmpmnt/", argv[1]);
        f = popen(command, "r");
        ret = pclose(f);
 
+cleanup:
+       bpf_link__destroy(link);
+       bpf_object__close(obj);
        return ret ? 0 : 1;
 }
index 9b8f21abeac475db6e101f4f3bf88c29f74d8b1a..f3468168982e6466064fab1a935ee39cea0eacfd 100644 (file)
@@ -19,9 +19,6 @@ static const char *__doc__ =
 #include <time.h>
 #include <linux/limits.h>
 
-#define __must_check
-#include <linux/err.h>
-
 #include <arpa/inet.h>
 #include <linux/if_link.h>
 
@@ -622,7 +619,7 @@ static struct bpf_link * attach_tp(struct bpf_object *obj,
        }
 
        link = bpf_program__attach_tracepoint(prog, tp_category, tp_name);
-       if (IS_ERR(link))
+       if (libbpf_get_error(link))
                exit(EXIT_FAIL_BPF);
 
        return link;
index e4d9da654e84244d04768b2aa348ff59f2b6c440..a226aee3574fd34a552825758fcac49232954b64 100644 (file)
@@ -29,8 +29,8 @@ CGROUP COMMANDS
 |      *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* }
 |      *ATTACH_TYPE* := { **ingress** | **egress** | **sock_create** | **sock_ops** | **device** |
 |              **bind4** | **bind6** | **post_bind4** | **post_bind6** | **connect4** | **connect6** |
-|              **sendmsg4** | **sendmsg6** | **recvmsg4** | **recvmsg6** | **sysctl** |
-|              **getsockopt** | **setsockopt** }
+|               **getpeername4** | **getpeername6** | **getsockname4** | **getsockname6** | **sendmsg4** |
+|               **sendmsg6** | **recvmsg4** | **recvmsg6** | **sysctl** | **getsockopt** | **setsockopt** }
 |      *ATTACH_FLAGS* := { **multi** | **override** }
 
 DESCRIPTION
@@ -101,7 +101,11 @@ DESCRIPTION
                   an unconnected udp6 socket (since 5.2);
                  **sysctl** sysctl access (since 5.2);
                  **getsockopt** call to getsockopt (since 5.3);
-                 **setsockopt** call to setsockopt (since 5.3).
+                 **setsockopt** call to setsockopt (since 5.3);
+                 **getpeername4** call to getpeername(2) for an inet4 socket (since 5.8);
+                 **getpeername6** call to getpeername(2) for an inet6 socket (since 5.8);
+                 **getsockname4** call to getsockname(2) for an inet4 socket (since 5.8);
+                 **getsockname6** call to getsockname(2) for an inet6 socket (since 5.8).
 
        **bpftool cgroup detach** *CGROUP* *ATTACH_TYPE* *PROG*
                  Detach *PROG* from the cgroup *CGROUP* and attach type
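
To make the new attach types concrete, a minimal hypothetical program for the **getpeername4** hook might look as follows (not part of this patch). Returning 1 lets the getpeername(2) call proceed; the program may also rewrite the peer address presented to the caller through the bpf_sock_addr context:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("cgroup/getpeername4")
int pass_getpeername4(struct bpf_sock_addr *ctx)
{
	/* 1 == allow the syscall; ctx fields may be rewritten here */
	return 1;
}

char _license[] SEC("license") = "GPL";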
index 5948e9d89c8d8e96872b6d9b08edf808665db180..2b254959d4880e8892e2337fa202377c695155ac 100644 (file)
@@ -41,7 +41,8 @@ PROG COMMANDS
 |              **cgroup/sock** | **cgroup/dev** | **lwt_in** | **lwt_out** | **lwt_xmit** |
 |              **lwt_seg6local** | **sockops** | **sk_skb** | **sk_msg** | **lirc_mode2** |
 |              **cgroup/bind4** | **cgroup/bind6** | **cgroup/post_bind4** | **cgroup/post_bind6** |
-|              **cgroup/connect4** | **cgroup/connect6** | **cgroup/sendmsg4** | **cgroup/sendmsg6** |
+|              **cgroup/connect4** | **cgroup/connect6** | **cgroup/getpeername4** | **cgroup/getpeername6** |
+|               **cgroup/getsockname4** | **cgroup/getsockname6** | **cgroup/sendmsg4** | **cgroup/sendmsg6** |
 |              **cgroup/recvmsg4** | **cgroup/recvmsg6** | **cgroup/sysctl** |
 |              **cgroup/getsockopt** | **cgroup/setsockopt** |
 |              **struct_ops** | **fentry** | **fexit** | **freplace**
index 9f0f20e73b87a57e630942f42bcc8d612da2e000..25b25aca11120c573e829ae419d166b303ce9340 100644 (file)
@@ -472,6 +472,8 @@ _bpftool()
                                 lwt_seg6local sockops sk_skb sk_msg \
                                 lirc_mode2 cgroup/bind4 cgroup/bind6 \
                                 cgroup/connect4 cgroup/connect6 \
+                                cgroup/getpeername4 cgroup/getpeername6 \
+                                cgroup/getsockname4 cgroup/getsockname6 \
                                 cgroup/sendmsg4 cgroup/sendmsg6 \
                                 cgroup/recvmsg4 cgroup/recvmsg6 \
                                 cgroup/post_bind4 cgroup/post_bind6 \
@@ -966,9 +968,10 @@ _bpftool()
                     ;;
                 attach|detach)
                     local ATTACH_TYPES='ingress egress sock_create sock_ops \
-                        device bind4 bind6 post_bind4 post_bind6 connect4 \
-                        connect6 sendmsg4 sendmsg6 recvmsg4 recvmsg6 sysctl \
-                        getsockopt setsockopt'
+                        device bind4 bind6 post_bind4 post_bind6 connect4 connect6 \
+                        getpeername4 getpeername6 getsockname4 getsockname6 \
+                        sendmsg4 sendmsg6 recvmsg4 recvmsg6 sysctl getsockopt \
+                        setsockopt'
                     local ATTACH_FLAGS='multi override'
                     local PROG_TYPE='id pinned tag name'
                     case $prev in
@@ -977,9 +980,9 @@ _bpftool()
                             return 0
                             ;;
                         ingress|egress|sock_create|sock_ops|device|bind4|bind6|\
-                        post_bind4|post_bind6|connect4|connect6|sendmsg4|\
-                        sendmsg6|recvmsg4|recvmsg6|sysctl|getsockopt|\
-                        setsockopt)
+                        post_bind4|post_bind6|connect4|connect6|getpeername4|\
+                        getpeername6|getsockname4|getsockname6|sendmsg4|sendmsg6|\
+                        recvmsg4|recvmsg6|sysctl|getsockopt|setsockopt)
                             COMPREPLY=( $( compgen -W "$PROG_TYPE" -- \
                                 "$cur" ) )
                             return 0
index 1693c802bb20705be434a8fc3ab073e654511c6e..27931db421d810b0820bac1f45dbc44d4903e7ca 100644 (file)
        "       ATTACH_TYPE := { ingress | egress | sock_create |\n"           \
        "                        sock_ops | device | bind4 | bind6 |\n"        \
        "                        post_bind4 | post_bind6 | connect4 |\n"       \
-       "                        connect6 | sendmsg4 | sendmsg6 |\n"           \
-       "                        recvmsg4 | recvmsg6 | sysctl |\n"             \
-       "                        getsockopt | setsockopt }"
+       "                        connect6 | getpeername4 | getpeername6 |\n"   \
+       "                        getsockname4 | getsockname6 | sendmsg4 |\n"   \
+       "                        sendmsg6 | recvmsg4 | recvmsg6 |\n"           \
+       "                        sysctl | getsockopt | setsockopt }"
 
 static unsigned int query_flags;
 
index f89ac70ef973f7f5f3a695135b65c85093a31598..5cdf0bc049bd9e7bcc142e2201dbc5373a9ee609 100644 (file)
@@ -100,6 +100,10 @@ static const char * const attach_type_name[__MAX_BPF_ATTACH_TYPE] = {
        [BPF_CGROUP_INET6_CONNECT] = "connect6",
        [BPF_CGROUP_INET4_POST_BIND] = "post_bind4",
        [BPF_CGROUP_INET6_POST_BIND] = "post_bind6",
+       [BPF_CGROUP_INET4_GETPEERNAME] = "getpeername4",
+       [BPF_CGROUP_INET6_GETPEERNAME] = "getpeername6",
+       [BPF_CGROUP_INET4_GETSOCKNAME] = "getsockname4",
+       [BPF_CGROUP_INET6_GETSOCKNAME] = "getsockname6",
        [BPF_CGROUP_UDP4_SENDMSG] = "sendmsg4",
        [BPF_CGROUP_UDP6_SENDMSG] = "sendmsg6",
        [BPF_CGROUP_SYSCTL] = "sysctl",
index b6e5ba568f98c5d4fa302a37ddb7d73fb305344e..245f941fdbcf1e30790f036caab4250e6ab1f57a 100644 (file)
@@ -2012,8 +2012,10 @@ static int do_help(int argc, char **argv)
                "                 sk_reuseport | flow_dissector | cgroup/sysctl |\n"
                "                 cgroup/bind4 | cgroup/bind6 | cgroup/post_bind4 |\n"
                "                 cgroup/post_bind6 | cgroup/connect4 | cgroup/connect6 |\n"
-               "                 cgroup/sendmsg4 | cgroup/sendmsg6 | cgroup/recvmsg4 |\n"
-               "                 cgroup/recvmsg6 | cgroup/getsockopt | cgroup/setsockopt |\n"
+               "                 cgroup/getpeername4 | cgroup/getpeername6 |\n"
+               "                 cgroup/getsockname4 | cgroup/getsockname6 | cgroup/sendmsg4 |\n"
+               "                 cgroup/sendmsg6 | cgroup/recvmsg4 | cgroup/recvmsg6 |\n"
+               "                 cgroup/getsockopt | cgroup/setsockopt |\n"
                "                 struct_ops | fentry | fexit | freplace }\n"
                "       ATTACH_TYPE := { msg_verdict | stream_verdict | stream_parser |\n"
                "                        flow_dissector }\n"
index 146c742f1d491a91be76ea5fa06c615530f6a2be..97e1fd19ff58ae0b1a69ea293958ba6353931ba4 100644 (file)
@@ -73,7 +73,7 @@ struct bpf_insn {
 /* Key of a BPF_MAP_TYPE_LPM_TRIE entry */
 struct bpf_lpm_trie_key {
        __u32   prefixlen;      /* up to 32 for AF_INET, 128 for AF_INET6 */
-       __u8    data[]; /* Arbitrary size */
+       __u8    data[0];        /* Arbitrary size */
 };
 
 struct bpf_cgroup_storage_key {
@@ -220,6 +220,10 @@ enum bpf_attach_type {
        BPF_MODIFY_RETURN,
        BPF_LSM_MAC,
        BPF_TRACE_ITER,
+       BPF_CGROUP_INET4_GETPEERNAME,
+       BPF_CGROUP_INET6_GETPEERNAME,
+       BPF_CGROUP_INET4_GETSOCKNAME,
+       BPF_CGROUP_INET6_GETSOCKNAME,
        __MAX_BPF_ATTACH_TYPE
 };
 
@@ -2015,8 +2019,8 @@ union bpf_attr {
  * int bpf_xdp_adjust_tail(struct xdp_buff *xdp_md, int delta)
  *     Description
  *             Adjust (move) *xdp_md*\ **->data_end** by *delta* bytes. It is
- *             only possible to shrink the packet as of this writing,
- *             therefore *delta* must be a negative integer.
+ *             possible to both shrink and grow the packet tail.
+ *             possible to both shrink and grow the packet tail. Shrinking is
+ *             done by passing a negative *delta*.
  *
  *             A call to this helper is susceptible to change the underlying
  *             packet buffer. Therefore, at load time, all checks on pointers
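
A short sketch of the helper in use (illustrative, not from this patch): a negative *delta* still shrinks the frame, while after this change a positive *delta* may grow the tail where tailroom is available.

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("xdp")
int trim_tail(struct xdp_md *ctx)
{
	/* drop the last 4 bytes of the packet */
	if (bpf_xdp_adjust_tail(ctx, -4))
		return XDP_ABORTED;
	return XDP_PASS;
}

char _license[] SEC("license") = "GPL";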
index cffb96202e0d8e8aed4d98a1df5bde751805fa29..a405dad068f59dd6ab96a40759adc683fdcbf2c2 100644 (file)
@@ -60,7 +60,7 @@ struct hashmap *hashmap__new(hashmap_hash_fn hash_fn,
 void hashmap__clear(struct hashmap *map)
 {
        struct hashmap_entry *cur, *tmp;
-       int bkt;
+       size_t bkt;
 
        hashmap__for_each_entry_safe(map, cur, tmp, bkt) {
                free(cur);
@@ -100,8 +100,7 @@ static int hashmap_grow(struct hashmap *map)
        struct hashmap_entry **new_buckets;
        struct hashmap_entry *cur, *tmp;
        size_t new_cap_bits, new_cap;
-       size_t h;
-       int bkt;
+       size_t h, bkt;
 
        new_cap_bits = map->cap_bits + 1;
        if (new_cap_bits < HASHMAP_MIN_CAP_BITS)
index bae8879cdf58ae659ae8bfa71da0cf3f3877b294..e823b35e73717643c1ea50cc5b4da76aa81cbc64 100644 (file)
@@ -15,7 +15,6 @@
 #else
 #include <bits/reg.h>
 #endif
-#include "libbpf_internal.h"
 
 static inline size_t hash_bits(size_t h, int bits)
 {
index 2922579954877d999ce4f83679e29d562da51dcf..fa04cbe547ed8657391e968623b03036da3d5861 100644 (file)
@@ -6705,6 +6705,14 @@ static const struct bpf_sec_def section_defs[] = {
                                                BPF_CGROUP_UDP4_RECVMSG),
        BPF_EAPROG_SEC("cgroup/recvmsg6",       BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
                                                BPF_CGROUP_UDP6_RECVMSG),
+       BPF_EAPROG_SEC("cgroup/getpeername4",   BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
+                                               BPF_CGROUP_INET4_GETPEERNAME),
+       BPF_EAPROG_SEC("cgroup/getpeername6",   BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
+                                               BPF_CGROUP_INET6_GETPEERNAME),
+       BPF_EAPROG_SEC("cgroup/getsockname4",   BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
+                                               BPF_CGROUP_INET4_GETSOCKNAME),
+       BPF_EAPROG_SEC("cgroup/getsockname6",   BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
+                                               BPF_CGROUP_INET6_GETSOCKNAME),
        BPF_EAPROG_SEC("cgroup/sysctl",         BPF_PROG_TYPE_CGROUP_SYSCTL,
                                                BPF_CGROUP_SYSCTL),
        BPF_EAPROG_SEC("cgroup/getsockopt",     BPF_PROG_TYPE_CGROUP_SOCKOPT,
index 0f67f1b470b0834c2317ef5573a8266b79b002b3..e885d351595fc0cef47313dd1359515e0fe4ea65 100644 (file)
@@ -1,6 +1,8 @@
 ==================
 BPF Selftest Notes
 ==================
+General instructions on running selftests can be found in
+`Documentation/bpf/bpf_devel_QA.rst`_.
 
 Additional information about selftest failures is
 documented here.
index 60e3ae5d4e48006a6965b67defd453e06f014d18..2118e23ac07a8e35cd219a6a5ccd439cdf7116b5 100644 (file)
@@ -25,6 +25,7 @@ CONFIG_XDP_SOCKETS=y
 CONFIG_FTRACE_SYSCALLS=y
 CONFIG_IPV6_TUNNEL=y
 CONFIG_IPV6_GRE=y
+CONFIG_IPV6_SEG6_BPF=y
 CONFIG_NET_FOU=m
 CONFIG_NET_FOU_IP_TUNNELS=y
 CONFIG_IPV6_FOU=m
@@ -37,3 +38,4 @@ CONFIG_IPV6_SIT=m
 CONFIG_BPF_JIT=y
 CONFIG_BPF_LSM=y
 CONFIG_SECURITY=y
+CONFIG_LIRC=y
index 999a775484c1928f14658df6df1894427dc67c51..e36dd1a1780d813f5817f5a552eb589fdb7ee7b2 100644 (file)
@@ -5,6 +5,8 @@
 #include <string.h>
 #include <unistd.h>
 
+#include <arpa/inet.h>
+
 #include <sys/epoll.h>
 
 #include <linux/err.h>
@@ -35,7 +37,7 @@ struct ipv6_packet pkt_v6 = {
        .tcp.doff = 5,
 };
 
-int start_server(int family, int type)
+int start_server_with_port(int family, int type, __u16 port)
 {
        struct sockaddr_storage addr = {};
        socklen_t len;
@@ -45,11 +47,13 @@ int start_server(int family, int type)
                struct sockaddr_in *sin = (void *)&addr;
 
                sin->sin_family = AF_INET;
+               sin->sin_port = htons(port);
                len = sizeof(*sin);
        } else {
                struct sockaddr_in6 *sin6 = (void *)&addr;
 
                sin6->sin6_family = AF_INET6;
+               sin6->sin6_port = htons(port);
                len = sizeof(*sin6);
        }
 
@@ -76,6 +80,11 @@ int start_server(int family, int type)
        return fd;
 }
 
+int start_server(int family, int type)
+{
+       return start_server_with_port(family, type, 0);
+}
+
 static const struct timeval timeo_sec = { .tv_sec = 3 };
 static const size_t timeo_optlen = sizeof(timeo_sec);
 
index 86914e6e7b535ef71b1465175fc10f0adcc44581..6a8009605670ca84100236a66777127b1084a79a 100644 (file)
@@ -34,6 +34,7 @@ struct ipv6_packet {
 extern struct ipv6_packet pkt_v6;
 
 int start_server(int family, int type);
+int start_server_with_port(int family, int type, __u16 port);
 int connect_to_fd(int family, int type, int server_fd);
 int connect_fd_to_fd(int client_fd, int server_fd);
 int connect_wait(int client_fd);
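
A hypothetical use of the new helper from a selftest (the port number is made up for illustration):

#include <sys/socket.h>
#include "network_helpers.h"

static int open_fixed_port_server(void)
{
	/* bind to a fixed port instead of the ephemeral one start_server() picks */
	return start_server_with_port(AF_INET6, SOCK_STREAM, 60123);
}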
diff --git a/tools/testing/selftests/bpf/prog_tests/align.c b/tools/testing/selftests/bpf/prog_tests/align.c
new file mode 100644 (file)
index 0000000..c548ade
--- /dev/null
@@ -0,0 +1,666 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <test_progs.h>
+
+#define MAX_INSNS      512
+#define MAX_MATCHES    16
+
+struct bpf_reg_match {
+       unsigned int line;
+       const char *match;
+};
+
+struct bpf_align_test {
+       const char *descr;
+       struct bpf_insn insns[MAX_INSNS];
+       enum {
+               UNDEF,
+               ACCEPT,
+               REJECT
+       } result;
+       enum bpf_prog_type prog_type;
+       /* Matches must be in order of increasing line */
+       struct bpf_reg_match matches[MAX_MATCHES];
+};
+
+static struct bpf_align_test tests[] = {
+       /* Four tests of known constants.  These aren't staggeringly
+        * interesting since we track exact values now.
+        */
+       {
+               .descr = "mov",
+               .insns = {
+                       BPF_MOV64_IMM(BPF_REG_3, 2),
+                       BPF_MOV64_IMM(BPF_REG_3, 4),
+                       BPF_MOV64_IMM(BPF_REG_3, 8),
+                       BPF_MOV64_IMM(BPF_REG_3, 16),
+                       BPF_MOV64_IMM(BPF_REG_3, 32),
+                       BPF_MOV64_IMM(BPF_REG_0, 0),
+                       BPF_EXIT_INSN(),
+               },
+               .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+               .matches = {
+                       {1, "R1=ctx(id=0,off=0,imm=0)"},
+                       {1, "R10=fp0"},
+                       {1, "R3_w=inv2"},
+                       {2, "R3_w=inv4"},
+                       {3, "R3_w=inv8"},
+                       {4, "R3_w=inv16"},
+                       {5, "R3_w=inv32"},
+               },
+       },
+       {
+               .descr = "shift",
+               .insns = {
+                       BPF_MOV64_IMM(BPF_REG_3, 1),
+                       BPF_ALU64_IMM(BPF_LSH, BPF_REG_3, 1),
+                       BPF_ALU64_IMM(BPF_LSH, BPF_REG_3, 1),
+                       BPF_ALU64_IMM(BPF_LSH, BPF_REG_3, 1),
+                       BPF_ALU64_IMM(BPF_LSH, BPF_REG_3, 1),
+                       BPF_ALU64_IMM(BPF_RSH, BPF_REG_3, 4),
+                       BPF_MOV64_IMM(BPF_REG_4, 32),
+                       BPF_ALU64_IMM(BPF_RSH, BPF_REG_4, 1),
+                       BPF_ALU64_IMM(BPF_RSH, BPF_REG_4, 1),
+                       BPF_ALU64_IMM(BPF_RSH, BPF_REG_4, 1),
+                       BPF_ALU64_IMM(BPF_RSH, BPF_REG_4, 1),
+                       BPF_MOV64_IMM(BPF_REG_0, 0),
+                       BPF_EXIT_INSN(),
+               },
+               .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+               .matches = {
+                       {1, "R1=ctx(id=0,off=0,imm=0)"},
+                       {1, "R10=fp0"},
+                       {1, "R3_w=inv1"},
+                       {2, "R3_w=inv2"},
+                       {3, "R3_w=inv4"},
+                       {4, "R3_w=inv8"},
+                       {5, "R3_w=inv16"},
+                       {6, "R3_w=inv1"},
+                       {7, "R4_w=inv32"},
+                       {8, "R4_w=inv16"},
+                       {9, "R4_w=inv8"},
+                       {10, "R4_w=inv4"},
+                       {11, "R4_w=inv2"},
+               },
+       },
+       {
+               .descr = "addsub",
+               .insns = {
+                       BPF_MOV64_IMM(BPF_REG_3, 4),
+                       BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, 4),
+                       BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, 2),
+                       BPF_MOV64_IMM(BPF_REG_4, 8),
+                       BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 4),
+                       BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 2),
+                       BPF_MOV64_IMM(BPF_REG_0, 0),
+                       BPF_EXIT_INSN(),
+               },
+               .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+               .matches = {
+                       {1, "R1=ctx(id=0,off=0,imm=0)"},
+                       {1, "R10=fp0"},
+                       {1, "R3_w=inv4"},
+                       {2, "R3_w=inv8"},
+                       {3, "R3_w=inv10"},
+                       {4, "R4_w=inv8"},
+                       {5, "R4_w=inv12"},
+                       {6, "R4_w=inv14"},
+               },
+       },
+       {
+               .descr = "mul",
+               .insns = {
+                       BPF_MOV64_IMM(BPF_REG_3, 7),
+                       BPF_ALU64_IMM(BPF_MUL, BPF_REG_3, 1),
+                       BPF_ALU64_IMM(BPF_MUL, BPF_REG_3, 2),
+                       BPF_ALU64_IMM(BPF_MUL, BPF_REG_3, 4),
+                       BPF_MOV64_IMM(BPF_REG_0, 0),
+                       BPF_EXIT_INSN(),
+               },
+               .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+               .matches = {
+                       {1, "R1=ctx(id=0,off=0,imm=0)"},
+                       {1, "R10=fp0"},
+                       {1, "R3_w=inv7"},
+                       {2, "R3_w=inv7"},
+                       {3, "R3_w=inv14"},
+                       {4, "R3_w=inv56"},
+               },
+       },
+
+       /* Tests using unknown values */
+#define PREP_PKT_POINTERS \
+       BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, \
+                   offsetof(struct __sk_buff, data)), \
+       BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, \
+                   offsetof(struct __sk_buff, data_end))
+
+#define LOAD_UNKNOWN(DST_REG) \
+       PREP_PKT_POINTERS, \
+       BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), \
+       BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8), \
+       BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_0, 1), \
+       BPF_EXIT_INSN(), \
+       BPF_LDX_MEM(BPF_B, DST_REG, BPF_REG_2, 0)
+
+       {
+               .descr = "unknown shift",
+               .insns = {
+                       LOAD_UNKNOWN(BPF_REG_3),
+                       BPF_ALU64_IMM(BPF_LSH, BPF_REG_3, 1),
+                       BPF_ALU64_IMM(BPF_LSH, BPF_REG_3, 1),
+                       BPF_ALU64_IMM(BPF_LSH, BPF_REG_3, 1),
+                       BPF_ALU64_IMM(BPF_LSH, BPF_REG_3, 1),
+                       LOAD_UNKNOWN(BPF_REG_4),
+                       BPF_ALU64_IMM(BPF_LSH, BPF_REG_4, 5),
+                       BPF_ALU64_IMM(BPF_RSH, BPF_REG_4, 1),
+                       BPF_ALU64_IMM(BPF_RSH, BPF_REG_4, 1),
+                       BPF_ALU64_IMM(BPF_RSH, BPF_REG_4, 1),
+                       BPF_ALU64_IMM(BPF_RSH, BPF_REG_4, 1),
+                       BPF_MOV64_IMM(BPF_REG_0, 0),
+                       BPF_EXIT_INSN(),
+               },
+               .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+               .matches = {
+                       {7, "R0_w=pkt(id=0,off=8,r=8,imm=0)"},
+                       {7, "R3_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"},
+                       {8, "R3_w=inv(id=0,umax_value=510,var_off=(0x0; 0x1fe))"},
+                       {9, "R3_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
+                       {10, "R3_w=inv(id=0,umax_value=2040,var_off=(0x0; 0x7f8))"},
+                       {11, "R3_w=inv(id=0,umax_value=4080,var_off=(0x0; 0xff0))"},
+                       {18, "R3=pkt_end(id=0,off=0,imm=0)"},
+                       {18, "R4_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"},
+                       {19, "R4_w=inv(id=0,umax_value=8160,var_off=(0x0; 0x1fe0))"},
+                       {20, "R4_w=inv(id=0,umax_value=4080,var_off=(0x0; 0xff0))"},
+                       {21, "R4_w=inv(id=0,umax_value=2040,var_off=(0x0; 0x7f8))"},
+                       {22, "R4_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
+                       {23, "R4_w=inv(id=0,umax_value=510,var_off=(0x0; 0x1fe))"},
+               },
+       },
+       {
+               .descr = "unknown mul",
+               .insns = {
+                       LOAD_UNKNOWN(BPF_REG_3),
+                       BPF_MOV64_REG(BPF_REG_4, BPF_REG_3),
+                       BPF_ALU64_IMM(BPF_MUL, BPF_REG_4, 1),
+                       BPF_MOV64_REG(BPF_REG_4, BPF_REG_3),
+                       BPF_ALU64_IMM(BPF_MUL, BPF_REG_4, 2),
+                       BPF_MOV64_REG(BPF_REG_4, BPF_REG_3),
+                       BPF_ALU64_IMM(BPF_MUL, BPF_REG_4, 4),
+                       BPF_MOV64_REG(BPF_REG_4, BPF_REG_3),
+                       BPF_ALU64_IMM(BPF_MUL, BPF_REG_4, 8),
+                       BPF_ALU64_IMM(BPF_MUL, BPF_REG_4, 2),
+                       BPF_MOV64_IMM(BPF_REG_0, 0),
+                       BPF_EXIT_INSN(),
+               },
+               .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+               .matches = {
+                       {7, "R3_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"},
+                       {8, "R4_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"},
+                       {9, "R4_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"},
+                       {10, "R4_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"},
+                       {11, "R4_w=inv(id=0,umax_value=510,var_off=(0x0; 0x1fe))"},
+                       {12, "R4_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"},
+                       {13, "R4_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
+                       {14, "R4_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"},
+                       {15, "R4_w=inv(id=0,umax_value=2040,var_off=(0x0; 0x7f8))"},
+                       {16, "R4_w=inv(id=0,umax_value=4080,var_off=(0x0; 0xff0))"},
+               },
+       },
+       {
+               .descr = "packet const offset",
+               .insns = {
+                       PREP_PKT_POINTERS,
+                       BPF_MOV64_REG(BPF_REG_5, BPF_REG_2),
+
+                       BPF_MOV64_IMM(BPF_REG_0, 0),
+
+                       /* Skip over ethernet header.  */
+                       BPF_ALU64_IMM(BPF_ADD, BPF_REG_5, 14),
+                       BPF_MOV64_REG(BPF_REG_4, BPF_REG_5),
+                       BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 4),
+                       BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_4, 1),
+                       BPF_EXIT_INSN(),
+
+                       BPF_LDX_MEM(BPF_B, BPF_REG_4, BPF_REG_5, 0),
+                       BPF_LDX_MEM(BPF_B, BPF_REG_4, BPF_REG_5, 1),
+                       BPF_LDX_MEM(BPF_B, BPF_REG_4, BPF_REG_5, 2),
+                       BPF_LDX_MEM(BPF_B, BPF_REG_4, BPF_REG_5, 3),
+                       BPF_LDX_MEM(BPF_H, BPF_REG_4, BPF_REG_5, 0),
+                       BPF_LDX_MEM(BPF_H, BPF_REG_4, BPF_REG_5, 2),
+                       BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_5, 0),
+
+                       BPF_MOV64_IMM(BPF_REG_0, 0),
+                       BPF_EXIT_INSN(),
+               },
+               .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+               .matches = {
+                       {4, "R5_w=pkt(id=0,off=0,r=0,imm=0)"},
+                       {5, "R5_w=pkt(id=0,off=14,r=0,imm=0)"},
+                       {6, "R4_w=pkt(id=0,off=14,r=0,imm=0)"},
+                       {10, "R2=pkt(id=0,off=0,r=18,imm=0)"},
+                       {10, "R5=pkt(id=0,off=14,r=18,imm=0)"},
+                       {10, "R4_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"},
+                       {14, "R4_w=inv(id=0,umax_value=65535,var_off=(0x0; 0xffff))"},
+                       {15, "R4_w=inv(id=0,umax_value=65535,var_off=(0x0; 0xffff))"},
+               },
+       },
+       {
+               .descr = "packet variable offset",
+               .insns = {
+                       LOAD_UNKNOWN(BPF_REG_6),
+                       BPF_ALU64_IMM(BPF_LSH, BPF_REG_6, 2),
+
+                       /* First, add a constant to the R5 packet pointer,
+                        * then a variable with a known alignment.
+                        */
+                       BPF_MOV64_REG(BPF_REG_5, BPF_REG_2),
+                       BPF_ALU64_IMM(BPF_ADD, BPF_REG_5, 14),
+                       BPF_ALU64_REG(BPF_ADD, BPF_REG_5, BPF_REG_6),
+                       BPF_MOV64_REG(BPF_REG_4, BPF_REG_5),
+                       BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 4),
+                       BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_4, 1),
+                       BPF_EXIT_INSN(),
+                       BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_5, 0),
+
+                       /* Now, test in the other direction.  Adding first
+                        * the variable offset to R5, then the constant.
+                        */
+                       BPF_MOV64_REG(BPF_REG_5, BPF_REG_2),
+                       BPF_ALU64_REG(BPF_ADD, BPF_REG_5, BPF_REG_6),
+                       BPF_ALU64_IMM(BPF_ADD, BPF_REG_5, 14),
+                       BPF_MOV64_REG(BPF_REG_4, BPF_REG_5),
+                       BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 4),
+                       BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_4, 1),
+                       BPF_EXIT_INSN(),
+                       BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_5, 0),
+
+                       /* Test multiple accumulations of unknown values
+                        * into a packet pointer.
+                        */
+                       BPF_MOV64_REG(BPF_REG_5, BPF_REG_2),
+                       BPF_ALU64_IMM(BPF_ADD, BPF_REG_5, 14),
+                       BPF_ALU64_REG(BPF_ADD, BPF_REG_5, BPF_REG_6),
+                       BPF_ALU64_IMM(BPF_ADD, BPF_REG_5, 4),
+                       BPF_ALU64_REG(BPF_ADD, BPF_REG_5, BPF_REG_6),
+                       BPF_MOV64_REG(BPF_REG_4, BPF_REG_5),
+                       BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 4),
+                       BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_4, 1),
+                       BPF_EXIT_INSN(),
+                       BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_5, 0),
+
+                       BPF_MOV64_IMM(BPF_REG_0, 0),
+                       BPF_EXIT_INSN(),
+               },
+               .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+               .matches = {
+                       /* Calculated offset in R6 has unknown value, but known
+                        * alignment of 4.
+                        */
+                       {8, "R2_w=pkt(id=0,off=0,r=8,imm=0)"},
+                       {8, "R6_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
+                       /* Offset is added to packet pointer R5, resulting in
+                        * known fixed offset, and variable offset from R6.
+                        */
+                       {11, "R5_w=pkt(id=1,off=14,r=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
+                       /* At the time the word size load is performed from R5,
+                        * its total offset is NET_IP_ALIGN + reg->off (0) +
+                        * reg->aux_off (14) which is 16.  Then the variable
+                        * offset is considered using reg->aux_off_align which
+                        * is 4 and meets the load's requirements.
+                        */
+                       {15, "R4=pkt(id=1,off=18,r=18,umax_value=1020,var_off=(0x0; 0x3fc))"},
+                       {15, "R5=pkt(id=1,off=14,r=18,umax_value=1020,var_off=(0x0; 0x3fc))"},
+                       /* Variable offset is added to R5 packet pointer,
+                        * resulting in auxiliary alignment of 4.
+                        */
+                       {18, "R5_w=pkt(id=2,off=0,r=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
+                       /* Constant offset is added to R5, resulting in
+                        * reg->off of 14.
+                        */
+                       {19, "R5_w=pkt(id=2,off=14,r=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
+                       /* At the time the word size load is performed from R5,
+                        * its total fixed offset is NET_IP_ALIGN + reg->off
+                        * (14) which is 16.  Then the variable offset is 4-byte
+                        * aligned, so the total offset is 4-byte aligned and
+                        * meets the load's requirements.
+                        */
+                       {23, "R4=pkt(id=2,off=18,r=18,umax_value=1020,var_off=(0x0; 0x3fc))"},
+                       {23, "R5=pkt(id=2,off=14,r=18,umax_value=1020,var_off=(0x0; 0x3fc))"},
+                       /* Constant offset is added to R5 packet pointer,
+                        * resulting in reg->off value of 14.
+                        */
+                       {26, "R5_w=pkt(id=0,off=14,r=8"},
+                       /* Variable offset is added to R5, resulting in a
+                        * variable offset of (4n).
+                        */
+                       {27, "R5_w=pkt(id=3,off=14,r=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
+                       /* Constant is added to R5 again, setting reg->off to 18. */
+                       {28, "R5_w=pkt(id=3,off=18,r=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
+                       /* And once more we add a variable; resulting var_off
+                        * is still (4n), fixed offset is not changed.
+                        * Also, we create a new reg->id.
+                        */
+                       {29, "R5_w=pkt(id=4,off=18,r=0,umax_value=2040,var_off=(0x0; 0x7fc)"},
+                       /* At the time the word size load is performed from R5,
+                        * its total fixed offset is NET_IP_ALIGN + reg->off (18)
+                        * which is 20.  Then the variable offset is (4n), so
+                        * the total offset is 4-byte aligned and meets the
+                        * load's requirements.
+                        */
+                       {33, "R4=pkt(id=4,off=22,r=22,umax_value=2040,var_off=(0x0; 0x7fc)"},
+                       {33, "R5=pkt(id=4,off=18,r=22,umax_value=2040,var_off=(0x0; 0x7fc)"},
+               },
+       },
+       {
+               .descr = "packet variable offset 2",
+               .insns = {
+                       /* Create an unknown offset, (4n+2)-aligned */
+                       LOAD_UNKNOWN(BPF_REG_6),
+                       BPF_ALU64_IMM(BPF_LSH, BPF_REG_6, 2),
+                       BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, 14),
+                       /* Add it to the packet pointer */
+                       BPF_MOV64_REG(BPF_REG_5, BPF_REG_2),
+                       BPF_ALU64_REG(BPF_ADD, BPF_REG_5, BPF_REG_6),
+                       /* Check bounds and perform a read */
+                       BPF_MOV64_REG(BPF_REG_4, BPF_REG_5),
+                       BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 4),
+                       BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_4, 1),
+                       BPF_EXIT_INSN(),
+                       BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_5, 0),
+                       /* Make a (4n) offset from the value we just read */
+                       BPF_ALU64_IMM(BPF_AND, BPF_REG_6, 0xff),
+                       BPF_ALU64_IMM(BPF_LSH, BPF_REG_6, 2),
+                       /* Add it to the packet pointer */
+                       BPF_ALU64_REG(BPF_ADD, BPF_REG_5, BPF_REG_6),
+                       /* Check bounds and perform a read */
+                       BPF_MOV64_REG(BPF_REG_4, BPF_REG_5),
+                       BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 4),
+                       BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_4, 1),
+                       BPF_EXIT_INSN(),
+                       BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_5, 0),
+                       BPF_MOV64_IMM(BPF_REG_0, 0),
+                       BPF_EXIT_INSN(),
+               },
+               .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+               .matches = {
+                       /* Calculated offset in R6 has unknown value, but known
+                        * alignment of 4.
+                        */
+                       {8, "R2_w=pkt(id=0,off=0,r=8,imm=0)"},
+                       {8, "R6_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
+                       /* Adding 14 makes R6 be (4n+2) */
+                       {9, "R6_w=inv(id=0,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc))"},
+                       /* Packet pointer has (4n+2) offset */
+                       {11, "R5_w=pkt(id=1,off=0,r=0,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc)"},
+                       {13, "R4=pkt(id=1,off=4,r=0,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc)"},
+                       /* At the time the word size load is performed from R5,
+                        * its total fixed offset is NET_IP_ALIGN + reg->off (0)
+                        * which is 2.  Then the variable offset is (4n+2), so
+                        * the total offset is 4-byte aligned and meets the
+                        * load's requirements.
+                        */
+                       {15, "R5=pkt(id=1,off=0,r=4,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc)"},
+                       /* Newly read value in R6 was shifted left by 2, so has
+                        * known alignment of 4.
+                        */
+                       {18, "R6_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
+                       /* Added (4n) to packet pointer's (4n+2) var_off, giving
+                        * another (4n+2).
+                        */
+                       {19, "R5_w=pkt(id=2,off=0,r=0,umin_value=14,umax_value=2054,var_off=(0x2; 0xffc)"},
+                       {21, "R4=pkt(id=2,off=4,r=0,umin_value=14,umax_value=2054,var_off=(0x2; 0xffc)"},
+                       /* At the time the word size load is performed from R5,
+                        * its total fixed offset is NET_IP_ALIGN + reg->off (0)
+                        * which is 2.  Then the variable offset is (4n+2), so
+                        * the total offset is 4-byte aligned and meets the
+                        * load's requirements.
+                        */
+                       {23, "R5=pkt(id=2,off=0,r=4,umin_value=14,umax_value=2054,var_off=(0x2; 0xffc)"},
+               },
+       },
+       {
+               .descr = "dubious pointer arithmetic",
+               .insns = {
+                       PREP_PKT_POINTERS,
+                       BPF_MOV64_IMM(BPF_REG_0, 0),
+                       /* (ptr - ptr) << 2 */
+                       BPF_MOV64_REG(BPF_REG_5, BPF_REG_3),
+                       BPF_ALU64_REG(BPF_SUB, BPF_REG_5, BPF_REG_2),
+                       BPF_ALU64_IMM(BPF_LSH, BPF_REG_5, 2),
+                       /* We have a (4n) value.  Let's make a packet offset
+                        * out of it.  First add 14, to make it a (4n+2)
+                        */
+                       BPF_ALU64_IMM(BPF_ADD, BPF_REG_5, 14),
+                       /* Then make sure it's nonnegative */
+                       BPF_JMP_IMM(BPF_JSGE, BPF_REG_5, 0, 1),
+                       BPF_EXIT_INSN(),
+                       /* Add it to packet pointer */
+                       BPF_MOV64_REG(BPF_REG_6, BPF_REG_2),
+                       BPF_ALU64_REG(BPF_ADD, BPF_REG_6, BPF_REG_5),
+                       /* Check bounds and perform a read */
+                       BPF_MOV64_REG(BPF_REG_4, BPF_REG_6),
+                       BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 4),
+                       BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_4, 1),
+                       BPF_EXIT_INSN(),
+                       BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_6, 0),
+                       BPF_EXIT_INSN(),
+               },
+               .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+               .result = REJECT,
+               .matches = {
+                       {4, "R5_w=pkt_end(id=0,off=0,imm=0)"},
+                       /* (ptr - ptr) << 2 == unknown, (4n) */
+                       {6, "R5_w=inv(id=0,smax_value=9223372036854775804,umax_value=18446744073709551612,var_off=(0x0; 0xfffffffffffffffc)"},
+                       /* (4n) + 14 == (4n+2).  We blow our bounds, because
+                        * the add could overflow.
+                        */
+                       {7, "R5_w=inv(id=0,smin_value=-9223372036854775806,smax_value=9223372036854775806,umin_value=2,umax_value=18446744073709551614,var_off=(0x2; 0xfffffffffffffffc)"},
+                       /* Checked s>=0 */
+                       {9, "R5=inv(id=0,umin_value=2,umax_value=9223372034707292158,var_off=(0x2; 0x7fffffff7ffffffc)"},
+                       /* packet pointer + nonnegative (4n+2) */
+                       {11, "R6_w=pkt(id=1,off=0,r=0,umin_value=2,umax_value=9223372034707292158,var_off=(0x2; 0x7fffffff7ffffffc)"},
+                       {13, "R4_w=pkt(id=1,off=4,r=0,umin_value=2,umax_value=9223372034707292158,var_off=(0x2; 0x7fffffff7ffffffc)"},
+                       /* NET_IP_ALIGN + (4n+2) == (4n), alignment is fine.
+                        * We checked the bounds, but it might have been able
+                        * to overflow if the packet pointer started in the
+                        * upper half of the address space.
+                        * So we did not get a 'range' on R6, and the access
+                        * attempt will fail.
+                        */
+                       {15, "R6_w=pkt(id=1,off=0,r=0,umin_value=2,umax_value=9223372034707292158,var_off=(0x2; 0x7fffffff7ffffffc)"},
+               }
+       },
+       {
+               .descr = "variable subtraction",
+               .insns = {
+                       /* Create an unknown offset, (4n+2)-aligned */
+                       LOAD_UNKNOWN(BPF_REG_6),
+                       BPF_MOV64_REG(BPF_REG_7, BPF_REG_6),
+                       BPF_ALU64_IMM(BPF_LSH, BPF_REG_6, 2),
+                       BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, 14),
+                       /* Create another unknown, (4n)-aligned, and subtract
+                        * it from the first one
+                        */
+                       BPF_ALU64_IMM(BPF_LSH, BPF_REG_7, 2),
+                       BPF_ALU64_REG(BPF_SUB, BPF_REG_6, BPF_REG_7),
+                       /* Bounds-check the result */
+                       BPF_JMP_IMM(BPF_JSGE, BPF_REG_6, 0, 1),
+                       BPF_EXIT_INSN(),
+                       /* Add it to the packet pointer */
+                       BPF_MOV64_REG(BPF_REG_5, BPF_REG_2),
+                       BPF_ALU64_REG(BPF_ADD, BPF_REG_5, BPF_REG_6),
+                       /* Check bounds and perform a read */
+                       BPF_MOV64_REG(BPF_REG_4, BPF_REG_5),
+                       BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 4),
+                       BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_4, 1),
+                       BPF_EXIT_INSN(),
+                       BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_5, 0),
+                       BPF_EXIT_INSN(),
+               },
+               .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+               .matches = {
+                       /* Calculated offset in R6 has unknown value, but known
+                        * alignment of 4.
+                        */
+                       {7, "R2_w=pkt(id=0,off=0,r=8,imm=0)"},
+                       {9, "R6_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
+                       /* Adding 14 makes R6 be (4n+2) */
+                       {10, "R6_w=inv(id=0,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc))"},
+                       /* New unknown value in R7 is (4n) */
+                       {11, "R7_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
+                       /* Subtracting it from R6 blows our unsigned bounds */
+                       {12, "R6=inv(id=0,smin_value=-1006,smax_value=1034,umin_value=2,umax_value=18446744073709551614,var_off=(0x2; 0xfffffffffffffffc)"},
+                       /* Checked s>= 0 */
+                       {14, "R6=inv(id=0,umin_value=2,umax_value=1034,var_off=(0x2; 0x7fc))"},
+                       /* At the time the word size load is performed from R5,
+                        * its total fixed offset is NET_IP_ALIGN + reg->off (0)
+                        * which is 2.  Then the variable offset is (4n+2), so
+                        * the total offset is 4-byte aligned and meets the
+                        * load's requirements.
+                        */
+                       {20, "R5=pkt(id=1,off=0,r=4,umin_value=2,umax_value=1034,var_off=(0x2; 0x7fc)"},
+
+               },
+       },
+       {
+               .descr = "pointer variable subtraction",
+               .insns = {
+                       /* Create an unknown offset, (4n+2)-aligned and bounded
+                        * to [14,74]
+                        */
+                       LOAD_UNKNOWN(BPF_REG_6),
+                       BPF_MOV64_REG(BPF_REG_7, BPF_REG_6),
+                       BPF_ALU64_IMM(BPF_AND, BPF_REG_6, 0xf),
+                       BPF_ALU64_IMM(BPF_LSH, BPF_REG_6, 2),
+                       BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, 14),
+                       /* Subtract it from the packet pointer */
+                       BPF_MOV64_REG(BPF_REG_5, BPF_REG_2),
+                       BPF_ALU64_REG(BPF_SUB, BPF_REG_5, BPF_REG_6),
+                       /* Create another unknown, (4n)-aligned and >= 74.
+                        * That in fact means >= 76, since 74 % 4 == 2
+                        */
+                       BPF_ALU64_IMM(BPF_LSH, BPF_REG_7, 2),
+                       BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, 76),
+                       /* Add it to the packet pointer */
+                       BPF_ALU64_REG(BPF_ADD, BPF_REG_5, BPF_REG_7),
+                       /* Check bounds and perform a read */
+                       BPF_MOV64_REG(BPF_REG_4, BPF_REG_5),
+                       BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 4),
+                       BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_4, 1),
+                       BPF_EXIT_INSN(),
+                       BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_5, 0),
+                       BPF_EXIT_INSN(),
+               },
+               .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+               .matches = {
+                       /* Calculated offset in R6 has unknown value, but known
+                        * alignment of 4.
+                        */
+                       {7, "R2_w=pkt(id=0,off=0,r=8,imm=0)"},
+                       {10, "R6_w=inv(id=0,umax_value=60,var_off=(0x0; 0x3c))"},
+                       /* Adding 14 makes R6 be (4n+2) */
+                       {11, "R6_w=inv(id=0,umin_value=14,umax_value=74,var_off=(0x2; 0x7c))"},
+                       /* Subtracting from packet pointer overflows ubounds */
+                       {13, "R5_w=pkt(id=1,off=0,r=8,umin_value=18446744073709551542,umax_value=18446744073709551602,var_off=(0xffffffffffffff82; 0x7c)"},
+                       /* New unknown value in R7 is (4n), >= 76 */
+                       {15, "R7_w=inv(id=0,umin_value=76,umax_value=1096,var_off=(0x0; 0x7fc))"},
+                       /* Adding it to packet pointer gives nice bounds again */
+                       {16, "R5_w=pkt(id=2,off=0,r=0,umin_value=2,umax_value=1082,var_off=(0x2; 0xfffffffc)"},
+                       /* At the time the word size load is performed from R5,
+                        * its total fixed offset is NET_IP_ALIGN + reg->off (0)
+                        * which is 2.  Then the variable offset is (4n+2), so
+                        * the total offset is 4-byte aligned and meets the
+                        * load's requirements.
+                        */
+                       {20, "R5=pkt(id=2,off=0,r=4,umin_value=2,umax_value=1082,var_off=(0x2; 0xfffffffc)"},
+               },
+       },
+};
+
+static int probe_filter_length(const struct bpf_insn *fp)
+{
+       int len;
+
+       for (len = MAX_INSNS - 1; len > 0; --len)
+               if (fp[len].code != 0 || fp[len].imm != 0)
+                       break;
+       return len + 1;
+}
+
+static char bpf_vlog[32768];
+
+static int do_test_single(struct bpf_align_test *test)
+{
+       struct bpf_insn *prog = test->insns;
+       int prog_type = test->prog_type;
+       char bpf_vlog_copy[32768];
+       const char *line_ptr;
+       int cur_line = -1;
+       int prog_len, i;
+       int fd_prog;
+       int ret;
+
+       prog_len = probe_filter_length(prog);
+       fd_prog = bpf_verify_program(prog_type ? : BPF_PROG_TYPE_SOCKET_FILTER,
+                                    prog, prog_len, BPF_F_STRICT_ALIGNMENT,
+                                    "GPL", 0, bpf_vlog, sizeof(bpf_vlog), 2);
+       if (fd_prog < 0 && test->result != REJECT) {
+               printf("Failed to load program.\n");
+               printf("%s", bpf_vlog);
+               ret = 1;
+       } else if (fd_prog >= 0 && test->result == REJECT) {
+               printf("Unexpected success to load!\n");
+               printf("%s", bpf_vlog);
+               ret = 1;
+               close(fd_prog);
+       } else {
+               ret = 0;
+               /* We make a local copy so that we can strtok() it */
+               strncpy(bpf_vlog_copy, bpf_vlog, sizeof(bpf_vlog_copy));
+               line_ptr = strtok(bpf_vlog_copy, "\n");
+               for (i = 0; i < MAX_MATCHES; i++) {
+                       struct bpf_reg_match m = test->matches[i];
+
+                       if (!m.match)
+                               break;
+                       while (line_ptr) {
+                               cur_line = -1;
+                               sscanf(line_ptr, "%u: ", &cur_line);
+                               if (cur_line == m.line)
+                                       break;
+                               line_ptr = strtok(NULL, "\n");
+                       }
+                       if (!line_ptr) {
+                               printf("Failed to find line %u for match: %s\n",
+                                      m.line, m.match);
+                               ret = 1;
+                               printf("%s", bpf_vlog);
+                               break;
+                       }
+                       if (!strstr(line_ptr, m.match)) {
+                               printf("Failed to find match %u: %s\n",
+                                      m.line, m.match);
+                               ret = 1;
+                               printf("%s", bpf_vlog);
+                               break;
+                       }
+               }
+               if (fd_prog >= 0)
+                       close(fd_prog);
+       }
+       return ret;
+}
+
+void test_align(void)
+{
+       unsigned int i;
+
+       for (i = 0; i < ARRAY_SIZE(tests); i++) {
+               struct bpf_align_test *test = &tests[i];
+
+               if (!test__start_subtest(test->descr))
+                       continue;
+
+               CHECK_FAIL(do_test_single(test));
+       }
+}
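
For reference, each entry in tests[] above pairs an instruction sequence with prefix matches against the verifier log (compared with strstr() in do_test_single()), and every entry now runs as its own test_progs subtest via test__start_subtest(). A minimal illustrative entry in the same style might look like the following; it is only a sketch mirroring the existing "mov"/"shift" cases, not part of the patch:

	{
		.descr = "example constant shift",
		.insns = {
			BPF_MOV64_IMM(BPF_REG_3, 4),
			BPF_ALU64_IMM(BPF_LSH, BPF_REG_3, 1),
			BPF_MOV64_IMM(BPF_REG_0, 0),
			BPF_EXIT_INSN(),
		},
		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
		.matches = {
			/* each string is a prefix of the expected verifier state line */
			{1, "R3_w=inv4"},
			{2, "R3_w=inv8"},
		},
	},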
index 47fbb20cb6a6651027413c573c6793c14cade0de..17bbf76812ca5887951cd865cb79dcb5d8712ca7 100644 (file)
@@ -4,7 +4,8 @@
 #include "cgroup_helpers.h"
 #include "network_helpers.h"
 
-static int verify_port(int family, int fd, int expected)
+static int verify_ports(int family, int fd,
+                       __u16 expected_local, __u16 expected_peer)
 {
        struct sockaddr_storage addr;
        socklen_t len = sizeof(addr);
@@ -20,9 +21,25 @@ static int verify_port(int family, int fd, int expected)
        else
                port = ((struct sockaddr_in6 *)&addr)->sin6_port;
 
-       if (ntohs(port) != expected) {
-               log_err("Unexpected port %d, expected %d", ntohs(port),
-                       expected);
+       if (ntohs(port) != expected_local) {
+               log_err("Unexpected local port %d, expected %d", ntohs(port),
+                       expected_local);
+               return -1;
+       }
+
+       if (getpeername(fd, (struct sockaddr *)&addr, &len)) {
+               log_err("Failed to get peer addr");
+               return -1;
+       }
+
+       if (family == AF_INET)
+               port = ((struct sockaddr_in *)&addr)->sin_port;
+       else
+               port = ((struct sockaddr_in6 *)&addr)->sin6_port;
+
+       if (ntohs(port) != expected_peer) {
+               log_err("Unexpected peer port %d, expected %d", ntohs(port),
+                       expected_peer);
                return -1;
        }
 
@@ -31,33 +48,67 @@ static int verify_port(int family, int fd, int expected)
 
 static int run_test(int cgroup_fd, int server_fd, int family, int type)
 {
+       bool v4 = family == AF_INET;
+       __u16 expected_local_port = v4 ? 22222 : 22223;
+       __u16 expected_peer_port = 60000;
        struct bpf_prog_load_attr attr = {
-               .prog_type = BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
+               .file = v4 ? "./connect_force_port4.o" :
+                            "./connect_force_port6.o",
        };
+       struct bpf_program *prog;
        struct bpf_object *obj;
-       int expected_port;
-       int prog_fd;
-       int err;
-       int fd;
-
-       if (family == AF_INET) {
-               attr.file = "./connect_force_port4.o";
-               attr.expected_attach_type = BPF_CGROUP_INET4_CONNECT;
-               expected_port = 22222;
-       } else {
-               attr.file = "./connect_force_port6.o";
-               attr.expected_attach_type = BPF_CGROUP_INET6_CONNECT;
-               expected_port = 22223;
-       }
+       int xlate_fd, fd, err;
+       __u32 duration = 0;
 
-       err = bpf_prog_load_xattr(&attr, &obj, &prog_fd);
+       err = bpf_prog_load_xattr(&attr, &obj, &xlate_fd);
        if (err) {
                log_err("Failed to load BPF object");
                return -1;
        }
 
-       err = bpf_prog_attach(prog_fd, cgroup_fd, attr.expected_attach_type,
-                             0);
+       prog = bpf_object__find_program_by_title(obj, v4 ?
+                                                "cgroup/connect4" :
+                                                "cgroup/connect6");
+       if (CHECK(!prog, "find_prog", "connect prog not found\n")) {
+               err = -EIO;
+               goto close_bpf_object;
+       }
+
+       err = bpf_prog_attach(bpf_program__fd(prog), cgroup_fd, v4 ?
+                             BPF_CGROUP_INET4_CONNECT :
+                             BPF_CGROUP_INET6_CONNECT, 0);
+       if (err) {
+               log_err("Failed to attach BPF program");
+               goto close_bpf_object;
+       }
+
+       prog = bpf_object__find_program_by_title(obj, v4 ?
+                                                "cgroup/getpeername4" :
+                                                "cgroup/getpeername6");
+       if (CHECK(!prog, "find_prog", "getpeername prog not found\n")) {
+               err = -EIO;
+               goto close_bpf_object;
+       }
+
+       err = bpf_prog_attach(bpf_program__fd(prog), cgroup_fd, v4 ?
+                             BPF_CGROUP_INET4_GETPEERNAME :
+                             BPF_CGROUP_INET6_GETPEERNAME, 0);
+       if (err) {
+               log_err("Failed to attach BPF program");
+               goto close_bpf_object;
+       }
+
+       prog = bpf_object__find_program_by_title(obj, v4 ?
+                                                "cgroup/getsockname4" :
+                                                "cgroup/getsockname6");
+       if (CHECK(!prog, "find_prog", "getsockname prog not found\n")) {
+               err = -EIO;
+               goto close_bpf_object;
+       }
+
+       err = bpf_prog_attach(bpf_program__fd(prog), cgroup_fd, v4 ?
+                             BPF_CGROUP_INET4_GETSOCKNAME :
+                             BPF_CGROUP_INET6_GETSOCKNAME, 0);
        if (err) {
                log_err("Failed to attach BPF program");
                goto close_bpf_object;
@@ -69,8 +120,8 @@ static int run_test(int cgroup_fd, int server_fd, int family, int type)
                goto close_bpf_object;
        }
 
-       err = verify_port(family, fd, expected_port);
-
+       err = verify_ports(family, fd, expected_local_port,
+                          expected_peer_port);
        close(fd);
 
 close_bpf_object:
@@ -86,25 +137,25 @@ void test_connect_force_port(void)
        if (CHECK_FAIL(cgroup_fd < 0))
                return;
 
-       server_fd = start_server(AF_INET, SOCK_STREAM);
+       server_fd = start_server_with_port(AF_INET, SOCK_STREAM, 60123);
        if (CHECK_FAIL(server_fd < 0))
                goto close_cgroup_fd;
        CHECK_FAIL(run_test(cgroup_fd, server_fd, AF_INET, SOCK_STREAM));
        close(server_fd);
 
-       server_fd = start_server(AF_INET6, SOCK_STREAM);
+       server_fd = start_server_with_port(AF_INET6, SOCK_STREAM, 60124);
        if (CHECK_FAIL(server_fd < 0))
                goto close_cgroup_fd;
        CHECK_FAIL(run_test(cgroup_fd, server_fd, AF_INET6, SOCK_STREAM));
        close(server_fd);
 
-       server_fd = start_server(AF_INET, SOCK_DGRAM);
+       server_fd = start_server_with_port(AF_INET, SOCK_DGRAM, 60123);
        if (CHECK_FAIL(server_fd < 0))
                goto close_cgroup_fd;
        CHECK_FAIL(run_test(cgroup_fd, server_fd, AF_INET, SOCK_DGRAM));
        close(server_fd);
 
-       server_fd = start_server(AF_INET6, SOCK_DGRAM);
+       server_fd = start_server_with_port(AF_INET6, SOCK_DGRAM, 60124);
        if (CHECK_FAIL(server_fd < 0))
                goto close_cgroup_fd;
        CHECK_FAIL(run_test(cgroup_fd, server_fd, AF_INET6, SOCK_DGRAM));
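
From the client's point of view, the updated verify_ports() asserts two things per connection: getsockname() reports the port that connect4/connect6 forced via bpf_bind() (22222 or 22223), and getpeername() reports the advertised service port (60000) rather than the backend port the server actually listens on. A self-contained sketch of that check for the AF_INET case; check_ports() here is a hypothetical helper, not taken from the patch:

	#include <arpa/inet.h>
	#include <netinet/in.h>
	#include <stdio.h>
	#include <sys/socket.h>

	/* Sketch only: mirrors what verify_ports() checks for AF_INET. */
	static int check_ports(int fd, unsigned short want_local,
			       unsigned short want_peer)
	{
		struct sockaddr_in local, peer;
		socklen_t len = sizeof(local);

		if (getsockname(fd, (struct sockaddr *)&local, &len))
			return -1;
		len = sizeof(peer);
		if (getpeername(fd, (struct sockaddr *)&peer, &len))
			return -1;

		/* connect4 forced the local port; getpeername4 re-exposes 60000 */
		if (ntohs(local.sin_port) != want_local ||
		    ntohs(peer.sin_port) != want_peer) {
			fprintf(stderr, "got %u/%u, want %u/%u\n",
				ntohs(local.sin_port), ntohs(peer.sin_port),
				want_local, want_peer);
			return -1;
		}
		return 0;
	}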
index 4867cd3445c8dc99176739495f78c89538c836df..b57bd6fef2084b6d3ce795cd57ade14fe92608ce 100644 (file)
@@ -1,11 +1,27 @@
 // SPDX-License-Identifier: GPL-2.0
 /* Copyright (c) 2020 Facebook */
+/* "undefine" structs in vmlinux.h, because we "override" them below */
+#define bpf_iter_meta bpf_iter_meta___not_used
+#define bpf_iter__bpf_map bpf_iter__bpf_map___not_used
 #include "vmlinux.h"
+#undef bpf_iter_meta
+#undef bpf_iter__bpf_map
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_tracing.h>
 
 char _license[] SEC("license") = "GPL";
 
+struct bpf_iter_meta {
+       struct seq_file *seq;
+       __u64 session_id;
+       __u64 seq_num;
+} __attribute__((preserve_access_index));
+
+struct bpf_iter__bpf_map {
+       struct bpf_iter_meta *meta;
+       struct bpf_map *map;
+} __attribute__((preserve_access_index));
+
 SEC("iter/bpf_map")
 int dump_bpf_map(struct bpf_iter__bpf_map *ctx)
 {
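
The define/undef sequence above is a common selftest pattern: the names from vmlinux.h are aliased away so the program can carry its own minimal definitions of the iterator context, and __attribute__((preserve_access_index)) keeps every field access CO-RE-relocatable against the running kernel's BTF. The same pattern applied to a hypothetical iterator context would look like this (sketch only, not from the patch):

	/* hypothetical bpf_iter__foo context, following the pattern above */
	#define bpf_iter_meta bpf_iter_meta___not_used
	#define bpf_iter__foo bpf_iter__foo___not_used
	#include "vmlinux.h"
	#undef bpf_iter_meta
	#undef bpf_iter__foo
	#include <bpf/bpf_helpers.h>

	char _license[] SEC("license") = "GPL";

	struct bpf_iter_meta {
		struct seq_file *seq;
		__u64 session_id;
		__u64 seq_num;
	} __attribute__((preserve_access_index));

	struct bpf_iter__foo {
		struct bpf_iter_meta *meta;
		struct foo *foo;		/* hypothetical kernel object */
	} __attribute__((preserve_access_index));

	SEC("iter/foo")
	int dump_foo(struct bpf_iter__foo *ctx)
	{
		/* field offsets (meta, seq_num, foo) are relocated at load time */
		if (!ctx->foo)
			return 0;	/* NULL object signals end of iteration */
		return 0;
	}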
index ab9e2650e0210eb9e1c493b747bddf0345d7fd3b..c8e9ca74c87b5fef02776a98396bb456cce394c8 100644 (file)
@@ -1,9 +1,25 @@
 // SPDX-License-Identifier: GPL-2.0
 /* Copyright (c) 2020 Facebook */
+/* "undefine" structs in vmlinux.h, because we "override" them below */
+#define bpf_iter_meta bpf_iter_meta___not_used
+#define bpf_iter__ipv6_route bpf_iter__ipv6_route___not_used
 #include "vmlinux.h"
+#undef bpf_iter_meta
+#undef bpf_iter__ipv6_route
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_tracing.h>
 
+struct bpf_iter_meta {
+       struct seq_file *seq;
+       __u64 session_id;
+       __u64 seq_num;
+} __attribute__((preserve_access_index));
+
+struct bpf_iter__ipv6_route {
+       struct bpf_iter_meta *meta;
+       struct fib6_info *rt;
+} __attribute__((preserve_access_index));
+
 char _license[] SEC("license") = "GPL";
 
 extern bool CONFIG_IPV6_SUBTREES __kconfig __weak;
index 6b40a233d4e06b6b4de7702b1fe53908a3bbe720..e7b8753eac0b14e73bad2f95aa6c3b82f8efe645 100644 (file)
@@ -1,6 +1,11 @@
 // SPDX-License-Identifier: GPL-2.0
 /* Copyright (c) 2020 Facebook */
+/* "undefine" structs in vmlinux.h, because we "override" them below */
+#define bpf_iter_meta bpf_iter_meta___not_used
+#define bpf_iter__netlink bpf_iter__netlink___not_used
 #include "vmlinux.h"
+#undef bpf_iter_meta
+#undef bpf_iter__netlink
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_tracing.h>
 
@@ -9,6 +14,17 @@ char _license[] SEC("license") = "GPL";
 #define sk_rmem_alloc  sk_backlog.rmem_alloc
 #define sk_refcnt      __sk_common.skc_refcnt
 
+struct bpf_iter_meta {
+       struct seq_file *seq;
+       __u64 session_id;
+       __u64 seq_num;
+} __attribute__((preserve_access_index));
+
+struct bpf_iter__netlink {
+       struct bpf_iter_meta *meta;
+       struct netlink_sock *sk;
+} __attribute__((preserve_access_index));
+
 static inline struct inode *SOCK_INODE(struct socket *socket)
 {
        return &container_of(socket, struct socket_alloc, socket)->vfs_inode;
index 90f9011c57ca53811c79579983d159ba1d121087..ee754021f98edf777b8ac796c952df60845a1581 100644 (file)
@@ -1,11 +1,27 @@
 // SPDX-License-Identifier: GPL-2.0
 /* Copyright (c) 2020 Facebook */
+/* "undefine" structs in vmlinux.h, because we "override" them below */
+#define bpf_iter_meta bpf_iter_meta___not_used
+#define bpf_iter__task bpf_iter__task___not_used
 #include "vmlinux.h"
+#undef bpf_iter_meta
+#undef bpf_iter__task
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_tracing.h>
 
 char _license[] SEC("license") = "GPL";
 
+struct bpf_iter_meta {
+       struct seq_file *seq;
+       __u64 session_id;
+       __u64 seq_num;
+} __attribute__((preserve_access_index));
+
+struct bpf_iter__task {
+       struct bpf_iter_meta *meta;
+       struct task_struct *task;
+} __attribute__((preserve_access_index));
+
 SEC("iter/task")
 int dump_task(struct bpf_iter__task *ctx)
 {
index c6ced38f0880da691424a28c32783e857ccf2ca0..0f0ec3db20ba098330b710a120b960fd141d16bf 100644 (file)
@@ -1,11 +1,29 @@
 // SPDX-License-Identifier: GPL-2.0
 /* Copyright (c) 2020 Facebook */
+/* "undefine" structs in vmlinux.h, because we "override" them below */
+#define bpf_iter_meta bpf_iter_meta___not_used
+#define bpf_iter__task_file bpf_iter__task_file___not_used
 #include "vmlinux.h"
+#undef bpf_iter_meta
+#undef bpf_iter__task_file
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_tracing.h>
 
 char _license[] SEC("license") = "GPL";
 
+struct bpf_iter_meta {
+       struct seq_file *seq;
+       __u64 session_id;
+       __u64 seq_num;
+} __attribute__((preserve_access_index));
+
+struct bpf_iter__task_file {
+       struct bpf_iter_meta *meta;
+       struct task_struct *task;
+       __u32 fd;
+       struct file *file;
+} __attribute__((preserve_access_index));
+
 SEC("iter/task_file")
 int dump_task_file(struct bpf_iter__task_file *ctx)
 {
index 636a00fa074d0c458b6c85ed440bfaf0fbf26e54..13c2c90c835f9ece1d7b257aa7cdf4735741d11b 100644 (file)
@@ -1,10 +1,25 @@
 // SPDX-License-Identifier: GPL-2.0
 /* Copyright (c) 2020 Facebook */
+#define bpf_iter_meta bpf_iter_meta___not_used
+#define bpf_iter__task bpf_iter__task___not_used
 #include "vmlinux.h"
+#undef bpf_iter_meta
+#undef bpf_iter__task
 #include <bpf/bpf_helpers.h>
 
 char _license[] SEC("license") = "GPL";
 
+struct bpf_iter_meta {
+       struct seq_file *seq;
+       __u64 session_id;
+       __u64 seq_num;
+} __attribute__((preserve_access_index));
+
+struct bpf_iter__task {
+       struct bpf_iter_meta *meta;
+       struct task_struct *task;
+} __attribute__((preserve_access_index));
+
 SEC("iter/task")
 int dump_task(struct bpf_iter__task *ctx)
 {
index b18dc0471d0775357e2d61bcab94052878f48124..0aa71b333cf364edcfe6d6e5165d6d9fb0773a5d 100644 (file)
@@ -1,10 +1,25 @@
 // SPDX-License-Identifier: GPL-2.0
 /* Copyright (c) 2020 Facebook */
+#define bpf_iter_meta bpf_iter_meta___not_used
+#define bpf_iter__bpf_map bpf_iter__bpf_map___not_used
 #include "vmlinux.h"
+#undef bpf_iter_meta
+#undef bpf_iter__bpf_map
 #include <bpf/bpf_helpers.h>
 
 char _license[] SEC("license") = "GPL";
 
+struct bpf_iter_meta {
+       struct seq_file *seq;
+       __u64 session_id;
+       __u64 seq_num;
+} __attribute__((preserve_access_index));
+
+struct bpf_iter__bpf_map {
+       struct bpf_iter_meta *meta;
+       struct bpf_map *map;
+} __attribute__((preserve_access_index));
+
 __u32 map1_id = 0, map2_id = 0;
 __u32 map1_accessed = 0, map2_accessed = 0;
 __u64 map1_seqnum = 0, map2_seqnum1 = 0, map2_seqnum2 = 0;
index bdd51cf14b54d8d4855f505cac06f5b370394800..dee1339e690571521dfdd6b172b370701952f6b5 100644 (file)
@@ -1,11 +1,27 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 /* Copyright (c) 2020 Facebook */
+/* "undefine" structs in vmlinux.h, because we "override" them below */
+#define bpf_iter_meta bpf_iter_meta___not_used
+#define bpf_iter__task bpf_iter__task___not_used
 #include "vmlinux.h"
+#undef bpf_iter_meta
+#undef bpf_iter__task
 #include <bpf/bpf_helpers.h>
 
 char _license[] SEC("license") = "GPL";
 int count = 0;
 
+struct bpf_iter_meta {
+       struct seq_file *seq;
+       __u64 session_id;
+       __u64 seq_num;
+} __attribute__((preserve_access_index));
+
+struct bpf_iter__task {
+       struct bpf_iter_meta *meta;
+       struct task_struct *task;
+} __attribute__((preserve_access_index));
+
 SEC("iter/task")
 int dump_task(struct bpf_iter__task *ctx)
 {
index 1b8eb34b2db0c98f338799e745e5c997dcedff6f..7396308677a307b215e5a4e4cf8d1b8c50c21397 100644 (file)
@@ -1,5 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 #include <string.h>
+#include <stdbool.h>
 
 #include <linux/bpf.h>
 #include <linux/in.h>
 char _license[] SEC("license") = "GPL";
 int _version SEC("version") = 1;
 
+struct svc_addr {
+       __be32 addr;
+       __be16 port;
+};
+
+struct {
+       __uint(type, BPF_MAP_TYPE_SK_STORAGE);
+       __uint(map_flags, BPF_F_NO_PREALLOC);
+       __type(key, int);
+       __type(value, struct svc_addr);
+} service_mapping SEC(".maps");
+
 SEC("cgroup/connect4")
-int _connect4(struct bpf_sock_addr *ctx)
+int connect4(struct bpf_sock_addr *ctx)
 {
        struct sockaddr_in sa = {};
+       struct svc_addr *orig;
 
+       /* Force local address to 127.0.0.1:22222. */
        sa.sin_family = AF_INET;
        sa.sin_port = bpf_htons(22222);
-       sa.sin_addr.s_addr = bpf_htonl(0x7f000001); /* 127.0.0.1 */
+       sa.sin_addr.s_addr = bpf_htonl(0x7f000001);
 
        if (bpf_bind(ctx, (struct sockaddr *)&sa, sizeof(sa)) != 0)
                return 0;
 
+       /* Rewire service 1.2.3.4:60000 to backend 127.0.0.1:60123. */
+       if (ctx->user_port == bpf_htons(60000)) {
+               orig = bpf_sk_storage_get(&service_mapping, ctx->sk, 0,
+                                         BPF_SK_STORAGE_GET_F_CREATE);
+               if (!orig)
+                       return 0;
+
+               orig->addr = ctx->user_ip4;
+               orig->port = ctx->user_port;
+
+               ctx->user_ip4 = bpf_htonl(0x7f000001);
+               ctx->user_port = bpf_htons(60123);
+       }
+       return 1;
+}
+
+SEC("cgroup/getsockname4")
+int getsockname4(struct bpf_sock_addr *ctx)
+{
+       /* Expose local server as 1.2.3.4:60000 to client. */
+       if (ctx->user_port == bpf_htons(60123)) {
+               ctx->user_ip4 = bpf_htonl(0x01020304);
+               ctx->user_port = bpf_htons(60000);
+       }
+       return 1;
+}
+
+SEC("cgroup/getpeername4")
+int getpeername4(struct bpf_sock_addr *ctx)
+{
+       struct svc_addr *orig;
+
+       /* Expose service 1.2.3.4:60000 as peer instead of backend. */
+       if (ctx->user_port == bpf_htons(60123)) {
+               orig = bpf_sk_storage_get(&service_mapping, ctx->sk, 0, 0);
+               if (orig) {
+                       ctx->user_ip4 = orig->addr;
+                       ctx->user_port = orig->port;
+               }
+       }
        return 1;
 }
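
Taken together, the three hooks mean the application never observes the backend address: connect4 binds the socket to 127.0.0.1:22222, records the original 1.2.3.4:60000 destination in socket storage, and rewrites the connect target to 127.0.0.1:60123, while getsockname4/getpeername4 translate the addresses back on the way out. A self-contained sketch of the resulting client-side view, assuming the programs are attached to the client's cgroup and a server listens on 127.0.0.1:60123 (the helper name is illustrative):

	#include <arpa/inet.h>
	#include <netinet/in.h>
	#include <sys/socket.h>
	#include <unistd.h>

	/* Sketch only: connects to the "service" address handled by connect4. */
	static int connect_to_service(void)
	{
		struct sockaddr_in dst = {
			.sin_family = AF_INET,
			.sin_port = htons(60000),	/* service port seen by the app */
		};
		struct sockaddr_in peer;
		socklen_t len = sizeof(peer);
		int fd = socket(AF_INET, SOCK_STREAM, 0);

		if (fd < 0)
			return -1;
		inet_pton(AF_INET, "1.2.3.4", &dst.sin_addr);

		/* connect4 transparently rewires this to 127.0.0.1:60123 */
		if (connect(fd, (struct sockaddr *)&dst, sizeof(dst))) {
			close(fd);
			return -1;
		}

		/* getpeername4 still reports 1.2.3.4:60000, not the backend port */
		getpeername(fd, (struct sockaddr *)&peer, &len);

		return fd;
	}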
index ae6f7d750b4cfc17549c005656de5aee6183d51f..c1a2b555e9ad1f9e8a4ac62fe5798cb42ad2890e 100644 (file)
 char _license[] SEC("license") = "GPL";
 int _version SEC("version") = 1;
 
+struct svc_addr {
+       __be32 addr[4];
+       __be16 port;
+};
+
+struct {
+       __uint(type, BPF_MAP_TYPE_SK_STORAGE);
+       __uint(map_flags, BPF_F_NO_PREALLOC);
+       __type(key, int);
+       __type(value, struct svc_addr);
+} service_mapping SEC(".maps");
+
 SEC("cgroup/connect6")
-int _connect6(struct bpf_sock_addr *ctx)
+int connect6(struct bpf_sock_addr *ctx)
 {
        struct sockaddr_in6 sa = {};
+       struct svc_addr *orig;
 
+       /* Force local address to [::1]:22223. */
        sa.sin6_family = AF_INET6;
        sa.sin6_port = bpf_htons(22223);
-       sa.sin6_addr.s6_addr32[3] = bpf_htonl(1); /* ::1 */
+       sa.sin6_addr.s6_addr32[3] = bpf_htonl(1);
 
        if (bpf_bind(ctx, (struct sockaddr *)&sa, sizeof(sa)) != 0)
                return 0;
 
+       /* Rewire service [fc00::1]:60000 to backend [::1]:60124. */
+       if (ctx->user_port == bpf_htons(60000)) {
+               orig = bpf_sk_storage_get(&service_mapping, ctx->sk, 0,
+                                         BPF_SK_STORAGE_GET_F_CREATE);
+               if (!orig)
+                       return 0;
+
+               orig->addr[0] = ctx->user_ip6[0];
+               orig->addr[1] = ctx->user_ip6[1];
+               orig->addr[2] = ctx->user_ip6[2];
+               orig->addr[3] = ctx->user_ip6[3];
+               orig->port = ctx->user_port;
+
+               ctx->user_ip6[0] = 0;
+               ctx->user_ip6[1] = 0;
+               ctx->user_ip6[2] = 0;
+               ctx->user_ip6[3] = bpf_htonl(1);
+               ctx->user_port = bpf_htons(60124);
+       }
+       return 1;
+}
+
+SEC("cgroup/getsockname6")
+int getsockname6(struct bpf_sock_addr *ctx)
+{
+       /* Expose local server as [fc00::1]:60000 to client. */
+       if (ctx->user_port == bpf_htons(60124)) {
+               ctx->user_ip6[0] = bpf_htonl(0xfc000000);
+               ctx->user_ip6[1] = 0;
+               ctx->user_ip6[2] = 0;
+               ctx->user_ip6[3] = bpf_htonl(1);
+               ctx->user_port = bpf_htons(60000);
+       }
+       return 1;
+}
+
+SEC("cgroup/getpeername6")
+int getpeername6(struct bpf_sock_addr *ctx)
+{
+       struct svc_addr *orig;
+
+       /* Expose service [fc00::1]:60000 as peer instead of backend. */
+       if (ctx->user_port == bpf_htons(60124)) {
+               orig = bpf_sk_storage_get(&service_mapping, ctx->sk, 0, 0);
+               if (orig) {
+                       ctx->user_ip6[0] = orig->addr[0];
+                       ctx->user_ip6[1] = orig->addr[1];
+                       ctx->user_ip6[2] = orig->addr[2];
+                       ctx->user_ip6[3] = orig->addr[3];
+                       ctx->user_port = orig->port;
+               }
+       }
        return 1;
 }
index d2b38fa6a5b0fccc54829e20ee499fa066ea3b25..e83d0b48d80ca71624c69f95a214bf0fa4ad25bc 100644 (file)
@@ -73,6 +73,7 @@ int bpf_sk_lookup_test0(struct __sk_buff *skb)
 
        tuple_len = ipv4 ? sizeof(tuple->ipv4) : sizeof(tuple->ipv6);
        sk = bpf_sk_lookup_tcp(skb, tuple, tuple_len, BPF_F_CURRENT_NETNS, 0);
+       bpf_printk("sk=%d\n", sk ? 1 : 0);
        if (sk)
                bpf_sk_release(sk);
        return sk ? TC_ACT_OK : TC_ACT_UNSPEC;

diff --git a/tools/testing/selftests/bpf/progs/test_sockmap_kern.h b/tools/testing/selftests/bpf/progs/test_sockmap_kern.h
new file mode 100644 (file)
index 0000000..a443d36
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_sockmap_kern.h
@@ -0,0 +1,299 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2017-2018 Covalent IO, Inc. http://covalent.io */
+#include <stddef.h>
+#include <string.h>
+#include <linux/bpf.h>
+#include <linux/if_ether.h>
+#include <linux/if_packet.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/in.h>
+#include <linux/udp.h>
+#include <linux/tcp.h>
+#include <linux/pkt_cls.h>
+#include <sys/socket.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+
+/* Sockmap sample program connects a client and a backend together
+ * using cgroups.
+ *
+ *    client:X <---> frontend:80 client:X <---> backend:80
+ *
+ * For simplicity we hard code values here and bind 1:1. The hard
+ * coded values are part of the setup in sockmap.sh script that
+ * is associated with this BPF program.
+ *
+ * The bpf_printk is verbose and prints information as connections
+ * are established and verdicts are decided.
+ */
+
+struct {
+       __uint(type, TEST_MAP_TYPE);
+       __uint(max_entries, 20);
+       __uint(key_size, sizeof(int));
+       __uint(value_size, sizeof(int));
+} sock_map SEC(".maps");
+
+struct {
+       __uint(type, TEST_MAP_TYPE);
+       __uint(max_entries, 20);
+       __uint(key_size, sizeof(int));
+       __uint(value_size, sizeof(int));
+} sock_map_txmsg SEC(".maps");
+
+struct {
+       __uint(type, TEST_MAP_TYPE);
+       __uint(max_entries, 20);
+       __uint(key_size, sizeof(int));
+       __uint(value_size, sizeof(int));
+} sock_map_redir SEC(".maps");
+
+struct {
+       __uint(type, BPF_MAP_TYPE_ARRAY);
+       __uint(max_entries, 1);
+       __type(key, int);
+       __type(value, int);
+} sock_apply_bytes SEC(".maps");
+
+struct {
+       __uint(type, BPF_MAP_TYPE_ARRAY);
+       __uint(max_entries, 1);
+       __type(key, int);
+       __type(value, int);
+} sock_cork_bytes SEC(".maps");
+
+struct {
+       __uint(type, BPF_MAP_TYPE_ARRAY);
+       __uint(max_entries, 6);
+       __type(key, int);
+       __type(value, int);
+} sock_bytes SEC(".maps");
+
+struct {
+       __uint(type, BPF_MAP_TYPE_ARRAY);
+       __uint(max_entries, 1);
+       __type(key, int);
+       __type(value, int);
+} sock_redir_flags SEC(".maps");
+
+struct {
+       __uint(type, BPF_MAP_TYPE_ARRAY);
+       __uint(max_entries, 1);
+       __type(key, int);
+       __type(value, int);
+} sock_skb_opts SEC(".maps");
+
+SEC("sk_skb1")
+int bpf_prog1(struct __sk_buff *skb)
+{
+       return skb->len;
+}
+
+SEC("sk_skb2")
+int bpf_prog2(struct __sk_buff *skb)
+{
+       __u32 lport = skb->local_port;
+       __u32 rport = skb->remote_port;
+       int len, *f, ret, zero = 0;
+       __u64 flags = 0;
+
+       if (lport == 10000)
+               ret = 10;
+       else
+               ret = 1;
+
+       len = (__u32)skb->data_end - (__u32)skb->data;
+       f = bpf_map_lookup_elem(&sock_skb_opts, &zero);
+       if (f && *f) {
+               ret = 3;
+               flags = *f;
+       }
+
+#ifdef SOCKMAP
+       return bpf_sk_redirect_map(skb, &sock_map, ret, flags);
+#else
+       return bpf_sk_redirect_hash(skb, &sock_map, &ret, flags);
+#endif
+
+}
+
+SEC("sockops")
+int bpf_sockmap(struct bpf_sock_ops *skops)
+{
+       __u32 lport, rport;
+       int op, err = 0, index, key, ret;
+
+
+       op = (int) skops->op;
+
+       switch (op) {
+       case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
+               lport = skops->local_port;
+               rport = skops->remote_port;
+
+               if (lport == 10000) {
+                       ret = 1;
+#ifdef SOCKMAP
+                       err = bpf_sock_map_update(skops, &sock_map, &ret,
+                                                 BPF_NOEXIST);
+#else
+                       err = bpf_sock_hash_update(skops, &sock_map, &ret,
+                                                  BPF_NOEXIST);
+#endif
+               }
+               break;
+       case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
+               lport = skops->local_port;
+               rport = skops->remote_port;
+
+               if (bpf_ntohl(rport) == 10001) {
+                       ret = 10;
+#ifdef SOCKMAP
+                       err = bpf_sock_map_update(skops, &sock_map, &ret,
+                                                 BPF_NOEXIST);
+#else
+                       err = bpf_sock_hash_update(skops, &sock_map, &ret,
+                                                  BPF_NOEXIST);
+#endif
+               }
+               break;
+       default:
+               break;
+       }
+
+       return 0;
+}
+
+SEC("sk_msg1")
+int bpf_prog4(struct sk_msg_md *msg)
+{
+       int *bytes, zero = 0, one = 1, two = 2, three = 3, four = 4, five = 5;
+       int *start, *end, *start_push, *end_push, *start_pop, *pop;
+
+       bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero);
+       if (bytes)
+               bpf_msg_apply_bytes(msg, *bytes);
+       bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero);
+       if (bytes)
+               bpf_msg_cork_bytes(msg, *bytes);
+       start = bpf_map_lookup_elem(&sock_bytes, &zero);
+       end = bpf_map_lookup_elem(&sock_bytes, &one);
+       if (start && end)
+               bpf_msg_pull_data(msg, *start, *end, 0);
+       start_push = bpf_map_lookup_elem(&sock_bytes, &two);
+       end_push = bpf_map_lookup_elem(&sock_bytes, &three);
+       if (start_push && end_push)
+               bpf_msg_push_data(msg, *start_push, *end_push, 0);
+       start_pop = bpf_map_lookup_elem(&sock_bytes, &four);
+       pop = bpf_map_lookup_elem(&sock_bytes, &five);
+       if (start_pop && pop)
+               bpf_msg_pop_data(msg, *start_pop, *pop, 0);
+       return SK_PASS;
+}
+
+SEC("sk_msg2")
+int bpf_prog6(struct sk_msg_md *msg)
+{
+       int zero = 0, one = 1, two = 2, three = 3, four = 4, five = 5, key = 0;
+       int *bytes, *start, *end, *start_push, *end_push, *start_pop, *pop, *f;
+       __u64 flags = 0;
+
+       bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero);
+       if (bytes)
+               bpf_msg_apply_bytes(msg, *bytes);
+       bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero);
+       if (bytes)
+               bpf_msg_cork_bytes(msg, *bytes);
+
+       start = bpf_map_lookup_elem(&sock_bytes, &zero);
+       end = bpf_map_lookup_elem(&sock_bytes, &one);
+       if (start && end)
+               bpf_msg_pull_data(msg, *start, *end, 0);
+
+       start_push = bpf_map_lookup_elem(&sock_bytes, &two);
+       end_push = bpf_map_lookup_elem(&sock_bytes, &three);
+       if (start_push && end_push)
+               bpf_msg_push_data(msg, *start_push, *end_push, 0);
+
+       start_pop = bpf_map_lookup_elem(&sock_bytes, &four);
+       pop = bpf_map_lookup_elem(&sock_bytes, &five);
+       if (start_pop && pop)
+               bpf_msg_pop_data(msg, *start_pop, *pop, 0);
+
+       f = bpf_map_lookup_elem(&sock_redir_flags, &zero);
+       if (f && *f) {
+               key = 2;
+               flags = *f;
+       }
+#ifdef SOCKMAP
+       return bpf_msg_redirect_map(msg, &sock_map_redir, key, flags);
+#else
+       return bpf_msg_redirect_hash(msg, &sock_map_redir, &key, flags);
+#endif
+}
+
+SEC("sk_msg3")
+int bpf_prog8(struct sk_msg_md *msg)
+{
+       void *data_end = (void *)(long) msg->data_end;
+       void *data = (void *)(long) msg->data;
+       int ret = 0, *bytes, zero = 0;
+
+       bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero);
+       if (bytes) {
+               ret = bpf_msg_apply_bytes(msg, *bytes);
+               if (ret)
+                       return SK_DROP;
+       } else {
+               return SK_DROP;
+       }
+       return SK_PASS;
+}
+SEC("sk_msg4")
+int bpf_prog9(struct sk_msg_md *msg)
+{
+       void *data_end = (void *)(long) msg->data_end;
+       void *data = (void *)(long) msg->data;
+       int ret = 0, *bytes, zero = 0;
+
+       bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero);
+       if (bytes) {
+               if (((__u64)data_end - (__u64)data) >= *bytes)
+                       return SK_PASS;
+               ret = bpf_msg_cork_bytes(msg, *bytes);
+               if (ret)
+                       return SK_DROP;
+       }
+       return SK_PASS;
+}
+
+SEC("sk_msg5")
+int bpf_prog10(struct sk_msg_md *msg)
+{
+       int *bytes, *start, *end, *start_push, *end_push, *start_pop, *pop;
+       int zero = 0, one = 1, two = 2, three = 3, four = 4, five = 5;
+
+       bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero);
+       if (bytes)
+               bpf_msg_apply_bytes(msg, *bytes);
+       bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero);
+       if (bytes)
+               bpf_msg_cork_bytes(msg, *bytes);
+       start = bpf_map_lookup_elem(&sock_bytes, &zero);
+       end = bpf_map_lookup_elem(&sock_bytes, &one);
+       if (start && end)
+               bpf_msg_pull_data(msg, *start, *end, 0);
+       start_push = bpf_map_lookup_elem(&sock_bytes, &two);
+       end_push = bpf_map_lookup_elem(&sock_bytes, &three);
+       if (start_push && end_push)
+               bpf_msg_push_data(msg, *start_push, *end_push, 0);
+       start_pop = bpf_map_lookup_elem(&sock_bytes, &four);
+       pop = bpf_map_lookup_elem(&sock_bytes, &five);
+       if (start_pop && pop)
+               bpf_msg_pop_data(msg, *start_pop, *pop, 0);
+       return SK_DROP;
+}
+
+int _version SEC("version") = 1;
+char _license[] SEC("license") = "GPL";
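
This header declares its maps with __uint(type, TEST_MAP_TYPE) and switches between the map and hash helper variants on #ifdef SOCKMAP, so the same program source can be compiled into both a sockmap and a sockhash flavour. The wrapper translation units are not part of the hunk above; presumably they amount to little more than the following sketch:

	/* sockmap flavour (e.g. a test_sockmap_kern.c wrapper) */
	#define SOCKMAP
	#define TEST_MAP_TYPE BPF_MAP_TYPE_SOCKMAP
	#include "test_sockmap_kern.h"

	/* sockhash flavour: SOCKMAP stays undefined, so the bpf_*_hash()
	 * helper variants are selected instead.
	 */
	#define TEST_MAP_TYPE BPF_MAP_TYPE_SOCKHASH
	#include "test_sockmap_kern.h"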
diff --git a/tools/testing/selftests/bpf/test_align.c b/tools/testing/selftests/bpf/test_align.c
deleted file mode 100644 (file)
index 0262f7b..0000000
--- a/tools/testing/selftests/bpf/test_align.c
+++ /dev/null
@@ -1,719 +0,0 @@
-#include <asm/types.h>
-#include <linux/types.h>
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <errno.h>
-#include <string.h>
-#include <stddef.h>
-#include <stdbool.h>
-
-#include <linux/unistd.h>
-#include <linux/filter.h>
-#include <linux/bpf_perf_event.h>
-#include <linux/bpf.h>
-
-#include <bpf/bpf.h>
-
-#include "../../../include/linux/filter.h"
-#include "bpf_rlimit.h"
-#include "bpf_util.h"
-
-#define MAX_INSNS      512
-#define MAX_MATCHES    16
-
-struct bpf_reg_match {
-       unsigned int line;
-       const char *match;
-};
-
-struct bpf_align_test {
-       const char *descr;
-       struct bpf_insn insns[MAX_INSNS];
-       enum {
-               UNDEF,
-               ACCEPT,
-               REJECT
-       } result;
-       enum bpf_prog_type prog_type;
-       /* Matches must be in order of increasing line */
-       struct bpf_reg_match matches[MAX_MATCHES];
-};
-
-static struct bpf_align_test tests[] = {
-       /* Four tests of known constants.  These aren't staggeringly
-        * interesting since we track exact values now.
-        */
-       {
-               .descr = "mov",
-               .insns = {
-                       BPF_MOV64_IMM(BPF_REG_3, 2),
-                       BPF_MOV64_IMM(BPF_REG_3, 4),
-                       BPF_MOV64_IMM(BPF_REG_3, 8),
-                       BPF_MOV64_IMM(BPF_REG_3, 16),
-                       BPF_MOV64_IMM(BPF_REG_3, 32),
-                       BPF_MOV64_IMM(BPF_REG_0, 0),
-                       BPF_EXIT_INSN(),
-               },
-               .prog_type = BPF_PROG_TYPE_SCHED_CLS,
-               .matches = {
-                       {1, "R1=ctx(id=0,off=0,imm=0)"},
-                       {1, "R10=fp0"},
-                       {1, "R3_w=inv2"},
-                       {2, "R3_w=inv4"},
-                       {3, "R3_w=inv8"},
-                       {4, "R3_w=inv16"},
-                       {5, "R3_w=inv32"},
-               },
-       },
-       {
-               .descr = "shift",
-               .insns = {
-                       BPF_MOV64_IMM(BPF_REG_3, 1),
-                       BPF_ALU64_IMM(BPF_LSH, BPF_REG_3, 1),
-                       BPF_ALU64_IMM(BPF_LSH, BPF_REG_3, 1),
-                       BPF_ALU64_IMM(BPF_LSH, BPF_REG_3, 1),
-                       BPF_ALU64_IMM(BPF_LSH, BPF_REG_3, 1),
-                       BPF_ALU64_IMM(BPF_RSH, BPF_REG_3, 4),
-                       BPF_MOV64_IMM(BPF_REG_4, 32),
-                       BPF_ALU64_IMM(BPF_RSH, BPF_REG_4, 1),
-                       BPF_ALU64_IMM(BPF_RSH, BPF_REG_4, 1),
-                       BPF_ALU64_IMM(BPF_RSH, BPF_REG_4, 1),
-                       BPF_ALU64_IMM(BPF_RSH, BPF_REG_4, 1),
-                       BPF_MOV64_IMM(BPF_REG_0, 0),
-                       BPF_EXIT_INSN(),
-               },
-               .prog_type = BPF_PROG_TYPE_SCHED_CLS,
-               .matches = {
-                       {1, "R1=ctx(id=0,off=0,imm=0)"},
-                       {1, "R10=fp0"},
-                       {1, "R3_w=inv1"},
-                       {2, "R3_w=inv2"},
-                       {3, "R3_w=inv4"},
-                       {4, "R3_w=inv8"},
-                       {5, "R3_w=inv16"},
-                       {6, "R3_w=inv1"},
-                       {7, "R4_w=inv32"},
-                       {8, "R4_w=inv16"},
-                       {9, "R4_w=inv8"},
-                       {10, "R4_w=inv4"},
-                       {11, "R4_w=inv2"},
-               },
-       },
-       {
-               .descr = "addsub",
-               .insns = {
-                       BPF_MOV64_IMM(BPF_REG_3, 4),
-                       BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, 4),
-                       BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, 2),
-                       BPF_MOV64_IMM(BPF_REG_4, 8),
-                       BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 4),
-                       BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 2),
-                       BPF_MOV64_IMM(BPF_REG_0, 0),
-                       BPF_EXIT_INSN(),
-               },
-               .prog_type = BPF_PROG_TYPE_SCHED_CLS,
-               .matches = {
-                       {1, "R1=ctx(id=0,off=0,imm=0)"},
-                       {1, "R10=fp0"},
-                       {1, "R3_w=inv4"},
-                       {2, "R3_w=inv8"},
-                       {3, "R3_w=inv10"},
-                       {4, "R4_w=inv8"},
-                       {5, "R4_w=inv12"},
-                       {6, "R4_w=inv14"},
-               },
-       },
-       {
-               .descr = "mul",
-               .insns = {
-                       BPF_MOV64_IMM(BPF_REG_3, 7),
-                       BPF_ALU64_IMM(BPF_MUL, BPF_REG_3, 1),
-                       BPF_ALU64_IMM(BPF_MUL, BPF_REG_3, 2),
-                       BPF_ALU64_IMM(BPF_MUL, BPF_REG_3, 4),
-                       BPF_MOV64_IMM(BPF_REG_0, 0),
-                       BPF_EXIT_INSN(),
-               },
-               .prog_type = BPF_PROG_TYPE_SCHED_CLS,
-               .matches = {
-                       {1, "R1=ctx(id=0,off=0,imm=0)"},
-                       {1, "R10=fp0"},
-                       {1, "R3_w=inv7"},
-                       {2, "R3_w=inv7"},
-                       {3, "R3_w=inv14"},
-                       {4, "R3_w=inv56"},
-               },
-       },
-
-       /* Tests using unknown values */
-#define PREP_PKT_POINTERS \
-       BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, \
-                   offsetof(struct __sk_buff, data)), \
-       BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, \
-                   offsetof(struct __sk_buff, data_end))
-
-#define LOAD_UNKNOWN(DST_REG) \
-       PREP_PKT_POINTERS, \
-       BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), \
-       BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8), \
-       BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_0, 1), \
-       BPF_EXIT_INSN(), \
-       BPF_LDX_MEM(BPF_B, DST_REG, BPF_REG_2, 0)
-
-       {
-               .descr = "unknown shift",
-               .insns = {
-                       LOAD_UNKNOWN(BPF_REG_3),
-                       BPF_ALU64_IMM(BPF_LSH, BPF_REG_3, 1),
-                       BPF_ALU64_IMM(BPF_LSH, BPF_REG_3, 1),
-                       BPF_ALU64_IMM(BPF_LSH, BPF_REG_3, 1),
-                       BPF_ALU64_IMM(BPF_LSH, BPF_REG_3, 1),
-                       LOAD_UNKNOWN(BPF_REG_4),
-                       BPF_ALU64_IMM(BPF_LSH, BPF_REG_4, 5),
-                       BPF_ALU64_IMM(BPF_RSH, BPF_REG_4, 1),
-                       BPF_ALU64_IMM(BPF_RSH, BPF_REG_4, 1),
-                       BPF_ALU64_IMM(BPF_RSH, BPF_REG_4, 1),
-                       BPF_ALU64_IMM(BPF_RSH, BPF_REG_4, 1),
-                       BPF_MOV64_IMM(BPF_REG_0, 0),
-                       BPF_EXIT_INSN(),
-               },
-               .prog_type = BPF_PROG_TYPE_SCHED_CLS,
-               .matches = {
-                       {7, "R0_w=pkt(id=0,off=8,r=8,imm=0)"},
-                       {7, "R3_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"},
-                       {8, "R3_w=inv(id=0,umax_value=510,var_off=(0x0; 0x1fe))"},
-                       {9, "R3_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
-                       {10, "R3_w=inv(id=0,umax_value=2040,var_off=(0x0; 0x7f8))"},
-                       {11, "R3_w=inv(id=0,umax_value=4080,var_off=(0x0; 0xff0))"},
-                       {18, "R3=pkt_end(id=0,off=0,imm=0)"},
-                       {18, "R4_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"},
-                       {19, "R4_w=inv(id=0,umax_value=8160,var_off=(0x0; 0x1fe0))"},
-                       {20, "R4_w=inv(id=0,umax_value=4080,var_off=(0x0; 0xff0))"},
-                       {21, "R4_w=inv(id=0,umax_value=2040,var_off=(0x0; 0x7f8))"},
-                       {22, "R4_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
-                       {23, "R4_w=inv(id=0,umax_value=510,var_off=(0x0; 0x1fe))"},
-               },
-       },
-       {
-               .descr = "unknown mul",
-               .insns = {
-                       LOAD_UNKNOWN(BPF_REG_3),
-                       BPF_MOV64_REG(BPF_REG_4, BPF_REG_3),
-                       BPF_ALU64_IMM(BPF_MUL, BPF_REG_4, 1),
-                       BPF_MOV64_REG(BPF_REG_4, BPF_REG_3),
-                       BPF_ALU64_IMM(BPF_MUL, BPF_REG_4, 2),
-                       BPF_MOV64_REG(BPF_REG_4, BPF_REG_3),
-                       BPF_ALU64_IMM(BPF_MUL, BPF_REG_4, 4),
-                       BPF_MOV64_REG(BPF_REG_4, BPF_REG_3),
-                       BPF_ALU64_IMM(BPF_MUL, BPF_REG_4, 8),
-                       BPF_ALU64_IMM(BPF_MUL, BPF_REG_4, 2),
-                       BPF_MOV64_IMM(BPF_REG_0, 0),
-                       BPF_EXIT_INSN(),
-               },
-               .prog_type = BPF_PROG_TYPE_SCHED_CLS,
-               .matches = {
-                       {7, "R3_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"},
-                       {8, "R4_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"},
-                       {9, "R4_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"},
-                       {10, "R4_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"},
-                       {11, "R4_w=inv(id=0,umax_value=510,var_off=(0x0; 0x1fe))"},
-                       {12, "R4_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"},
-                       {13, "R4_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
-                       {14, "R4_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"},
-                       {15, "R4_w=inv(id=0,umax_value=2040,var_off=(0x0; 0x7f8))"},
-                       {16, "R4_w=inv(id=0,umax_value=4080,var_off=(0x0; 0xff0))"},
-               },
-       },
-       {
-               .descr = "packet const offset",
-               .insns = {
-                       PREP_PKT_POINTERS,
-                       BPF_MOV64_REG(BPF_REG_5, BPF_REG_2),
-
-                       BPF_MOV64_IMM(BPF_REG_0, 0),
-
-                       /* Skip over ethernet header.  */
-                       BPF_ALU64_IMM(BPF_ADD, BPF_REG_5, 14),
-                       BPF_MOV64_REG(BPF_REG_4, BPF_REG_5),
-                       BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 4),
-                       BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_4, 1),
-                       BPF_EXIT_INSN(),
-
-                       BPF_LDX_MEM(BPF_B, BPF_REG_4, BPF_REG_5, 0),
-                       BPF_LDX_MEM(BPF_B, BPF_REG_4, BPF_REG_5, 1),
-                       BPF_LDX_MEM(BPF_B, BPF_REG_4, BPF_REG_5, 2),
-                       BPF_LDX_MEM(BPF_B, BPF_REG_4, BPF_REG_5, 3),
-                       BPF_LDX_MEM(BPF_H, BPF_REG_4, BPF_REG_5, 0),
-                       BPF_LDX_MEM(BPF_H, BPF_REG_4, BPF_REG_5, 2),
-                       BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_5, 0),
-
-                       BPF_MOV64_IMM(BPF_REG_0, 0),
-                       BPF_EXIT_INSN(),
-               },
-               .prog_type = BPF_PROG_TYPE_SCHED_CLS,
-               .matches = {
-                       {4, "R5_w=pkt(id=0,off=0,r=0,imm=0)"},
-                       {5, "R5_w=pkt(id=0,off=14,r=0,imm=0)"},
-                       {6, "R4_w=pkt(id=0,off=14,r=0,imm=0)"},
-                       {10, "R2=pkt(id=0,off=0,r=18,imm=0)"},
-                       {10, "R5=pkt(id=0,off=14,r=18,imm=0)"},
-                       {10, "R4_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"},
-                       {14, "R4_w=inv(id=0,umax_value=65535,var_off=(0x0; 0xffff))"},
-                       {15, "R4_w=inv(id=0,umax_value=65535,var_off=(0x0; 0xffff))"},
-               },
-       },
-       {
-               .descr = "packet variable offset",
-               .insns = {
-                       LOAD_UNKNOWN(BPF_REG_6),
-                       BPF_ALU64_IMM(BPF_LSH, BPF_REG_6, 2),
-
-                       /* First, add a constant to the R5 packet pointer,
-                        * then a variable with a known alignment.
-                        */
-                       BPF_MOV64_REG(BPF_REG_5, BPF_REG_2),
-                       BPF_ALU64_IMM(BPF_ADD, BPF_REG_5, 14),
-                       BPF_ALU64_REG(BPF_ADD, BPF_REG_5, BPF_REG_6),
-                       BPF_MOV64_REG(BPF_REG_4, BPF_REG_5),
-                       BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 4),
-                       BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_4, 1),
-                       BPF_EXIT_INSN(),
-                       BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_5, 0),
-
-                       /* Now, test in the other direction.  Adding first
-                        * the variable offset to R5, then the constant.
-                        */
-                       BPF_MOV64_REG(BPF_REG_5, BPF_REG_2),
-                       BPF_ALU64_REG(BPF_ADD, BPF_REG_5, BPF_REG_6),
-                       BPF_ALU64_IMM(BPF_ADD, BPF_REG_5, 14),
-                       BPF_MOV64_REG(BPF_REG_4, BPF_REG_5),
-                       BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 4),
-                       BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_4, 1),
-                       BPF_EXIT_INSN(),
-                       BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_5, 0),
-
-                       /* Test multiple accumulations of unknown values
-                        * into a packet pointer.
-                        */
-                       BPF_MOV64_REG(BPF_REG_5, BPF_REG_2),
-                       BPF_ALU64_IMM(BPF_ADD, BPF_REG_5, 14),
-                       BPF_ALU64_REG(BPF_ADD, BPF_REG_5, BPF_REG_6),
-                       BPF_ALU64_IMM(BPF_ADD, BPF_REG_5, 4),
-                       BPF_ALU64_REG(BPF_ADD, BPF_REG_5, BPF_REG_6),
-                       BPF_MOV64_REG(BPF_REG_4, BPF_REG_5),
-                       BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 4),
-                       BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_4, 1),
-                       BPF_EXIT_INSN(),
-                       BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_5, 0),
-
-                       BPF_MOV64_IMM(BPF_REG_0, 0),
-                       BPF_EXIT_INSN(),
-               },
-               .prog_type = BPF_PROG_TYPE_SCHED_CLS,
-               .matches = {
-                       /* Calculated offset in R6 has unknown value, but known
-                        * alignment of 4.
-                        */
-                       {8, "R2_w=pkt(id=0,off=0,r=8,imm=0)"},
-                       {8, "R6_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
-                       /* Offset is added to packet pointer R5, resulting in
-                        * known fixed offset, and variable offset from R6.
-                        */
-                       {11, "R5_w=pkt(id=1,off=14,r=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
-                       /* At the time the word size load is performed from R5,
-                        * it's total offset is NET_IP_ALIGN + reg->off (0) +
-                        * reg->aux_off (14) which is 16.  Then the variable
-                        * offset is considered using reg->aux_off_align which
-                        * is 4 and meets the load's requirements.
-                        */
-                       {15, "R4=pkt(id=1,off=18,r=18,umax_value=1020,var_off=(0x0; 0x3fc))"},
-                       {15, "R5=pkt(id=1,off=14,r=18,umax_value=1020,var_off=(0x0; 0x3fc))"},
-                       /* Variable offset is added to R5 packet pointer,
-                        * resulting in auxiliary alignment of 4.
-                        */
-                       {18, "R5_w=pkt(id=2,off=0,r=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
-                       /* Constant offset is added to R5, resulting in
-                        * reg->off of 14.
-                        */
-                       {19, "R5_w=pkt(id=2,off=14,r=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
-                       /* At the time the word size load is performed from R5,
-                        * its total fixed offset is NET_IP_ALIGN + reg->off
-                        * (14) which is 16.  Then the variable offset is 4-byte
-                        * aligned, so the total offset is 4-byte aligned and
-                        * meets the load's requirements.
-                        */
-                       {23, "R4=pkt(id=2,off=18,r=18,umax_value=1020,var_off=(0x0; 0x3fc))"},
-                       {23, "R5=pkt(id=2,off=14,r=18,umax_value=1020,var_off=(0x0; 0x3fc))"},
-                       /* Constant offset is added to R5 packet pointer,
-                        * resulting in reg->off value of 14.
-                        */
-                       {26, "R5_w=pkt(id=0,off=14,r=8"},
-                       /* Variable offset is added to R5, resulting in a
-                        * variable offset of (4n).
-                        */
-                       {27, "R5_w=pkt(id=3,off=14,r=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
-                       /* Constant is added to R5 again, setting reg->off to 18. */
-                       {28, "R5_w=pkt(id=3,off=18,r=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
-                       /* And once more we add a variable; resulting var_off
-                        * is still (4n), fixed offset is not changed.
-                        * Also, we create a new reg->id.
-                        */
-                       {29, "R5_w=pkt(id=4,off=18,r=0,umax_value=2040,var_off=(0x0; 0x7fc))"},
-                       /* At the time the word size load is performed from R5,
-                        * its total fixed offset is NET_IP_ALIGN + reg->off (18)
-                        * which is 20.  Then the variable offset is (4n), so
-                        * the total offset is 4-byte aligned and meets the
-                        * load's requirements.
-                        */
-                       {33, "R4=pkt(id=4,off=22,r=22,umax_value=2040,var_off=(0x0; 0x7fc))"},
-                       {33, "R5=pkt(id=4,off=18,r=22,umax_value=2040,var_off=(0x0; 0x7fc))"},
-               },
-       },
-       {
-               .descr = "packet variable offset 2",
-               .insns = {
-                       /* Create an unknown offset, (4n+2)-aligned */
-                       LOAD_UNKNOWN(BPF_REG_6),
-                       BPF_ALU64_IMM(BPF_LSH, BPF_REG_6, 2),
-                       BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, 14),
-                       /* Add it to the packet pointer */
-                       BPF_MOV64_REG(BPF_REG_5, BPF_REG_2),
-                       BPF_ALU64_REG(BPF_ADD, BPF_REG_5, BPF_REG_6),
-                       /* Check bounds and perform a read */
-                       BPF_MOV64_REG(BPF_REG_4, BPF_REG_5),
-                       BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 4),
-                       BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_4, 1),
-                       BPF_EXIT_INSN(),
-                       BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_5, 0),
-                       /* Make a (4n) offset from the value we just read */
-                       BPF_ALU64_IMM(BPF_AND, BPF_REG_6, 0xff),
-                       BPF_ALU64_IMM(BPF_LSH, BPF_REG_6, 2),
-                       /* Add it to the packet pointer */
-                       BPF_ALU64_REG(BPF_ADD, BPF_REG_5, BPF_REG_6),
-                       /* Check bounds and perform a read */
-                       BPF_MOV64_REG(BPF_REG_4, BPF_REG_5),
-                       BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 4),
-                       BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_4, 1),
-                       BPF_EXIT_INSN(),
-                       BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_5, 0),
-                       BPF_MOV64_IMM(BPF_REG_0, 0),
-                       BPF_EXIT_INSN(),
-               },
-               .prog_type = BPF_PROG_TYPE_SCHED_CLS,
-               .matches = {
-                       /* Calculated offset in R6 has unknown value, but known
-                        * alignment of 4.
-                        */
-                       {8, "R2_w=pkt(id=0,off=0,r=8,imm=0)"},
-                       {8, "R6_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
-                       /* Adding 14 makes R6 be (4n+2) */
-                       {9, "R6_w=inv(id=0,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc))"},
-                       /* Packet pointer has (4n+2) offset */
-                       {11, "R5_w=pkt(id=1,off=0,r=0,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc))"},
-                       {13, "R4=pkt(id=1,off=4,r=0,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc))"},
-                       /* At the time the word size load is performed from R5,
-                        * its total fixed offset is NET_IP_ALIGN + reg->off (0)
-                        * which is 2.  Then the variable offset is (4n+2), so
-                        * the total offset is 4-byte aligned and meets the
-                        * load's requirements.
-                        */
-                       {15, "R5=pkt(id=1,off=0,r=4,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc))"},
-                       /* Newly read value in R6 was shifted left by 2, so has
-                        * known alignment of 4.
-                        */
-                       {18, "R6_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
-                       /* Added (4n) to packet pointer's (4n+2) var_off, giving
-                        * another (4n+2).
-                        */
-                       {19, "R5_w=pkt(id=2,off=0,r=0,umin_value=14,umax_value=2054,var_off=(0x2; 0xffc))"},
-                       {21, "R4=pkt(id=2,off=4,r=0,umin_value=14,umax_value=2054,var_off=(0x2; 0xffc))"},
-                       /* At the time the word size load is performed from R5,
-                        * its total fixed offset is NET_IP_ALIGN + reg->off (0)
-                        * which is 2.  Then the variable offset is (4n+2), so
-                        * the total offset is 4-byte aligned and meets the
-                        * load's requirements.
-                        */
-                       {23, "R5=pkt(id=2,off=0,r=4,umin_value=14,umax_value=2054,var_off=(0x2; 0xffc))"},
-               },
-       },
-       {
-               .descr = "dubious pointer arithmetic",
-               .insns = {
-                       PREP_PKT_POINTERS,
-                       BPF_MOV64_IMM(BPF_REG_0, 0),
-                       /* (ptr - ptr) << 2 */
-                       BPF_MOV64_REG(BPF_REG_5, BPF_REG_3),
-                       BPF_ALU64_REG(BPF_SUB, BPF_REG_5, BPF_REG_2),
-                       BPF_ALU64_IMM(BPF_LSH, BPF_REG_5, 2),
-                       /* We have a (4n) value.  Let's make a packet offset
-                        * out of it.  First add 14, to make it a (4n+2)
-                        */
-                       BPF_ALU64_IMM(BPF_ADD, BPF_REG_5, 14),
-                       /* Then make sure it's nonnegative */
-                       BPF_JMP_IMM(BPF_JSGE, BPF_REG_5, 0, 1),
-                       BPF_EXIT_INSN(),
-                       /* Add it to packet pointer */
-                       BPF_MOV64_REG(BPF_REG_6, BPF_REG_2),
-                       BPF_ALU64_REG(BPF_ADD, BPF_REG_6, BPF_REG_5),
-                       /* Check bounds and perform a read */
-                       BPF_MOV64_REG(BPF_REG_4, BPF_REG_6),
-                       BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 4),
-                       BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_4, 1),
-                       BPF_EXIT_INSN(),
-                       BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_6, 0),
-                       BPF_EXIT_INSN(),
-               },
-               .prog_type = BPF_PROG_TYPE_SCHED_CLS,
-               .result = REJECT,
-               .matches = {
-                       {4, "R5_w=pkt_end(id=0,off=0,imm=0)"},
-                       /* (ptr - ptr) << 2 == unknown, (4n) */
-                       {6, "R5_w=inv(id=0,smax_value=9223372036854775804,umax_value=18446744073709551612,var_off=(0x0; 0xfffffffffffffffc))"},
-                       /* (4n) + 14 == (4n+2).  We blow our bounds, because
-                        * the add could overflow.
-                        */
-                       {7, "R5_w=inv(id=0,var_off=(0x2; 0xfffffffffffffffc))"},
-                       /* Checked s>=0 */
-                       {9, "R5=inv(id=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc))"},
-                       /* packet pointer + nonnegative (4n+2) */
-                       {11, "R6_w=pkt(id=1,off=0,r=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc))"},
-                       {13, "R4_w=pkt(id=1,off=4,r=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc))"},
-                       /* NET_IP_ALIGN + (4n+2) == (4n), alignment is fine.
-                        * We checked the bounds, but it might have been able
-                        * to overflow if the packet pointer started in the
-                        * upper half of the address space.
-                        * So we did not get a 'range' on R6, and the access
-                        * attempt will fail.
-                        */
-                       {15, "R6_w=pkt(id=1,off=0,r=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc))"},
-               }
-       },
-       {
-               .descr = "variable subtraction",
-               .insns = {
-                       /* Create an unknown offset, (4n+2)-aligned */
-                       LOAD_UNKNOWN(BPF_REG_6),
-                       BPF_MOV64_REG(BPF_REG_7, BPF_REG_6),
-                       BPF_ALU64_IMM(BPF_LSH, BPF_REG_6, 2),
-                       BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, 14),
-                       /* Create another unknown, (4n)-aligned, and subtract
-                        * it from the first one
-                        */
-                       BPF_ALU64_IMM(BPF_LSH, BPF_REG_7, 2),
-                       BPF_ALU64_REG(BPF_SUB, BPF_REG_6, BPF_REG_7),
-                       /* Bounds-check the result */
-                       BPF_JMP_IMM(BPF_JSGE, BPF_REG_6, 0, 1),
-                       BPF_EXIT_INSN(),
-                       /* Add it to the packet pointer */
-                       BPF_MOV64_REG(BPF_REG_5, BPF_REG_2),
-                       BPF_ALU64_REG(BPF_ADD, BPF_REG_5, BPF_REG_6),
-                       /* Check bounds and perform a read */
-                       BPF_MOV64_REG(BPF_REG_4, BPF_REG_5),
-                       BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 4),
-                       BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_4, 1),
-                       BPF_EXIT_INSN(),
-                       BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_5, 0),
-                       BPF_EXIT_INSN(),
-               },
-               .prog_type = BPF_PROG_TYPE_SCHED_CLS,
-               .matches = {
-                       /* Calculated offset in R6 has unknown value, but known
-                        * alignment of 4.
-                        */
-                       {7, "R2_w=pkt(id=0,off=0,r=8,imm=0)"},
-                       {9, "R6_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
-                       /* Adding 14 makes R6 be (4n+2) */
-                       {10, "R6_w=inv(id=0,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc))"},
-                       /* New unknown value in R7 is (4n) */
-                       {11, "R7_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
-                       /* Subtracting it from R6 blows our unsigned bounds */
-                       {12, "R6=inv(id=0,smin_value=-1006,smax_value=1034,var_off=(0x2; 0xfffffffffffffffc))"},
-                       /* Checked s>= 0 */
-                       {14, "R6=inv(id=0,umin_value=2,umax_value=1034,var_off=(0x2; 0x7fc))"},
-                       /* At the time the word size load is performed from R5,
-                        * its total fixed offset is NET_IP_ALIGN + reg->off (0)
-                        * which is 2.  Then the variable offset is (4n+2), so
-                        * the total offset is 4-byte aligned and meets the
-                        * load's requirements.
-                        */
-                       {20, "R5=pkt(id=1,off=0,r=4,umin_value=2,umax_value=1034,var_off=(0x2; 0x7fc))"},
-               },
-       },
-       {
-               .descr = "pointer variable subtraction",
-               .insns = {
-                       /* Create an unknown offset, (4n+2)-aligned and bounded
-                        * to [14,74]
-                        */
-                       LOAD_UNKNOWN(BPF_REG_6),
-                       BPF_MOV64_REG(BPF_REG_7, BPF_REG_6),
-                       BPF_ALU64_IMM(BPF_AND, BPF_REG_6, 0xf),
-                       BPF_ALU64_IMM(BPF_LSH, BPF_REG_6, 2),
-                       BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, 14),
-                       /* Subtract it from the packet pointer */
-                       BPF_MOV64_REG(BPF_REG_5, BPF_REG_2),
-                       BPF_ALU64_REG(BPF_SUB, BPF_REG_5, BPF_REG_6),
-                       /* Create another unknown, (4n)-aligned and >= 74.
-                        * That in fact means >= 76, since 74 % 4 == 2
-                        */
-                       BPF_ALU64_IMM(BPF_LSH, BPF_REG_7, 2),
-                       BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, 76),
-                       /* Add it to the packet pointer */
-                       BPF_ALU64_REG(BPF_ADD, BPF_REG_5, BPF_REG_7),
-                       /* Check bounds and perform a read */
-                       BPF_MOV64_REG(BPF_REG_4, BPF_REG_5),
-                       BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 4),
-                       BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_4, 1),
-                       BPF_EXIT_INSN(),
-                       BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_5, 0),
-                       BPF_EXIT_INSN(),
-               },
-               .prog_type = BPF_PROG_TYPE_SCHED_CLS,
-               .matches = {
-                       /* Calculated offset in R6 has unknown value, but known
-                        * alignment of 4.
-                        */
-                       {7, "R2_w=pkt(id=0,off=0,r=8,imm=0)"},
-                       {10, "R6_w=inv(id=0,umax_value=60,var_off=(0x0; 0x3c))"},
-                       /* Adding 14 makes R6 be (4n+2) */
-                       {11, "R6_w=inv(id=0,umin_value=14,umax_value=74,var_off=(0x2; 0x7c))"},
-                       /* Subtracting from packet pointer overflows ubounds */
-                       {13, "R5_w=pkt(id=1,off=0,r=8,umin_value=18446744073709551542,umax_value=18446744073709551602,var_off=(0xffffffffffffff82; 0x7c))"},
-                       /* New unknown value in R7 is (4n), >= 76 */
-                       {15, "R7_w=inv(id=0,umin_value=76,umax_value=1096,var_off=(0x0; 0x7fc))"},
-                       /* Adding it to packet pointer gives nice bounds again */
-                       {16, "R5_w=pkt(id=2,off=0,r=0,umin_value=2,umax_value=1082,var_off=(0x2; 0x7fc))"},
-                       /* At the time the word size load is performed from R5,
-                        * its total fixed offset is NET_IP_ALIGN + reg->off (0)
-                        * which is 2.  Then the variable offset is (4n+2), so
-                        * the total offset is 4-byte aligned and meets the
-                        * load's requirements.
-                        */
-                       {20, "R5=pkt(id=2,off=0,r=4,umin_value=2,umax_value=1082,var_off=(0x2; 0x7fc))"},
-               },
-       },
-};
-
-static int probe_filter_length(const struct bpf_insn *fp)
-{
-       int len;
-
-       for (len = MAX_INSNS - 1; len > 0; --len)
-               if (fp[len].code != 0 || fp[len].imm != 0)
-                       break;
-       return len + 1;
-}
-
-static char bpf_vlog[32768];
-
-static int do_test_single(struct bpf_align_test *test)
-{
-       struct bpf_insn *prog = test->insns;
-       int prog_type = test->prog_type;
-       char bpf_vlog_copy[32768];
-       const char *line_ptr;
-       int cur_line = -1;
-       int prog_len, i;
-       int fd_prog;
-       int ret;
-
-       prog_len = probe_filter_length(prog);
-       fd_prog = bpf_verify_program(prog_type ? : BPF_PROG_TYPE_SOCKET_FILTER,
-                                    prog, prog_len, BPF_F_STRICT_ALIGNMENT,
-                                    "GPL", 0, bpf_vlog, sizeof(bpf_vlog), 2);
-       if (fd_prog < 0 && test->result != REJECT) {
-               printf("Failed to load program.\n");
-               printf("%s", bpf_vlog);
-               ret = 1;
-       } else if (fd_prog >= 0 && test->result == REJECT) {
-               printf("Unexpected success to load!\n");
-               printf("%s", bpf_vlog);
-               ret = 1;
-               close(fd_prog);
-       } else {
-               ret = 0;
-               /* We make a local copy so that we can strtok() it */
-               strncpy(bpf_vlog_copy, bpf_vlog, sizeof(bpf_vlog_copy));
-               line_ptr = strtok(bpf_vlog_copy, "\n");
-               for (i = 0; i < MAX_MATCHES; i++) {
-                       struct bpf_reg_match m = test->matches[i];
-
-                       if (!m.match)
-                               break;
-                       while (line_ptr) {
-                               cur_line = -1;
-                               sscanf(line_ptr, "%u: ", &cur_line);
-                               if (cur_line == m.line)
-                                       break;
-                               line_ptr = strtok(NULL, "\n");
-                       }
-                       if (!line_ptr) {
-                               printf("Failed to find line %u for match: %s\n",
-                                      m.line, m.match);
-                               ret = 1;
-                               printf("%s", bpf_vlog);
-                               break;
-                       }
-                       if (!strstr(line_ptr, m.match)) {
-                               printf("Failed to find match %u: %s\n",
-                                      m.line, m.match);
-                               ret = 1;
-                               printf("%s", bpf_vlog);
-                               break;
-                       }
-               }
-               if (fd_prog >= 0)
-                       close(fd_prog);
-       }
-       return ret;
-}
-
-static int do_test(unsigned int from, unsigned int to)
-{
-       int all_pass = 0;
-       int all_fail = 0;
-       unsigned int i;
-
-       for (i = from; i < to; i++) {
-               struct bpf_align_test *test = &tests[i];
-               int fail;
-
-               printf("Test %3d: %s ... ",
-                      i, test->descr);
-               fail = do_test_single(test);
-               if (fail) {
-                       all_fail++;
-                       printf("FAIL\n");
-               } else {
-                       all_pass++;
-                       printf("PASS\n");
-               }
-       }
-       printf("Results: %d pass %d fail\n",
-              all_pass, all_fail);
-       return all_fail ? EXIT_FAILURE : EXIT_SUCCESS;
-}
-
-int main(int argc, char **argv)
-{
-       unsigned int from = 0, to = ARRAY_SIZE(tests);
-
-       if (argc == 3) {
-               unsigned int l = atoi(argv[argc - 2]);
-               unsigned int u = atoi(argv[argc - 1]);
-
-               if (l < to && u < to) {
-                       from = l;
-                       to   = u + 1;
-               }
-       } else if (argc == 2) {
-               unsigned int t = atoi(argv[argc - 1]);
-
-               if (t < to) {
-                       from = t;
-                       to   = t + 1;
-               }
-       }
-       return do_test(from, to);
-}
index 779e11da979c88e644c6c8f03f4091f0b440e16d..c80643828b82b919b1621fc7cc8383c7129ec55d 100644 (file)
@@ -54,7 +54,7 @@ static void running_handler(int a);
 #define S1_PORT 10000
 #define S2_PORT 10001
 
-#define BPF_SOCKMAP_FILENAME "test_sockmap_kern.o"
+#define BPF_SOCKMAP_FILENAME  "test_sockmap_kern.o"
 #define BPF_SOCKHASH_FILENAME "test_sockhash_kern.o"
 #define CG_PATH "/sockmap"
 
@@ -68,9 +68,7 @@ struct bpf_map *maps[8];
 int prog_fd[11];
 
 int txmsg_pass;
-int txmsg_noisy;
 int txmsg_redir;
-int txmsg_redir_noisy;
 int txmsg_drop;
 int txmsg_apply;
 int txmsg_cork;
@@ -89,15 +87,13 @@ static const struct option long_options[] = {
        {"help",        no_argument,            NULL, 'h' },
        {"cgroup",      required_argument,      NULL, 'c' },
        {"rate",        required_argument,      NULL, 'r' },
-       {"verbose",     no_argument,            NULL, 'v' },
+       {"verbose",     optional_argument,      NULL, 'v' },
        {"iov_count",   required_argument,      NULL, 'i' },
        {"length",      required_argument,      NULL, 'l' },
        {"test",        required_argument,      NULL, 't' },
        {"data_test",   no_argument,            NULL, 'd' },
        {"txmsg",               no_argument,    &txmsg_pass,  1  },
-       {"txmsg_noisy",         no_argument,    &txmsg_noisy, 1  },
        {"txmsg_redir",         no_argument,    &txmsg_redir, 1  },
-       {"txmsg_redir_noisy",   no_argument,    &txmsg_redir_noisy, 1},
        {"txmsg_drop",          no_argument,    &txmsg_drop, 1 },
        {"txmsg_apply", required_argument,      NULL, 'a'},
        {"txmsg_cork",  required_argument,      NULL, 'k'},
@@ -111,9 +107,104 @@ static const struct option long_options[] = {
        {"txmsg_skb", no_argument,              &txmsg_skb, 1 },
        {"ktls", no_argument,                   &ktls, 1 },
        {"peek", no_argument,                   &peek_flag, 1 },
+       {"whitelist", required_argument,        NULL, 'n' },
+       {"blacklist", required_argument,        NULL, 'b' },
        {0, 0, NULL, 0 }
 };
 
+struct test_env {
+       const char *type;
+       const char *subtest;
+       const char *prepend;
+
+       int test_num;
+       int subtest_num;
+
+       int succ_cnt;
+       int fail_cnt;
+       int fail_last;
+};
+
+struct test_env env;
+
+struct sockmap_options {
+       int verbose;
+       bool base;
+       bool sendpage;
+       bool data_test;
+       bool drop_expected;
+       int iov_count;
+       int iov_length;
+       int rate;
+       char *map;
+       char *whitelist;
+       char *blacklist;
+       char *prepend;
+};
+
+struct _test {
+       char *title;
+       void (*tester)(int cg_fd, struct sockmap_options *opt);
+};
+
+static void test_start(void)
+{
+       env.subtest_num++;
+}
+
+static void test_fail(void)
+{
+       env.fail_cnt++;
+}
+
+static void test_pass(void)
+{
+       env.succ_cnt++;
+}
+
+static void test_reset(void)
+{
+       txmsg_start = txmsg_end = 0;
+       txmsg_start_pop = txmsg_pop = 0;
+       txmsg_start_push = txmsg_end_push = 0;
+       txmsg_pass = txmsg_drop = txmsg_redir = 0;
+       txmsg_apply = txmsg_cork = 0;
+       txmsg_ingress = txmsg_skb = 0;
+}
+
+static int test_start_subtest(const struct _test *t, struct sockmap_options *o)
+{
+       env.type = o->map;
+       env.subtest = t->title;
+       env.prepend = o->prepend;
+       env.test_num++;
+       env.subtest_num = 0;
+       env.fail_last = env.fail_cnt;
+       test_reset();
+       return 0;
+}
+
+static void test_end_subtest(void)
+{
+       int error = env.fail_cnt - env.fail_last;
+       int type = strcmp(env.type, BPF_SOCKMAP_FILENAME);
+
+       if (!error)
+               test_pass();
+
+       fprintf(stdout, "#%2d/%2d %8s:%s:%s:%s\n",
+               env.test_num, env.subtest_num,
+               !type ? "sockmap" : "sockhash",
+               env.prepend ? : "",
+               env.subtest, error ? "FAIL" : "OK");
+}
+
+static void test_print_results(void)
+{
+       fprintf(stdout, "Pass: %d Fail: %d\n",
+               env.succ_cnt, env.fail_cnt);
+}
+
 static void usage(char *argv[])
 {
        int i;
@@ -296,7 +387,7 @@ static int sockmap_init_sockets(int verbose)
                return errno;
        }
 
-       if (verbose) {
+       if (verbose > 1) {
                printf("connected sockets: c1 <-> p1, c2 <-> p2\n");
                printf("cgroups binding: c1(%i) <-> s1(%i) - - - c2(%i) <-> s2(%i)\n",
                        c1, s1, c2, s2);
@@ -311,17 +402,6 @@ struct msg_stats {
        struct timespec end;
 };
 
-struct sockmap_options {
-       int verbose;
-       bool base;
-       bool sendpage;
-       bool data_test;
-       bool drop_expected;
-       int iov_count;
-       int iov_length;
-       int rate;
-};
-
 static int msg_loop_sendpage(int fd, int iov_length, int cnt,
                             struct msg_stats *s,
                             struct sockmap_options *opt)
@@ -345,14 +425,18 @@ static int msg_loop_sendpage(int fd, int iov_length, int cnt,
 
        clock_gettime(CLOCK_MONOTONIC, &s->start);
        for (i = 0; i < cnt; i++) {
-               int sent = sendfile(fd, fp, NULL, iov_length);
+               int sent;
+
+               errno = 0;
+               sent = sendfile(fd, fp, NULL, iov_length);
 
                if (!drop && sent < 0) {
-                       perror("send loop error");
+                       perror("sendpage loop error");
                        fclose(file);
                        return sent;
                } else if (drop && sent >= 0) {
-                       printf("sendpage loop error expected: %i\n", sent);
+                       printf("sendpage loop error expected: %i errno %i\n",
+                              sent, errno);
                        fclose(file);
                        return -EIO;
                }
@@ -464,13 +548,18 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt,
        if (tx) {
                clock_gettime(CLOCK_MONOTONIC, &s->start);
                for (i = 0; i < cnt; i++) {
-                       int sent = sendmsg(fd, &msg, flags);
+                       int sent;
+
+                       errno = 0;
+                       sent = sendmsg(fd, &msg, flags);
 
                        if (!drop && sent < 0) {
-                               perror("send loop error");
+                               perror("sendmsg loop error");
                                goto out_errno;
                        } else if (drop && sent >= 0) {
-                               printf("send loop error expected: %i\n", sent);
+                               fprintf(stderr,
+                                       "sendmsg loop error expected: %i errno %i\n",
+                                       sent, errno);
                                errno = -EIO;
                                goto out_errno;
                        }
@@ -497,9 +586,10 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt,
                 * paths.
                 */
                total_bytes = (float)iov_count * (float)iov_length * (float)cnt;
-               txmsg_pop_total = txmsg_pop;
                if (txmsg_apply)
-                       txmsg_pop_total *= (total_bytes / txmsg_apply);
+                       txmsg_pop_total = txmsg_pop * (total_bytes / txmsg_apply);
+               else
+                       txmsg_pop_total = txmsg_pop * cnt;
                total_bytes -= txmsg_pop_total;
                err = clock_gettime(CLOCK_MONOTONIC, &s->start);
                if (err < 0)
@@ -633,14 +723,18 @@ static int sendmsg_test(struct sockmap_options *opt)
 
        rxpid = fork();
        if (rxpid == 0) {
+               iov_buf -= (txmsg_pop - txmsg_start_pop + 1);
                if (opt->drop_expected)
-                       exit(0);
+                       _exit(0);
+
+               if (!iov_buf) /* zero bytes sent case */
+                       _exit(0);
 
                if (opt->sendpage)
                        iov_count = 1;
                err = msg_loop(rx_fd, iov_count, iov_buf,
                               cnt, &s, false, opt);
-               if (opt->verbose)
+               if (opt->verbose > 1)
                        fprintf(stderr,
                                "msg_loop_rx: iov_count %i iov_buf %i cnt %i err %i\n",
                                iov_count, iov_buf, cnt, err);
@@ -648,7 +742,7 @@ static int sendmsg_test(struct sockmap_options *opt)
                        sent_Bps = sentBps(s);
                        recvd_Bps = recvdBps(s);
                }
-               if (opt->verbose)
+               if (opt->verbose > 1)
                        fprintf(stdout,
                                "rx_sendmsg: TX: %zuB %fB/s %fGB/s RX: %zuB %fB/s %fGB/s %s\n",
                                s.bytes_sent, sent_Bps, sent_Bps/giga,
@@ -678,7 +772,7 @@ static int sendmsg_test(struct sockmap_options *opt)
                        sent_Bps = sentBps(s);
                        recvd_Bps = recvdBps(s);
                }
-               if (opt->verbose)
+               if (opt->verbose > 1)
                        fprintf(stdout,
                                "tx_sendmsg: TX: %zuB %fB/s %f GB/s RX: %zuB %fB/s %fGB/s\n",
                                s.bytes_sent, sent_Bps, sent_Bps/giga,
@@ -694,14 +788,14 @@ static int sendmsg_test(struct sockmap_options *opt)
        if (WIFEXITED(rx_status)) {
                err = WEXITSTATUS(rx_status);
                if (err) {
-                       fprintf(stderr, "rx thread exited with err %d. ", err);
+                       fprintf(stderr, "rx thread exited with err %d.\n", err);
                        goto out;
                }
        }
        if (WIFEXITED(tx_status)) {
                err = WEXITSTATUS(tx_status);
                if (err)
-                       fprintf(stderr, "tx thread exited with err %d. ", err);
+                       fprintf(stderr, "tx thread exited with err %d.\n", err);
        }
 out:
        return err;
@@ -783,6 +877,7 @@ static int forever_ping_pong(int rate, struct sockmap_options *opt)
 }
 
 enum {
+       SELFTESTS,
        PING_PONG,
        SENDMSG,
        BASE,
@@ -834,19 +929,14 @@ run:
        /* Attach txmsg program to sockmap */
        if (txmsg_pass)
                tx_prog_fd = prog_fd[3];
-       else if (txmsg_noisy)
-               tx_prog_fd = prog_fd[4];
        else if (txmsg_redir)
+               tx_prog_fd = prog_fd[4];
+       else if (txmsg_apply)
                tx_prog_fd = prog_fd[5];
-       else if (txmsg_redir_noisy)
+       else if (txmsg_cork)
                tx_prog_fd = prog_fd[6];
        else if (txmsg_drop)
-               tx_prog_fd = prog_fd[9];
-       /* apply and cork must be last */
-       else if (txmsg_apply)
                tx_prog_fd = prog_fd[7];
-       else if (txmsg_cork)
-               tx_prog_fd = prog_fd[8];
        else
                tx_prog_fd = 0;
 
@@ -870,7 +960,7 @@ run:
                        goto out;
                }
 
-               if (txmsg_redir || txmsg_redir_noisy)
+               if (txmsg_redir)
                        redir_fd = c2;
                else
                        redir_fd = c1;
@@ -1112,12 +1202,8 @@ static void test_options(char *options)
 
        if (txmsg_pass)
                strncat(options, "pass,", OPTSTRING);
-       if (txmsg_noisy)
-               strncat(options, "pass_noisy,", OPTSTRING);
        if (txmsg_redir)
                strncat(options, "redir,", OPTSTRING);
-       if (txmsg_redir_noisy)
-               strncat(options, "redir_noisy,", OPTSTRING);
        if (txmsg_drop)
                strncat(options, "drop,", OPTSTRING);
        if (txmsg_apply) {
@@ -1168,416 +1254,283 @@ static int __test_exec(int cgrp, int test, struct sockmap_options *opt)
 
        test_options(options);
 
-       fprintf(stdout,
-               "[TEST %i]: (%i, %i, %i, %s, %s): ",
-               test_cnt, opt->rate, opt->iov_count, opt->iov_length,
-               test_to_str(test), options);
-       fflush(stdout);
+       if (opt->verbose) {
+               fprintf(stdout,
+                       " [TEST %i]: (%i, %i, %i, %s, %s): ",
+                       test_cnt, opt->rate, opt->iov_count, opt->iov_length,
+                       test_to_str(test), options);
+               fflush(stdout);
+       }
        err = run_options(opt, cgrp, test);
-       fprintf(stdout, "%s\n", !err ? "PASS" : "FAILED");
+       if (opt->verbose)
+               fprintf(stdout, " %s\n", !err ? "PASS" : "FAILED");
        test_cnt++;
        !err ? passed++ : failed++;
        free(options);
        return err;
 }
 
-static int test_exec(int cgrp, struct sockmap_options *opt)
-{
-       int err = __test_exec(cgrp, SENDMSG, opt);
-
-       if (err)
-               goto out;
-
-       err = __test_exec(cgrp, SENDPAGE, opt);
-out:
-       return err;
-}
-
-static int test_loop(int cgrp)
-{
-       struct sockmap_options opt;
-
-       int err, i, l, r;
-
-       opt.verbose = 0;
-       opt.base = false;
-       opt.sendpage = false;
-       opt.data_test = false;
-       opt.drop_expected = false;
-       opt.iov_count = 0;
-       opt.iov_length = 0;
-       opt.rate = 0;
-
-       r = 1;
-       for (i = 1; i < 100; i += 33) {
-               for (l = 1; l < 100; l += 33) {
-                       opt.rate = r;
-                       opt.iov_count = i;
-                       opt.iov_length = l;
-                       err = test_exec(cgrp, &opt);
-                       if (err)
-                               goto out;
-               }
-       }
-       sched_yield();
-out:
-       return err;
-}
-
-static int test_txmsg(int cgrp)
+static void test_exec(int cgrp, struct sockmap_options *opt)
 {
+       int type = strcmp(opt->map, BPF_SOCKMAP_FILENAME);
        int err;
 
-       txmsg_pass = txmsg_noisy = txmsg_redir_noisy = txmsg_drop = 0;
-       txmsg_apply = txmsg_cork = 0;
-       txmsg_ingress = txmsg_skb = 0;
-
-       txmsg_pass = 1;
-       err = test_loop(cgrp);
-       txmsg_pass = 0;
-       if (err)
-               goto out;
-
-       txmsg_redir = 1;
-       err = test_loop(cgrp);
-       txmsg_redir = 0;
-       if (err)
-               goto out;
-
-       txmsg_drop = 1;
-       err = test_loop(cgrp);
-       txmsg_drop = 0;
-       if (err)
-               goto out;
-
-       txmsg_redir = 1;
-       txmsg_ingress = 1;
-       err = test_loop(cgrp);
-       txmsg_redir = 0;
-       txmsg_ingress = 0;
-       if (err)
-               goto out;
-out:
-       txmsg_pass = 0;
-       txmsg_redir = 0;
-       txmsg_drop = 0;
-       return err;
+       if (type == 0) {
+               test_start();
+               err = __test_exec(cgrp, SENDMSG, opt);
+               if (err)
+                       test_fail();
+       } else {
+               test_start();
+               err = __test_exec(cgrp, SENDPAGE, opt);
+               if (err)
+                       test_fail();
+       }
 }
 
-static int test_send(struct sockmap_options *opt, int cgrp)
+static void test_send_one(struct sockmap_options *opt, int cgrp)
 {
-       int err;
-
        opt->iov_length = 1;
        opt->iov_count = 1;
        opt->rate = 1;
-       err = test_exec(cgrp, opt);
-       if (err)
-               goto out;
+       test_exec(cgrp, opt);
 
        opt->iov_length = 1;
        opt->iov_count = 1024;
        opt->rate = 1;
-       err = test_exec(cgrp, opt);
-       if (err)
-               goto out;
+       test_exec(cgrp, opt);
 
        opt->iov_length = 1024;
        opt->iov_count = 1;
        opt->rate = 1;
-       err = test_exec(cgrp, opt);
-       if (err)
-               goto out;
+       test_exec(cgrp, opt);
 
-       opt->iov_length = 1;
+}
+
+static void test_send_many(struct sockmap_options *opt, int cgrp)
+{
+       opt->iov_length = 3;
        opt->iov_count = 1;
        opt->rate = 512;
-       err = test_exec(cgrp, opt);
-       if (err)
-               goto out;
-
-       opt->iov_length = 256;
-       opt->iov_count = 1024;
-       opt->rate = 2;
-       err = test_exec(cgrp, opt);
-       if (err)
-               goto out;
+       test_exec(cgrp, opt);
 
        opt->rate = 100;
        opt->iov_count = 1;
        opt->iov_length = 5;
-       err = test_exec(cgrp, opt);
-       if (err)
-               goto out;
-out:
-       sched_yield();
-       return err;
+       test_exec(cgrp, opt);
 }
 
-static int test_mixed(int cgrp)
+static void test_send_large(struct sockmap_options *opt, int cgrp)
 {
-       struct sockmap_options opt = {0};
-       int err;
+       opt->iov_length = 256;
+       opt->iov_count = 1024;
+       opt->rate = 2;
+       test_exec(cgrp, opt);
+}
 
-       txmsg_pass = txmsg_noisy = txmsg_redir_noisy = txmsg_drop = 0;
-       txmsg_apply = txmsg_cork = 0;
-       txmsg_start = txmsg_end = 0;
-       txmsg_start_push = txmsg_end_push = 0;
-       txmsg_start_pop = txmsg_pop = 0;
+static void test_send(struct sockmap_options *opt, int cgrp)
+{
+       test_send_one(opt, cgrp);
+       test_send_many(opt, cgrp);
+       test_send_large(opt, cgrp);
+       sched_yield();
+}
 
+static void test_txmsg_pass(int cgrp, struct sockmap_options *opt)
+{
        /* Test small and large iov_count values with pass/redir/apply/cork */
        txmsg_pass = 1;
-       txmsg_redir = 0;
-       txmsg_apply = 1;
-       txmsg_cork = 0;
-       err = test_send(&opt, cgrp);
-       if (err)
-               goto out;
+       test_send(opt, cgrp);
+}
 
-       txmsg_pass = 1;
-       txmsg_redir = 0;
-       txmsg_apply = 0;
-       txmsg_cork = 1;
-       err = test_send(&opt, cgrp);
-       if (err)
-               goto out;
+static void test_txmsg_redir(int cgrp, struct sockmap_options *opt)
+{
+       txmsg_redir = 1;
+       test_send(opt, cgrp);
+}
 
-       txmsg_pass = 1;
-       txmsg_redir = 0;
-       txmsg_apply = 1;
-       txmsg_cork = 1;
-       err = test_send(&opt, cgrp);
-       if (err)
-               goto out;
+static void test_txmsg_drop(int cgrp, struct sockmap_options *opt)
+{
+       txmsg_drop = 1;
+       test_send(opt, cgrp);
+}
 
-       txmsg_pass = 1;
-       txmsg_redir = 0;
-       txmsg_apply = 1024;
-       txmsg_cork = 0;
-       err = test_send(&opt, cgrp);
-       if (err)
-               goto out;
+static void test_txmsg_ingress_redir(int cgrp, struct sockmap_options *opt)
+{
+       txmsg_pass = txmsg_drop = 0;
+       txmsg_ingress = txmsg_redir = 1;
+       test_send(opt, cgrp);
+}
 
+/* Test cork with hung data. This tests poor usage patterns where
+ * cork can leave data on the ring if the user program is buggy and
+ * doesn't flush it somehow. These tests take some time because
+ * they wait for a timeout. Test pass, redir and cork with apply
+ * logic. Use a cork size of 4097 with send_large to avoid aligning
+ * the cork size with the send size.
+ */
+static void test_txmsg_cork_hangs(int cgrp, struct sockmap_options *opt)
+{
        txmsg_pass = 1;
        txmsg_redir = 0;
+       txmsg_cork = 4097;
+       txmsg_apply = 4097;
+       test_send_large(opt, cgrp);
+
+       txmsg_pass = 0;
+       txmsg_redir = 1;
        txmsg_apply = 0;
-       txmsg_cork = 1024;
-       err = test_send(&opt, cgrp);
-       if (err)
-               goto out;
+       txmsg_cork = 4097;
+       test_send_large(opt, cgrp);
 
-       txmsg_pass = 1;
+       txmsg_pass = 0;
+       txmsg_redir = 1;
+       txmsg_apply = 4097;
+       txmsg_cork = 4097;
+       test_send_large(opt, cgrp);
+}
+
+static void test_txmsg_pull(int cgrp, struct sockmap_options *opt)
+{
+       /* Test basic start/end */
+       txmsg_start = 1;
+       txmsg_end = 2;
+       test_send(opt, cgrp);
+
+       /* Test >4k pull */
+       txmsg_start = 4096;
+       txmsg_end = 9182;
+       test_send_large(opt, cgrp);
+
+       /* Test pull + redirect */
        txmsg_redir = 0;
-       txmsg_apply = 1024;
-       txmsg_cork = 1024;
-       err = test_send(&opt, cgrp);
-       if (err)
-               goto out;
+       txmsg_start = 1;
+       txmsg_end = 2;
+       test_send(opt, cgrp);
 
-       txmsg_pass = 1;
+       /* Test pull + cork */
        txmsg_redir = 0;
-       txmsg_cork = 4096;
-       txmsg_apply = 4096;
-       err = test_send(&opt, cgrp);
-       if (err)
-               goto out;
+       txmsg_cork = 512;
+       txmsg_start = 1;
+       txmsg_end = 2;
+       test_send_many(opt, cgrp);
 
-       txmsg_pass = 0;
+       /* Test pull + cork + redirect */
        txmsg_redir = 1;
-       txmsg_apply = 1;
-       txmsg_cork = 0;
-       err = test_send(&opt, cgrp);
-       if (err)
-               goto out;
+       txmsg_cork = 512;
+       txmsg_start = 1;
+       txmsg_end = 2;
+       test_send_many(opt, cgrp);
+}
 
-       txmsg_pass = 0;
-       txmsg_redir = 1;
-       txmsg_apply = 0;
-       txmsg_cork = 1;
-       err = test_send(&opt, cgrp);
-       if (err)
-               goto out;
+static void test_txmsg_pop(int cgrp, struct sockmap_options *opt)
+{
+       /* Test basic pop */
+       txmsg_start_pop = 1;
+       txmsg_pop = 2;
+       test_send_many(opt, cgrp);
 
-       txmsg_pass = 0;
-       txmsg_redir = 1;
-       txmsg_apply = 1024;
-       txmsg_cork = 0;
-       err = test_send(&opt, cgrp);
-       if (err)
-               goto out;
+       /* Test pop with >4k */
+       txmsg_start_pop = 4096;
+       txmsg_pop = 4096;
+       test_send_large(opt, cgrp);
 
-       txmsg_pass = 0;
+       /* Test pop + redirect */
        txmsg_redir = 1;
-       txmsg_apply = 0;
-       txmsg_cork = 1024;
-       err = test_send(&opt, cgrp);
-       if (err)
-               goto out;
+       txmsg_start_pop = 1;
+       txmsg_pop = 2;
+       test_send_many(opt, cgrp);
 
-       txmsg_pass = 0;
-       txmsg_redir = 1;
-       txmsg_apply = 1024;
-       txmsg_cork = 1024;
-       err = test_send(&opt, cgrp);
-       if (err)
-               goto out;
+       /* Test pop + cork */
+       txmsg_redir = 0;
+       txmsg_cork = 512;
+       txmsg_start_pop = 1;
+       txmsg_pop = 2;
+       test_send_many(opt, cgrp);
 
-       txmsg_pass = 0;
+       /* Test pop + redirect + cork */
        txmsg_redir = 1;
-       txmsg_cork = 4096;
-       txmsg_apply = 4096;
-       err = test_send(&opt, cgrp);
-       if (err)
-               goto out;
-out:
-       return err;
+       txmsg_cork = 4;
+       txmsg_start_pop = 1;
+       txmsg_pop = 2;
+       test_send_many(opt, cgrp);
 }
 
-static int test_start_end(int cgrp)
+static void test_txmsg_push(int cgrp, struct sockmap_options *opt)
 {
-       struct sockmap_options opt = {0};
-       int err, i;
+       /* Test basic push */
+       txmsg_start_push = 1;
+       txmsg_end_push = 1;
+       test_send(opt, cgrp);
 
-       /* Test basic start/end with lots of iov_count and iov_lengths */
-       txmsg_start = 1;
-       txmsg_end = 2;
+       /* Test a 4kB push at a >4k offset */
+       txmsg_start_push = 4096;
+       txmsg_end_push = 4096;
+       test_send_large(opt, cgrp);
+
+       /* Test push + redirect */
+       txmsg_redir = 1;
        txmsg_start_push = 1;
        txmsg_end_push = 2;
-       txmsg_start_pop = 1;
-       txmsg_pop = 1;
-       err = test_txmsg(cgrp);
-       if (err)
-               goto out;
+       test_send_many(opt, cgrp);
 
-       /* Cut a byte of pushed data but leave remaining in place */
-       txmsg_start = 1;
-       txmsg_end = 2;
+       /* Test push + cork */
+       txmsg_redir = 0;
+       txmsg_cork = 512;
        txmsg_start_push = 1;
-       txmsg_end_push = 3;
-       txmsg_start_pop = 1;
-       txmsg_pop = 1;
-       err = test_txmsg(cgrp);
-       if (err)
-               goto out;
-
-       /* Test start/end with cork */
-       opt.rate = 16;
-       opt.iov_count = 1;
-       opt.iov_length = 100;
-       txmsg_cork = 1600;
-
-       txmsg_start_pop = 0;
-       txmsg_pop = 0;
-
-       for (i = 99; i <= 1600; i += 500) {
-               txmsg_start = 0;
-               txmsg_end = i;
-               txmsg_start_push = 0;
-               txmsg_end_push = i;
-               err = test_exec(cgrp, &opt);
-               if (err)
-                       goto out;
-       }
-
-       /* Test pop data in middle of cork */
-       for (i = 99; i <= 1600; i += 500) {
-               txmsg_start_pop = 10;
-               txmsg_pop = i;
-               err = test_exec(cgrp, &opt);
-               if (err)
-                       goto out;
-       }
-       txmsg_start_pop = 0;
-       txmsg_pop = 0;
-
-       /* Test start/end with cork but pull data in middle */
-       for (i = 199; i <= 1600; i += 500) {
-               txmsg_start = 100;
-               txmsg_end = i;
-               txmsg_start_push = 100;
-               txmsg_end_push = i;
-               err = test_exec(cgrp, &opt);
-               if (err)
-                       goto out;
-       }
-
-       /* Test start/end with cork pulling last sg entry */
-       txmsg_start = 1500;
-       txmsg_end = 1600;
-       txmsg_start_push = 1500;
-       txmsg_end_push = 1600;
-       err = test_exec(cgrp, &opt);
-       if (err)
-               goto out;
+       txmsg_end_push = 2;
+       test_send_many(opt, cgrp);
+}
 
-       /* Test pop with cork pulling last sg entry */
-       txmsg_start_pop = 1500;
-       txmsg_pop = 1600;
-       err = test_exec(cgrp, &opt);
-       if (err)
-               goto out;
-       txmsg_start_pop = 0;
-       txmsg_pop = 0;
-
-       /* Test start/end pull of single byte in last page */
-       txmsg_start = 1111;
-       txmsg_end = 1112;
-       txmsg_start_push = 1111;
-       txmsg_end_push = 1112;
-       err = test_exec(cgrp, &opt);
-       if (err)
-               goto out;
+static void test_txmsg_push_pop(int cgrp, struct sockmap_options *opt)
+{
+       txmsg_start_push = 1;
+       txmsg_end_push = 10;
+       txmsg_start_pop = 5;
+       txmsg_pop = 4;
+       test_send_large(opt, cgrp);
+}
 
-       /* Test pop of single byte in last page */
-       txmsg_start_pop = 1111;
-       txmsg_pop = 1112;
-       err = test_exec(cgrp, &opt);
-       if (err)
-               goto out;
+static void test_txmsg_apply(int cgrp, struct sockmap_options *opt)
+{
+       txmsg_pass = 1;
+       txmsg_redir = 0;
+       txmsg_apply = 1;
+       txmsg_cork = 0;
+       test_send_one(opt, cgrp);
 
-       /* Test start/end with end < start */
-       txmsg_start = 1111;
-       txmsg_end = 0;
-       txmsg_start_push = 1111;
-       txmsg_end_push = 0;
-       err = test_exec(cgrp, &opt);
-       if (err)
-               goto out;
+       txmsg_pass = 0;
+       txmsg_redir = 1;
+       txmsg_apply = 1;
+       txmsg_cork = 0;
+       test_send_one(opt, cgrp);
 
-       /* Test start/end with end > data */
-       txmsg_start = 0;
-       txmsg_end = 1601;
-       txmsg_start_push = 0;
-       txmsg_end_push = 1601;
-       err = test_exec(cgrp, &opt);
-       if (err)
-               goto out;
+       txmsg_pass = 1;
+       txmsg_redir = 0;
+       txmsg_apply = 1024;
+       txmsg_cork = 0;
+       test_send_large(opt, cgrp);
 
-       /* Test start/end with start > data */
-       txmsg_start = 1601;
-       txmsg_end = 1600;
-       txmsg_start_push = 1601;
-       txmsg_end_push = 1600;
-       err = test_exec(cgrp, &opt);
-       if (err)
-               goto out;
+       txmsg_pass = 0;
+       txmsg_redir = 1;
+       txmsg_apply = 1024;
+       txmsg_cork = 0;
+       test_send_large(opt, cgrp);
+}
 
-       /* Test pop with start > data */
-       txmsg_start_pop = 1601;
-       txmsg_pop = 1;
-       err = test_exec(cgrp, &opt);
-       if (err)
-               goto out;
+static void test_txmsg_cork(int cgrp, struct sockmap_options *opt)
+{
+       txmsg_pass = 1;
+       txmsg_redir = 0;
+       txmsg_apply = 0;
+       txmsg_cork = 1;
+       test_send(opt, cgrp);
 
-       /* Test pop with pop range > data */
-       txmsg_start_pop = 1599;
-       txmsg_pop = 10;
-       err = test_exec(cgrp, &opt);
-out:
-       txmsg_start = 0;
-       txmsg_end = 0;
-       sched_yield();
-       return err;
+       txmsg_pass = 1;
+       txmsg_redir = 0;
+       txmsg_apply = 1;
+       txmsg_cork = 1;
+       test_send(opt, cgrp);
 }
 
 char *map_names[] = {
@@ -1662,73 +1615,116 @@ static int populate_progs(char *bpf_file)
        return 0;
 }
 
-static int __test_suite(int cg_fd, char *bpf_file)
+struct _test test[] = {
+       {"txmsg test passthrough", test_txmsg_pass},
+       {"txmsg test redirect", test_txmsg_redir},
+       {"txmsg test drop", test_txmsg_drop},
+       {"txmsg test ingress redirect", test_txmsg_ingress_redir},
+       {"txmsg test apply", test_txmsg_apply},
+       {"txmsg test cork", test_txmsg_cork},
+       {"txmsg test hanging corks", test_txmsg_cork_hangs},
+       {"txmsg test push_data", test_txmsg_push},
+       {"txmsg test pull-data", test_txmsg_pull},
+       {"txmsg test pop-data", test_txmsg_pop},
+       {"txmsg test push/pop data", test_txmsg_push_pop},
+};
+
+static int check_whitelist(struct _test *t, struct sockmap_options *opt)
 {
-       int err, cleanup = cg_fd;
+       char *entry, *ptr;
+
+       if (!opt->whitelist)
+               return 0;
+       ptr = strdup(opt->whitelist);
+       if (!ptr)
+               return -ENOMEM;
+       entry = strtok(ptr, ",");
+       while (entry) {
+               if ((opt->prepend && strstr(opt->prepend, entry) != 0) ||
+                   strstr(opt->map, entry) != 0 ||
+                   strstr(t->title, entry) != 0)
+                       return 0;
+               entry = strtok(NULL, ",");
+       }
+       return -EINVAL;
+}
 
-       err = populate_progs(bpf_file);
+static int check_blacklist(struct _test *t, struct sockmap_options *opt)
+{
+       char *entry, *ptr;
+
+       if (!opt->blacklist)
+               return -EINVAL;
+       ptr = strdup(opt->blacklist);
+       if (!ptr)
+               return -ENOMEM;
+       entry = strtok(ptr, ",");
+       while (entry) {
+               if ((opt->prepend && strstr(opt->prepend, entry) != 0) ||
+                   strstr(opt->map, entry) != 0 ||
+                   strstr(t->title, entry) != 0)
+                       return 0;
+               entry = strtok(NULL, ",");
+       }
+       return -EINVAL;
+}
+
+static int __test_selftests(int cg_fd, struct sockmap_options *opt)
+{
+       int i, err;
+
+       err = populate_progs(opt->map);
        if (err < 0) {
                fprintf(stderr, "ERROR: (%i) load bpf failed\n", err);
                return err;
        }
 
-       if (cg_fd < 0) {
-               if (setup_cgroup_environment()) {
-                       fprintf(stderr, "ERROR: cgroup env failed\n");
-                       return -EINVAL;
-               }
+       /* Tests basic commands and APIs */
+       for (i = 0; i < sizeof(test)/sizeof(struct _test); i++) {
+               struct _test t = test[i];
 
-               cg_fd = create_and_get_cgroup(CG_PATH);
-               if (cg_fd < 0) {
-                       fprintf(stderr,
-                               "ERROR: (%i) open cg path failed: %s\n",
-                               cg_fd, optarg);
-                       return cg_fd;
-               }
+               if (check_whitelist(&t, opt) != 0)
+                       continue;
+               if (check_blacklist(&t, opt) == 0)
+                       continue;
 
-               if (join_cgroup(CG_PATH)) {
-                       fprintf(stderr, "ERROR: failed to join cgroup\n");
-                       return -EINVAL;
-               }
+               test_start_subtest(&t, opt);
+               t.tester(cg_fd, opt);
+               test_end_subtest();
        }
 
-       /* Tests basic commands and APIs with range of iov values */
-       txmsg_start = txmsg_end = txmsg_start_push = txmsg_end_push = 0;
-       err = test_txmsg(cg_fd);
-       if (err)
-               goto out;
+       return err;
+}
 
-       /* Tests interesting combinations of APIs used together */
-       err = test_mixed(cg_fd);
-       if (err)
-               goto out;
+static void test_selftests_sockmap(int cg_fd, struct sockmap_options *opt)
+{
+       opt->map = BPF_SOCKMAP_FILENAME;
+       __test_selftests(cg_fd, opt);
+}
 
-       /* Tests pull_data API using start/end API */
-       err = test_start_end(cg_fd);
-       if (err)
-               goto out;
+static void test_selftests_sockhash(int cg_fd, struct sockmap_options *opt)
+{
+       opt->map = BPF_SOCKHASH_FILENAME;
+       __test_selftests(cg_fd, opt);
+}
 
-out:
-       printf("Summary: %i PASSED %i FAILED\n", passed, failed);
-       if (cleanup < 0) {
-               cleanup_cgroup_environment();
-               close(cg_fd);
-       }
-       return err;
+static void test_selftests_ktls(int cg_fd, struct sockmap_options *opt)
+{
+       opt->map = BPF_SOCKHASH_FILENAME;
+       opt->prepend = "ktls";
+       ktls = 1;
+       __test_selftests(cg_fd, opt);
+       ktls = 0;
 }
 
-static int test_suite(int cg_fd)
+static int test_selftest(int cg_fd, struct sockmap_options *opt)
 {
-       int err;
 
-       err = __test_suite(cg_fd, BPF_SOCKMAP_FILENAME);
-       if (err)
-               goto out;
-       err = __test_suite(cg_fd, BPF_SOCKHASH_FILENAME);
-out:
-       if (cg_fd > -1)
-               close(cg_fd);
-       return err;
+       test_selftests_sockmap(cg_fd, opt);
+       test_selftests_sockhash(cg_fd, opt);
+       test_selftests_ktls(cg_fd, opt);
+       test_print_results();
+       return 0;
 }
 
 int main(int argc, char **argv)
@@ -1737,12 +1733,10 @@ int main(int argc, char **argv)
        struct sockmap_options options = {0};
        int opt, longindex, err, cg_fd = 0;
        char *bpf_file = BPF_SOCKMAP_FILENAME;
-       int test = PING_PONG;
+       int test = SELFTESTS;
+       bool cg_created = 0;
 
-       if (argc < 2)
-               return test_suite(-1);
-
-       while ((opt = getopt_long(argc, argv, ":dhvc:r:i:l:t:p:q:",
+       while ((opt = getopt_long(argc, argv, ":dhv:c:r:i:l:t:p:q:n:b:",
                                  long_options, &longindex)) != -1) {
                switch (opt) {
                case 's':
@@ -1783,6 +1777,8 @@ int main(int argc, char **argv)
                        break;
                case 'v':
                        options.verbose = 1;
+                       if (optarg)
+                               options.verbose = atoi(optarg);
                        break;
                case 'i':
                        iov_count = atoi(optarg);
@@ -1809,6 +1805,15 @@ int main(int argc, char **argv)
                                return -1;
                        }
                        break;
+               case 'n':
+                       options.whitelist = strdup(optarg);
+                       if (!options.whitelist)
+                               return -ENOMEM;
+                       break;
+               case 'b':
+                       options.blacklist = strdup(optarg);
+                       if (!options.blacklist)
+                               return -ENOMEM;
                case 0:
                        break;
                case 'h':
@@ -1818,13 +1823,30 @@ int main(int argc, char **argv)
                }
        }
 
-       if (argc <= 3 && cg_fd)
-               return test_suite(cg_fd);
-
        if (!cg_fd) {
-               fprintf(stderr, "%s requires cgroup option: --cgroup <path>\n",
-                       argv[0]);
-               return -1;
+               if (setup_cgroup_environment()) {
+                       fprintf(stderr, "ERROR: cgroup env failed\n");
+                       return -EINVAL;
+               }
+
+               cg_fd = create_and_get_cgroup(CG_PATH);
+               if (cg_fd < 0) {
+                       fprintf(stderr,
+                               "ERROR: (%i) open cg path failed: %s\n",
+                               cg_fd, strerror(errno));
+                       return cg_fd;
+               }
+
+               if (join_cgroup(CG_PATH)) {
+                       fprintf(stderr, "ERROR: failed to join cgroup\n");
+                       return -EINVAL;
+               }
+               cg_created = 1;
+       }
+
+       if (test == SELFTESTS) {
+               err = test_selftest(cg_fd, &options);
+               goto out;
        }
 
        err = populate_progs(bpf_file);
@@ -1843,6 +1865,13 @@ int main(int argc, char **argv)
        options.rate = rate;
 
        err = run_options(&options, cg_fd, test);
+out:
+       if (options.whitelist)
+               free(options.whitelist);
+       if (options.blacklist)
+               free(options.blacklist);
+       if (cg_created)
+               cleanup_cgroup_environment();
        close(cg_fd);
        return err;
 }
diff --git a/tools/testing/selftests/bpf/test_sockmap_kern.h b/tools/testing/selftests/bpf/test_sockmap_kern.h
deleted file mode 100644 (file)
index 9b4d3a6..0000000
+++ /dev/null
@@ -1,451 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (c) 2017-2018 Covalent IO, Inc. http://covalent.io */
-#include <stddef.h>
-#include <string.h>
-#include <linux/bpf.h>
-#include <linux/if_ether.h>
-#include <linux/if_packet.h>
-#include <linux/ip.h>
-#include <linux/ipv6.h>
-#include <linux/in.h>
-#include <linux/udp.h>
-#include <linux/tcp.h>
-#include <linux/pkt_cls.h>
-#include <sys/socket.h>
-#include <bpf/bpf_helpers.h>
-#include <bpf/bpf_endian.h>
-
-/* Sockmap sample program connects a client and a backend together
- * using cgroups.
- *
- *    client:X <---> frontend:80 client:X <---> backend:80
- *
- * For simplicity we hard code values here and bind 1:1. The hard
- * coded values are part of the setup in sockmap.sh script that
- * is associated with this BPF program.
- *
- * The bpf_printk is verbose and prints information as connections
- * are established and verdicts are decided.
- */
-
-struct {
-       __uint(type, TEST_MAP_TYPE);
-       __uint(max_entries, 20);
-       __uint(key_size, sizeof(int));
-       __uint(value_size, sizeof(int));
-} sock_map SEC(".maps");
-
-struct {
-       __uint(type, TEST_MAP_TYPE);
-       __uint(max_entries, 20);
-       __uint(key_size, sizeof(int));
-       __uint(value_size, sizeof(int));
-} sock_map_txmsg SEC(".maps");
-
-struct {
-       __uint(type, TEST_MAP_TYPE);
-       __uint(max_entries, 20);
-       __uint(key_size, sizeof(int));
-       __uint(value_size, sizeof(int));
-} sock_map_redir SEC(".maps");
-
-struct {
-       __uint(type, BPF_MAP_TYPE_ARRAY);
-       __uint(max_entries, 1);
-       __type(key, int);
-       __type(value, int);
-} sock_apply_bytes SEC(".maps");
-
-struct {
-       __uint(type, BPF_MAP_TYPE_ARRAY);
-       __uint(max_entries, 1);
-       __type(key, int);
-       __type(value, int);
-} sock_cork_bytes SEC(".maps");
-
-struct {
-       __uint(type, BPF_MAP_TYPE_ARRAY);
-       __uint(max_entries, 6);
-       __type(key, int);
-       __type(value, int);
-} sock_bytes SEC(".maps");
-
-struct {
-       __uint(type, BPF_MAP_TYPE_ARRAY);
-       __uint(max_entries, 1);
-       __type(key, int);
-       __type(value, int);
-} sock_redir_flags SEC(".maps");
-
-struct {
-       __uint(type, BPF_MAP_TYPE_ARRAY);
-       __uint(max_entries, 1);
-       __type(key, int);
-       __type(value, int);
-} sock_skb_opts SEC(".maps");
-
-SEC("sk_skb1")
-int bpf_prog1(struct __sk_buff *skb)
-{
-       return skb->len;
-}
-
-SEC("sk_skb2")
-int bpf_prog2(struct __sk_buff *skb)
-{
-       __u32 lport = skb->local_port;
-       __u32 rport = skb->remote_port;
-       int len, *f, ret, zero = 0;
-       __u64 flags = 0;
-
-       if (lport == 10000)
-               ret = 10;
-       else
-               ret = 1;
-
-       len = (__u32)skb->data_end - (__u32)skb->data;
-       f = bpf_map_lookup_elem(&sock_skb_opts, &zero);
-       if (f && *f) {
-               ret = 3;
-               flags = *f;
-       }
-
-       bpf_printk("sk_skb2: redirect(%iB) flags=%i\n",
-                  len, flags);
-#ifdef SOCKMAP
-       return bpf_sk_redirect_map(skb, &sock_map, ret, flags);
-#else
-       return bpf_sk_redirect_hash(skb, &sock_map, &ret, flags);
-#endif
-
-}
-
-SEC("sockops")
-int bpf_sockmap(struct bpf_sock_ops *skops)
-{
-       __u32 lport, rport;
-       int op, err = 0, index, key, ret;
-
-
-       op = (int) skops->op;
-
-       switch (op) {
-       case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
-               lport = skops->local_port;
-               rport = skops->remote_port;
-
-               if (lport == 10000) {
-                       ret = 1;
-#ifdef SOCKMAP
-                       err = bpf_sock_map_update(skops, &sock_map, &ret,
-                                                 BPF_NOEXIST);
-#else
-                       err = bpf_sock_hash_update(skops, &sock_map, &ret,
-                                                  BPF_NOEXIST);
-#endif
-                       bpf_printk("passive(%i -> %i) map ctx update err: %d\n",
-                                  lport, bpf_ntohl(rport), err);
-               }
-               break;
-       case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
-               lport = skops->local_port;
-               rport = skops->remote_port;
-
-               if (bpf_ntohl(rport) == 10001) {
-                       ret = 10;
-#ifdef SOCKMAP
-                       err = bpf_sock_map_update(skops, &sock_map, &ret,
-                                                 BPF_NOEXIST);
-#else
-                       err = bpf_sock_hash_update(skops, &sock_map, &ret,
-                                                  BPF_NOEXIST);
-#endif
-                       bpf_printk("active(%i -> %i) map ctx update err: %d\n",
-                                  lport, bpf_ntohl(rport), err);
-               }
-               break;
-       default:
-               break;
-       }
-
-       return 0;
-}
-
-SEC("sk_msg1")
-int bpf_prog4(struct sk_msg_md *msg)
-{
-       int *bytes, zero = 0, one = 1, two = 2, three = 3, four = 4, five = 5;
-       int *start, *end, *start_push, *end_push, *start_pop, *pop;
-
-       bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero);
-       if (bytes)
-               bpf_msg_apply_bytes(msg, *bytes);
-       bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero);
-       if (bytes)
-               bpf_msg_cork_bytes(msg, *bytes);
-       start = bpf_map_lookup_elem(&sock_bytes, &zero);
-       end = bpf_map_lookup_elem(&sock_bytes, &one);
-       if (start && end)
-               bpf_msg_pull_data(msg, *start, *end, 0);
-       start_push = bpf_map_lookup_elem(&sock_bytes, &two);
-       end_push = bpf_map_lookup_elem(&sock_bytes, &three);
-       if (start_push && end_push)
-               bpf_msg_push_data(msg, *start_push, *end_push, 0);
-       start_pop = bpf_map_lookup_elem(&sock_bytes, &four);
-       pop = bpf_map_lookup_elem(&sock_bytes, &five);
-       if (start_pop && pop)
-               bpf_msg_pop_data(msg, *start_pop, *pop, 0);
-       return SK_PASS;
-}
-
-SEC("sk_msg2")
-int bpf_prog5(struct sk_msg_md *msg)
-{
-       int zero = 0, one = 1, two = 2, three = 3, four = 4, five = 5;
-       int *start, *end, *start_push, *end_push, *start_pop, *pop;
-       int *bytes, len1, len2 = 0, len3, len4;
-       int err1 = -1, err2 = -1;
-
-       bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero);
-       if (bytes)
-               err1 = bpf_msg_apply_bytes(msg, *bytes);
-       bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero);
-       if (bytes)
-               err2 = bpf_msg_cork_bytes(msg, *bytes);
-       len1 = (__u64)msg->data_end - (__u64)msg->data;
-       start = bpf_map_lookup_elem(&sock_bytes, &zero);
-       end = bpf_map_lookup_elem(&sock_bytes, &one);
-       if (start && end) {
-               int err;
-
-               bpf_printk("sk_msg2: pull(%i:%i)\n",
-                          start ? *start : 0, end ? *end : 0);
-               err = bpf_msg_pull_data(msg, *start, *end, 0);
-               if (err)
-                       bpf_printk("sk_msg2: pull_data err %i\n",
-                                  err);
-               len2 = (__u64)msg->data_end - (__u64)msg->data;
-               bpf_printk("sk_msg2: length update %i->%i\n",
-                          len1, len2);
-       }
-
-       start_push = bpf_map_lookup_elem(&sock_bytes, &two);
-       end_push = bpf_map_lookup_elem(&sock_bytes, &three);
-       if (start_push && end_push) {
-               int err;
-
-               bpf_printk("sk_msg2: push(%i:%i)\n",
-                          start_push ? *start_push : 0,
-                          end_push ? *end_push : 0);
-               err = bpf_msg_push_data(msg, *start_push, *end_push, 0);
-               if (err)
-                       bpf_printk("sk_msg2: push_data err %i\n", err);
-               len3 = (__u64)msg->data_end - (__u64)msg->data;
-               bpf_printk("sk_msg2: length push_update %i->%i\n",
-                          len2 ? len2 : len1, len3);
-       }
-       start_pop = bpf_map_lookup_elem(&sock_bytes, &four);
-       pop = bpf_map_lookup_elem(&sock_bytes, &five);
-       if (start_pop && pop) {
-               int err;
-
-               bpf_printk("sk_msg2: pop(%i@%i)\n",
-                          start_pop, pop);
-               err = bpf_msg_pop_data(msg, *start_pop, *pop, 0);
-               if (err)
-                       bpf_printk("sk_msg2: pop_data err %i\n", err);
-               len4 = (__u64)msg->data_end - (__u64)msg->data;
-               bpf_printk("sk_msg2: length pop_data %i->%i\n",
-                          len1 ? len1 : 0,  len4);
-       }
-
-       bpf_printk("sk_msg2: data length %i err1 %i err2 %i\n",
-                  len1, err1, err2);
-       return SK_PASS;
-}
-
-SEC("sk_msg3")
-int bpf_prog6(struct sk_msg_md *msg)
-{
-       int zero = 0, one = 1, two = 2, three = 3, four = 4, five = 5, key = 0;
-       int *bytes, *start, *end, *start_push, *end_push, *start_pop, *pop, *f;
-       __u64 flags = 0;
-
-       bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero);
-       if (bytes)
-               bpf_msg_apply_bytes(msg, *bytes);
-       bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero);
-       if (bytes)
-               bpf_msg_cork_bytes(msg, *bytes);
-
-       start = bpf_map_lookup_elem(&sock_bytes, &zero);
-       end = bpf_map_lookup_elem(&sock_bytes, &one);
-       if (start && end)
-               bpf_msg_pull_data(msg, *start, *end, 0);
-
-       start_push = bpf_map_lookup_elem(&sock_bytes, &two);
-       end_push = bpf_map_lookup_elem(&sock_bytes, &three);
-       if (start_push && end_push)
-               bpf_msg_push_data(msg, *start_push, *end_push, 0);
-
-       start_pop = bpf_map_lookup_elem(&sock_bytes, &four);
-       pop = bpf_map_lookup_elem(&sock_bytes, &five);
-       if (start_pop && pop)
-               bpf_msg_pop_data(msg, *start_pop, *pop, 0);
-
-       f = bpf_map_lookup_elem(&sock_redir_flags, &zero);
-       if (f && *f) {
-               key = 2;
-               flags = *f;
-       }
-#ifdef SOCKMAP
-       return bpf_msg_redirect_map(msg, &sock_map_redir, key, flags);
-#else
-       return bpf_msg_redirect_hash(msg, &sock_map_redir, &key, flags);
-#endif
-}
-
-SEC("sk_msg4")
-int bpf_prog7(struct sk_msg_md *msg)
-{
-       int *bytes, *start, *end, *start_push, *end_push, *start_pop, *pop, *f;
-       int zero = 0, one = 1, two = 2, three = 3, four = 4, five = 5;
-       int len1, len2 = 0, len3, len4;
-       int err1 = 0, err2 = 0, key = 0;
-       __u64 flags = 0;
-
-               int err;
-       bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero);
-       if (bytes)
-               err1 = bpf_msg_apply_bytes(msg, *bytes);
-       bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero);
-       if (bytes)
-               err2 = bpf_msg_cork_bytes(msg, *bytes);
-       len1 = (__u64)msg->data_end - (__u64)msg->data;
-
-       start = bpf_map_lookup_elem(&sock_bytes, &zero);
-       end = bpf_map_lookup_elem(&sock_bytes, &one);
-       if (start && end) {
-               bpf_printk("sk_msg2: pull(%i:%i)\n",
-                          start ? *start : 0, end ? *end : 0);
-               err = bpf_msg_pull_data(msg, *start, *end, 0);
-               if (err)
-                       bpf_printk("sk_msg2: pull_data err %i\n",
-                                  err);
-               len2 = (__u64)msg->data_end - (__u64)msg->data;
-               bpf_printk("sk_msg2: length update %i->%i\n",
-                          len1, len2);
-       }
-
-       start_push = bpf_map_lookup_elem(&sock_bytes, &two);
-       end_push = bpf_map_lookup_elem(&sock_bytes, &three);
-       if (start_push && end_push) {
-               bpf_printk("sk_msg4: push(%i:%i)\n",
-                          start_push ? *start_push : 0,
-                          end_push ? *end_push : 0);
-               err = bpf_msg_push_data(msg, *start_push, *end_push, 0);
-               if (err)
-                       bpf_printk("sk_msg4: push_data err %i\n",
-                                  err);
-               len3 = (__u64)msg->data_end - (__u64)msg->data;
-               bpf_printk("sk_msg4: length push_update %i->%i\n",
-                          len2 ? len2 : len1, len3);
-       }
-
-       start_pop = bpf_map_lookup_elem(&sock_bytes, &four);
-       pop = bpf_map_lookup_elem(&sock_bytes, &five);
-       if (start_pop && pop) {
-               int err;
-
-               bpf_printk("sk_msg4: pop(%i@%i)\n",
-                          start_pop, pop);
-               err = bpf_msg_pop_data(msg, *start_pop, *pop, 0);
-               if (err)
-                       bpf_printk("sk_msg4: pop_data err %i\n", err);
-               len4 = (__u64)msg->data_end - (__u64)msg->data;
-               bpf_printk("sk_msg4: length pop_data %i->%i\n",
-                          len1 ? len1 : 0,  len4);
-       }
-
-
-       f = bpf_map_lookup_elem(&sock_redir_flags, &zero);
-       if (f && *f) {
-               key = 2;
-               flags = *f;
-       }
-       bpf_printk("sk_msg3: redirect(%iB) flags=%i err=%i\n",
-                  len1, flags, err1 ? err1 : err2);
-#ifdef SOCKMAP
-       err = bpf_msg_redirect_map(msg, &sock_map_redir, key, flags);
-#else
-       err = bpf_msg_redirect_hash(msg, &sock_map_redir, &key, flags);
-#endif
-       bpf_printk("sk_msg3: err %i\n", err);
-       return err;
-}
-
-SEC("sk_msg5")
-int bpf_prog8(struct sk_msg_md *msg)
-{
-       void *data_end = (void *)(long) msg->data_end;
-       void *data = (void *)(long) msg->data;
-       int ret = 0, *bytes, zero = 0;
-
-       bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero);
-       if (bytes) {
-               ret = bpf_msg_apply_bytes(msg, *bytes);
-               if (ret)
-                       return SK_DROP;
-       } else {
-               return SK_DROP;
-       }
-       return SK_PASS;
-}
-SEC("sk_msg6")
-int bpf_prog9(struct sk_msg_md *msg)
-{
-       void *data_end = (void *)(long) msg->data_end;
-       void *data = (void *)(long) msg->data;
-       int ret = 0, *bytes, zero = 0;
-
-       bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero);
-       if (bytes) {
-               if (((__u64)data_end - (__u64)data) >= *bytes)
-                       return SK_PASS;
-               ret = bpf_msg_cork_bytes(msg, *bytes);
-               if (ret)
-                       return SK_DROP;
-       }
-       return SK_PASS;
-}
-
-SEC("sk_msg7")
-int bpf_prog10(struct sk_msg_md *msg)
-{
-       int *bytes, *start, *end, *start_push, *end_push, *start_pop, *pop;
-       int zero = 0, one = 1, two = 2, three = 3, four = 4, five = 5;
-
-       bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero);
-       if (bytes)
-               bpf_msg_apply_bytes(msg, *bytes);
-       bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero);
-       if (bytes)
-               bpf_msg_cork_bytes(msg, *bytes);
-       start = bpf_map_lookup_elem(&sock_bytes, &zero);
-       end = bpf_map_lookup_elem(&sock_bytes, &one);
-       if (start && end)
-               bpf_msg_pull_data(msg, *start, *end, 0);
-       start_push = bpf_map_lookup_elem(&sock_bytes, &two);
-       end_push = bpf_map_lookup_elem(&sock_bytes, &three);
-       if (start_push && end_push)
-               bpf_msg_push_data(msg, *start_push, *end_push, 0);
-       start_pop = bpf_map_lookup_elem(&sock_bytes, &four);
-       pop = bpf_map_lookup_elem(&sock_bytes, &five);
-       if (start_pop && pop)
-               bpf_msg_pop_data(msg, *start_pop, *pop, 0);
-       bpf_printk("return sk drop\n");
-       return SK_DROP;
-}
-
-int _version SEC("version") = 1;
-char _license[] SEC("license") = "GPL";
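The programs removed above all follow the same template: read per-test parameters out of BPF maps, exercise the sk_msg data helpers (apply/cork/pull/push/pop), and finally redirect to another socket, either by sockmap key or by sockhash key depending on whether SOCKMAP is defined. A minimal sketch of that pattern (the map name redir_map and the byte counts are illustrative only, not part of this commit):

/* Condensed sk_msg sketch; hypothetical map name and sizes. */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_SOCKMAP);   /* or BPF_MAP_TYPE_SOCKHASH */
	__uint(max_entries, 20);
	__type(key, int);
	__type(value, int);
} redir_map SEC(".maps");

SEC("sk_msg")
int msg_redir(struct sk_msg_md *msg)
{
	int key = 0;

	/* Limit how many bytes this verdict applies to, then buffer data
	 * until at least 1024 bytes are queued before a verdict is taken. */
	bpf_msg_apply_bytes(msg, 512);
	bpf_msg_cork_bytes(msg, 1024);

	/* Make the first 64 bytes directly accessible via msg->data. */
	bpf_msg_pull_data(msg, 0, 64, 0);

	/* Sockmap variant: the key is passed by value. */
	return bpf_msg_redirect_map(msg, &redir_map, key, BPF_F_INGRESS);
}

char _license[] SEC("license") = "GPL";

The only difference between the two build variants of the removed programs is the redirect helper: bpf_msg_redirect_map()/bpf_sk_redirect_map() take the key by value, while bpf_msg_redirect_hash()/bpf_sk_redirect_hash() take it by pointer, which is why the hash branches above pass &ret and &key.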
index 604b4615173637ce4476ebd0ef6098fce9a03930..056e0273bf1252414a1567cacb50131b1338b625 100644 (file)
        .result = REJECT,
        .errstr = "invalid mem access",
 },
+{
+       "reference tracking: branch tracking valid pointer null comparison",
+       .insns = {
+       BPF_SK_LOOKUP(sk_lookup_tcp),
+       BPF_MOV64_REG(BPF_REG_6, BPF_REG_0),
+       BPF_MOV64_IMM(BPF_REG_3, 1),
+       BPF_JMP_IMM(BPF_JNE, BPF_REG_6, 0, 1),
+       BPF_MOV64_IMM(BPF_REG_3, 0),
+       BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 2),
+       BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+       BPF_EMIT_CALL(BPF_FUNC_sk_release),
+       BPF_EXIT_INSN(),
+       },
+       .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+       .result = ACCEPT,
+},
+{
+       "reference tracking: branch tracking valid pointer value comparison",
+       .insns = {
+       BPF_SK_LOOKUP(sk_lookup_tcp),
+       BPF_MOV64_REG(BPF_REG_6, BPF_REG_0),
+       BPF_MOV64_IMM(BPF_REG_3, 1),
+       BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 4),
+       BPF_MOV64_IMM(BPF_REG_3, 0),
+       BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 1234, 2),
+       BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+       BPF_EMIT_CALL(BPF_FUNC_sk_release),
+       BPF_EXIT_INSN(),
+       },
+       .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+       .errstr = "Unreleased reference",
+       .result = REJECT,
+},
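The two verifier cases added above encode the following distinction at the instruction level: comparing a just-acquired socket pointer against NULL lets the verifier mark the pointer's NULL-ness in both branches, so releasing it only on the non-NULL path is accepted; comparing the same pointer against an arbitrary scalar (1234 in the second case) proves nothing about NULL-ness, so the reference can escape and the program is rejected with "Unreleased reference". A rough C analogue of the accepted case (a hypothetical program, not part of the selftests):

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("classifier")
int sk_branch_tracking(struct __sk_buff *skb)
{
	struct bpf_sock_tuple tuple = {};
	struct bpf_sock *sk;

	sk = bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple.ipv4),
			       BPF_F_CURRENT_NETNS, 0);
	if (sk)				/* NULL comparison: both branches get marked */
		bpf_sk_release(sk);	/* releasing only here is accepted */
	/* Comparing sk against an arbitrary value such as 1234 instead would
	 * prove nothing about NULL-ness on either branch, so a reference
	 * released only inside that branch may leak and the load fails. */
	return 0;
}

char _license[] SEC("license") = "GPL";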
index 860d4a71cd83a10f7116ada1961da382970de9cd..3ecb70a3d93970425413dee6cd402bdcdf6c4ed9 100644 (file)
        .result_unpriv = REJECT,
        .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
 },
+{
+       "map lookup and null branch prediction",
+       .insns = {
+       BPF_MOV64_IMM(BPF_REG_1, 10),
+       BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1, -8),
+       BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+       BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+       BPF_LD_MAP_FD(BPF_REG_1, 0),
+       BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+       BPF_MOV64_REG(BPF_REG_6, BPF_REG_0),
+       BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 2),
+       BPF_JMP_IMM(BPF_JNE, BPF_REG_6, 0, 1),
+       BPF_ALU64_IMM(BPF_ADD, BPF_REG_10, 10),
+       BPF_EXIT_INSN(),
+       },
+       .fixup_map_hash_8b = { 4 },
+       .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+       .result = ACCEPT,
+},
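The new "map lookup and null branch prediction" case checks that once one NULL test on a map-lookup result has been evaluated, the verifier carries that knowledge into the fall-through path and never explores the contradictory branch, so the otherwise-invalid write to the frame pointer (the BPF_ALU64_IMM on R10) stays dead. In C terms it corresponds roughly to the following, assuming a hypothetical hash map hash8 with 8-byte keys and values:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(max_entries, 1);
	__type(key, __u64);
	__type(value, __u64);
} hash8 SEC(".maps");

SEC("classifier")
int lookup_branch_prediction(struct __sk_buff *skb)
{
	__u64 key = 10;
	__u64 *val;

	val = bpf_map_lookup_elem(&hash8, &key);
	if (!val)
		return 0;	/* lookup failed */
	if (val)		/* always true here; the verifier knows it */
		return 0;
	/* Dead code: never explored, so even an otherwise-invalid
	 * instruction (the raw test writes to R10 here) is tolerated. */
	return 0;
}

char _license[] SEC("license") = "GPL";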
index 50d822face3623d1b85c809179b12cd06a960246..51f8e9afe6aefb9c4337ff9c8e81cab69c19acac 100755 (executable)
@@ -19,8 +19,8 @@ ret=0
 ksft_skip=4
 
 # all tests in this script. Can be overridden with -t option
-IPV4_TESTS="ipv4_fcnal ipv4_grp_fcnal ipv4_withv6_fcnal ipv4_fcnal_runtime ipv4_compat_mode"
-IPV6_TESTS="ipv6_fcnal ipv6_grp_fcnal ipv6_fcnal_runtime ipv6_compat_mode"
+IPV4_TESTS="ipv4_fcnal ipv4_grp_fcnal ipv4_withv6_fcnal ipv4_fcnal_runtime ipv4_compat_mode ipv4_fdb_grp_fcnal"
+IPV6_TESTS="ipv6_fcnal ipv6_grp_fcnal ipv6_fcnal_runtime ipv6_compat_mode ipv6_fdb_grp_fcnal"
 
 ALL_TESTS="basic ${IPV4_TESTS} ${IPV6_TESTS}"
 TESTS="${ALL_TESTS}"
@@ -146,6 +146,7 @@ setup()
        create_ns remote
 
        IP="ip -netns me"
+       BRIDGE="bridge -netns me"
        set -e
        $IP li add veth1 type veth peer name veth2
        $IP li set veth1 up
@@ -280,6 +281,161 @@ stop_ip_monitor()
        return $rc
 }
 
+check_nexthop_fdb_support()
+{
+       $IP nexthop help 2>&1 | grep -q fdb
+       if [ $? -ne 0 ]; then
+               echo "SKIP: iproute2 too old, missing fdb nexthop support"
+               return $ksft_skip
+       fi
+}
+
+ipv6_fdb_grp_fcnal()
+{
+       local rc
+
+       echo
+       echo "IPv6 fdb groups functional"
+       echo "--------------------------"
+
+       check_nexthop_fdb_support
+       if [ $? -eq $ksft_skip ]; then
+               return $ksft_skip
+       fi
+
+       # create group with multiple nexthops
+       run_cmd "$IP nexthop add id 61 via 2001:db8:91::2 fdb"
+       run_cmd "$IP nexthop add id 62 via 2001:db8:91::3 fdb"
+       run_cmd "$IP nexthop add id 102 group 61/62 fdb"
+       check_nexthop "id 102" "id 102 group 61/62 fdb"
+       log_test $? 0 "Fdb Nexthop group with multiple nexthops"
+
+       # get nexthop group
+       run_cmd "$IP nexthop get id 102"
+       check_nexthop "id 102" "id 102 group 61/62 fdb"
+       log_test $? 0 "Get Fdb nexthop group by id"
+
+       # fdb nexthop group can only contain fdb nexthops
+       run_cmd "$IP nexthop add id 63 via 2001:db8:91::4"
+       run_cmd "$IP nexthop add id 64 via 2001:db8:91::5"
+       run_cmd "$IP nexthop add id 103 group 63/64 fdb"
+       log_test $? 2 "Fdb Nexthop group with non-fdb nexthops"
+
+       # Non-fdb nexthop group cannot contain fdb nexthops
+       run_cmd "$IP nexthop add id 65 via 2001:db8:91::5 fdb"
+       run_cmd "$IP nexthop add id 66 via 2001:db8:91::6 fdb"
+       run_cmd "$IP nexthop add id 104 group 65/66"
+       log_test $? 2 "Non-Fdb Nexthop group with fdb nexthops"
+
+       # fdb nexthop cannot have blackhole
+       run_cmd "$IP nexthop add id 67 blackhole fdb"
+       log_test $? 2 "Fdb Nexthop with blackhole"
+
+       # fdb nexthop with oif
+       run_cmd "$IP nexthop add id 68 via 2001:db8:91::7 dev veth1 fdb"
+       log_test $? 2 "Fdb Nexthop with oif"
+
+       # fdb nexthop with onlink
+       run_cmd "$IP nexthop add id 68 via 2001:db8:91::7 onlink fdb"
+       log_test $? 2 "Fdb Nexthop with onlink"
+
+       # fdb nexthop with encap
+       run_cmd "$IP nexthop add id 69 encap mpls 101 via 2001:db8:91::8 dev veth1 fdb"
+       log_test $? 2 "Fdb Nexthop with encap"
+
+       run_cmd "$IP link add name vx10 type vxlan id 1010 local 2001:db8:91::9 remote 2001:db8:91::10 dstport 4789 nolearning noudpcsum tos inherit ttl 100"
+       run_cmd "$BRIDGE fdb add 02:02:00:00:00:13 dev vx10 nhid 102 self"
+       log_test $? 0 "Fdb mac add with nexthop group"
+
+       # fdb entries may only reference fdb nexthop groups, not individual nexthops
+       run_cmd "$BRIDGE fdb add 02:02:00:00:00:14 dev vx10 nhid 61 self"
+       log_test $? 255 "Fdb mac add with nexthop"
+
+       run_cmd "$IP -6 ro add 2001:db8:101::1/128 nhid 66"
+       log_test $? 2 "Route add with fdb nexthop"
+
+       run_cmd "$IP -6 ro add 2001:db8:101::1/128 nhid 103"
+       log_test $? 2 "Route add with fdb nexthop group"
+
+       run_cmd "$IP nexthop del id 102"
+       log_test $? 0 "Fdb nexthop delete"
+
+       $IP link del dev vx10
+}
+
+ipv4_fdb_grp_fcnal()
+{
+       local rc
+
+       echo
+       echo "IPv4 fdb groups functional"
+       echo "--------------------------"
+
+       check_nexthop_fdb_support
+       if [ $? -eq $ksft_skip ]; then
+               return $ksft_skip
+       fi
+
+       # create group with multiple nexthops
+       run_cmd "$IP nexthop add id 12 via 172.16.1.2 fdb"
+       run_cmd "$IP nexthop add id 13 via 172.16.1.3 fdb"
+       run_cmd "$IP nexthop add id 102 group 12/13 fdb"
+       check_nexthop "id 102" "id 102 group 12/13 fdb"
+       log_test $? 0 "Fdb Nexthop group with multiple nexthops"
+
+       # get nexthop group
+       run_cmd "$IP nexthop get id 102"
+       check_nexthop "id 102" "id 102 group 12/13 fdb"
+       log_test $? 0 "Get Fdb nexthop group by id"
+
+       # fdb nexthop group can only contain fdb nexthops
+       run_cmd "$IP nexthop add id 14 via 172.16.1.2"
+       run_cmd "$IP nexthop add id 15 via 172.16.1.3"
+       run_cmd "$IP nexthop add id 103 group 14/15 fdb"
+       log_test $? 2 "Fdb Nexthop group with non-fdb nexthops"
+
+       # Non-fdb nexthop group cannot contain fdb nexthops
+       run_cmd "$IP nexthop add id 16 via 172.16.1.2 fdb"
+       run_cmd "$IP nexthop add id 17 via 172.16.1.3 fdb"
+       run_cmd "$IP nexthop add id 104 group 16/17"
+       log_test $? 2 "Non-Fdb Nexthop group with fdb nexthops"
+
+       # fdb nexthop cannot have blackhole
+       run_cmd "$IP nexthop add id 18 blackhole fdb"
+       log_test $? 2 "Fdb Nexthop with blackhole"
+
+       # fdb nexthop with oif
+       run_cmd "$IP nexthop add id 16 via 172.16.1.2 dev veth1 fdb"
+       log_test $? 2 "Fdb Nexthop with oif"
+
+       # fdb nexthop with onlink
+       run_cmd "$IP nexthop add id 16 via 172.16.1.2 onlink fdb"
+       log_test $? 2 "Fdb Nexthop with onlink"
+
+       # fdb nexthop with encap
+       run_cmd "$IP nexthop add id 17 encap mpls 101 via 172.16.1.2 dev veth1 fdb"
+       log_test $? 2 "Fdb Nexthop with encap"
+
+       run_cmd "$IP link add name vx10 type vxlan id 1010 local 10.0.0.1 remote 10.0.0.2 dstport 4789 nolearning noudpcsum tos inherit ttl 100"
+       run_cmd "$BRIDGE fdb add 02:02:00:00:00:13 dev vx10 nhid 102 self"
+       log_test $? 0 "Fdb mac add with nexthop group"
+
+       # fdb entries may only reference fdb nexthop groups, not individual nexthops
+       run_cmd "$BRIDGE fdb add 02:02:00:00:00:14 dev vx10 nhid 12 self"
+       log_test $? 255 "Fdb mac add with nexthop"
+
+       run_cmd "$IP ro add 172.16.0.0/22 nhid 15"
+       log_test $? 2 "Route add with fdb nexthop"
+
+       run_cmd "$IP ro add 172.16.0.0/22 nhid 103"
+       log_test $? 2 "Route add with fdb nexthop group"
+
+       run_cmd "$IP nexthop del id 102"
+       log_test $? 0 "Fdb nexthop delete"
+
+       $IP link del dev vx10
+}
+
 ################################################################################
 # basic operations (add, delete, replace) on nexthops and nexthop groups
 #
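The ipv4/ipv6 fdb cases added above all exercise one usage model: fdb nexthops carry no device, encap or blackhole attribute, may only be grouped with other fdb nexthops, and the resulting group is consumed by a vxlan fdb entry rather than by a route. Condensed to the successful path tested above (ids and addresses are taken from the IPv4 test and purely illustrative):

# create two fdb nexthops and group them
ip nexthop add id 12 via 172.16.1.2 fdb
ip nexthop add id 13 via 172.16.1.3 fdb
ip nexthop add id 102 group 12/13 fdb

# point a vxlan fdb entry at the group; a route may not use it
ip link add name vx10 type vxlan id 1010 local 10.0.0.1 dstport 4789
bridge fdb add 02:02:00:00:00:13 dev vx10 nhid 102 self

Routes referencing an fdb nexthop or fdb nexthop group are expected to fail, which is what the exit-status-2 checks in the functions above assert.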