From c3e85147d9067c9c19451ad36505bcf70eb470b9 Mon Sep 17 00:00:00 2001 From: Paul Boca Date: Mon, 6 Jun 2016 16:45:00 +0000 Subject: [PATCH] datapath-windows: Improved offloading on STT tunnel * Added OvsExtractLayers, which populates only the layers field, without the unnecessary memory operations for the flow part. * If the flags in the STT header are 0, force packet checksum calculation on receive. * Ensure the correct pseudo checksum is set for LSO on both send and receive. Linux includes the segment length in the TCP pseudo-checksum, conforming to RFC 793, but in the case of LSO Windows expects the pseudo-checksum to cover only the Source IP Address, Destination IP Address, and Protocol. * Fragment expiration on the rx side of STT was set to 30 seconds, but the correct timeout is the TTL of the packet. Signed-off-by: Paul-Daniel Boca Acked-by: Sairam Venugopal Signed-off-by: Ben Pfaff --- datapath-windows/ovsext/Flow.c | 243 +++++++++++++++++++++---- datapath-windows/ovsext/Flow.h | 2 + datapath-windows/ovsext/PacketParser.c | 97 +++++----- datapath-windows/ovsext/PacketParser.h | 8 +- datapath-windows/ovsext/Stt.c | 124 ++++++++++--- datapath-windows/ovsext/User.c | 17 +- 6 files changed, 377 insertions(+), 114 deletions(-) diff --git a/datapath-windows/ovsext/Flow.c b/datapath-windows/ovsext/Flow.c index c2e02277a..2a918551a 100644 --- a/datapath-windows/ovsext/Flow.c +++ b/datapath-windows/ovsext/Flow.c @@ -1570,7 +1570,8 @@ _MapKeyAttrToFlowPut(PNL_ATTR *keyAttrs, ndKey = NlAttrGet(keyAttrs[OVS_KEY_ATTR_ND]); RtlCopyMemory(&icmp6FlowPutKey->ndTarget, - ndKey->nd_target, sizeof (icmp6FlowPutKey->ndTarget)); + ndKey->nd_target, + sizeof (icmp6FlowPutKey->ndTarget)); RtlCopyMemory(icmp6FlowPutKey->arpSha, ndKey->nd_sll, ETH_ADDR_LEN); RtlCopyMemory(icmp6FlowPutKey->arpTha, @@ -1600,8 +1601,10 @@ _MapKeyAttrToFlowPut(PNL_ATTR *keyAttrs, arpFlowPutKey->nwSrc = arpKey->arp_sip; arpFlowPutKey->nwDst = arpKey->arp_tip; - RtlCopyMemory(arpFlowPutKey->arpSha, arpKey->arp_sha, ETH_ADDR_LEN); - 
RtlCopyMemory(arpFlowPutKey->arpTha, arpKey->arp_tha, ETH_ADDR_LEN); + RtlCopyMemory(arpFlowPutKey->arpSha, arpKey->arp_sha, + ETH_ADDR_LEN); + RtlCopyMemory(arpFlowPutKey->arpTha, arpKey->arp_tha, + ETH_ADDR_LEN); /* Kernel datapath assumes 'arpFlowPutKey->nwProto' to be in host * order. */ arpFlowPutKey->nwProto = (UINT8)ntohs((arpKey->arp_op)); @@ -1850,29 +1853,195 @@ OvsGetFlowMetadata(OvsFlowKey *key, return status; } + /* - *---------------------------------------------------------------------------- - * Initializes 'flow' members from 'packet', 'skb_priority', 'tun_id', and - * 'ofp_in_port'. - * - * Initializes 'packet' header pointers as follows: - * - * - packet->l2 to the start of the Ethernet header. - * - * - packet->l3 to just past the Ethernet header, or just past the - * vlan_header if one is present, to the first byte of the payload of the - * Ethernet frame. - * - * - packet->l4 to just past the IPv4 header, if one is present and has a - * correct length, and otherwise NULL. - * - * - packet->l7 to just past the TCP, UDP, SCTP or ICMP header, if one is - * present and has a correct length, and otherwise NULL. - * - * Returns NDIS_STATUS_SUCCESS normally. Fails only if packet data cannot be accessed - * (e.g. if Pkt_CopyBytesOut() returns an error). - *---------------------------------------------------------------------------- - */ +*---------------------------------------------------------------------------- +* Initializes 'layers' members from 'packet' +* +* Initializes 'layers' header pointers as follows: +* +* - layers->l2 to the start of the Ethernet header. +* +* - layers->l3 to just past the Ethernet header, or just past the +* vlan_header if one is present, to the first byte of the payload of the +* Ethernet frame. +* +* - layers->l4 to just past the IPv4 header, if one is present and has a +* correct length, and otherwise NULL. 
+* +* - layers->l7 to just past the TCP, UDP, SCTP or ICMP header, if one is +* present and has a correct length, and otherwise NULL. +* +* - layers->isIPv4/isIPv6/isTcp/isUdp/isSctp based on the packet type +* +* Returns NDIS_STATUS_SUCCESS normally. +* Fails only if packet data cannot be accessed. +* (e.g. if OvsParseIPv6() returns an error). +*---------------------------------------------------------------------------- +*/ +NDIS_STATUS +OvsExtractLayers(const NET_BUFFER_LIST *packet, + POVS_PACKET_HDR_INFO layers) +{ + struct Eth_Header *eth; + UINT8 offset = 0; + PVOID vlanTagValue; + ovs_be16 dlType; + + layers->value = 0; + + /* Link layer. */ + eth = (Eth_Header *)GetStartAddrNBL((NET_BUFFER_LIST *)packet); + + /* + * vlan_tci. + */ + vlanTagValue = NET_BUFFER_LIST_INFO(packet, Ieee8021QNetBufferListInfo); + if (!vlanTagValue) { + if (eth->dix.typeNBO == ETH_TYPE_802_1PQ_NBO) { + offset = sizeof(Eth_802_1pq_Tag); + } + + /* + * XXX Please note after this point, src mac and dst mac should + * not be accessed through eth + */ + eth = (Eth_Header *)((UINT8 *)eth + offset); + } + + /* + * dl_type. + * + * XXX assume that at least the first + * 12 bytes of received packets are mapped. This code has the stronger + * assumption that at least the first 22 bytes of 'packet' is mapped (if my + * arithmetic is right). + */ + if (ETH_TYPENOT8023(eth->dix.typeNBO)) { + dlType = eth->dix.typeNBO; + layers->l3Offset = ETH_HEADER_LEN_DIX + offset; + } else if (OvsPacketLenNBL(packet) >= ETH_HEADER_LEN_802_3 && + eth->e802_3.llc.dsap == 0xaa && + eth->e802_3.llc.ssap == 0xaa && + eth->e802_3.llc.control == ETH_LLC_CONTROL_UFRAME && + eth->e802_3.snap.snapOrg[0] == 0x00 && + eth->e802_3.snap.snapOrg[1] == 0x00 && + eth->e802_3.snap.snapOrg[2] == 0x00) { + dlType = eth->e802_3.snap.snapType.typeNBO; + layers->l3Offset = ETH_HEADER_LEN_802_3 + offset; + } else { + dlType = htons(OVSWIN_DL_TYPE_NONE); + layers->l3Offset = ETH_HEADER_LEN_DIX + offset; + } + + /* Network layer. 
*/ + if (dlType == htons(ETH_TYPE_IPV4)) { + struct IPHdr ip_storage; + const struct IPHdr *nh; + + layers->isIPv4 = 1; + nh = OvsGetIp(packet, layers->l3Offset, &ip_storage); + if (nh) { + layers->l4Offset = layers->l3Offset + nh->ihl * 4; + + if (!(nh->frag_off & htons(IP_OFFSET))) { + if (nh->protocol == SOCKET_IPPROTO_TCP) { + OvsParseTcp(packet, NULL, layers); + } else if (nh->protocol == SOCKET_IPPROTO_UDP) { + OvsParseUdp(packet, NULL, layers); + } else if (nh->protocol == SOCKET_IPPROTO_SCTP) { + OvsParseSctp(packet, NULL, layers); + } else if (nh->protocol == SOCKET_IPPROTO_ICMP) { + ICMPHdr icmpStorage; + const ICMPHdr *icmp; + + icmp = OvsGetIcmp(packet, layers->l4Offset, &icmpStorage); + if (icmp) { + layers->l7Offset = layers->l4Offset + sizeof *icmp; + } + } + } + } + } else if (dlType == htons(ETH_TYPE_IPV6)) { + NDIS_STATUS status; + Ipv6Key ipv6Key; + + status = OvsParseIPv6(packet, &ipv6Key, layers); + if (status != NDIS_STATUS_SUCCESS) { + return status; + } + layers->isIPv6 = 1; + + if (ipv6Key.nwProto == SOCKET_IPPROTO_TCP) { + OvsParseTcp(packet, &(ipv6Key.l4), layers); + } else if (ipv6Key.nwProto == SOCKET_IPPROTO_UDP) { + OvsParseUdp(packet, &(ipv6Key.l4), layers); + } else if (ipv6Key.nwProto == SOCKET_IPPROTO_SCTP) { + OvsParseSctp(packet, &ipv6Key.l4, layers); + } else if (ipv6Key.nwProto == SOCKET_IPPROTO_ICMPV6) { + Icmp6Key icmp6Key; + OvsParseIcmpV6(packet, NULL, &icmp6Key, layers); + } + } else if (OvsEthertypeIsMpls(dlType)) { + MPLSHdr mplsStorage; + const MPLSHdr *mpls; + + /* + * In the presence of an MPLS label stack the end of the L2 + * header and the beginning of the L3 header differ. + * + * A network packet may contain multiple MPLS labels, but we + * are only interested in the topmost label stack entry. + * + * Advance network header to the beginning of the L3 header. + * layers->l3Offset corresponds to the end of the L2 header. 
+ */ + for (UINT32 i = 0; i < FLOW_MAX_MPLS_LABELS; i++) { + mpls = OvsGetMpls(packet, layers->l3Offset, &mplsStorage); + if (!mpls) { + break; + } + + layers->l3Offset += MPLS_HLEN; + layers->l4Offset += MPLS_HLEN; + + if (mpls->lse & htonl(MPLS_BOS_MASK)) { + /* + * Bottom of Stack bit is set, which means there are no + * remaining MPLS labels in the packet. + */ + break; + } + } + } + + return NDIS_STATUS_SUCCESS; +} + +/* +*---------------------------------------------------------------------------- +* Initializes 'flow' members from 'packet', 'skb_priority', 'tun_id', and +* 'ofp_in_port'. +* +* Initializes 'packet' header pointers as follows: +* +* - packet->l2 to the start of the Ethernet header. +* +* - packet->l3 to just past the Ethernet header, or just past the +* vlan_header if one is present, to the first byte of the payload of the +* Ethernet frame. +* +* - packet->l4 to just past the IPv4 header, if one is present and has a +* correct length, and otherwise NULL. +* +* - packet->l7 to just past the TCP, UDP, SCTP or ICMP header, if one is +* present and has a correct length, and otherwise NULL. +* +* Returns NDIS_STATUS_SUCCESS normally. +* Fails only if packet data cannot be accessed. +* (e.g. if Pkt_CopyBytesOut() returns an error). +*---------------------------------------------------------------------------- +*/ NDIS_STATUS OvsExtractFlow(const NET_BUFFER_LIST *packet, UINT32 inPort, @@ -1904,8 +2073,8 @@ OvsExtractFlow(const NET_BUFFER_LIST *packet, /* Link layer. */ eth = (Eth_Header *)GetStartAddrNBL((NET_BUFFER_LIST *)packet); - memcpy(flow->l2.dlSrc, eth->src, ETH_ADDR_LENGTH); - memcpy(flow->l2.dlDst, eth->dst, ETH_ADDR_LENGTH); + RtlCopyMemory(flow->l2.dlSrc, eth->src, ETH_ADDR_LENGTH); + RtlCopyMemory(flow->l2.dlDst, eth->dst, ETH_ADDR_LENGTH); /* * vlan_tci. 
@@ -1927,8 +2096,7 @@ OvsExtractFlow(const NET_BUFFER_LIST *packet, flow->l2.vlanTci = 0; } /* - * XXX - * Please note after this point, src mac and dst mac should + * XXX Please note after this point, src mac and dst mac should * not be accessed through eth */ eth = (Eth_Header *)((UINT8 *)eth + offset); @@ -1959,7 +2127,8 @@ OvsExtractFlow(const NET_BUFFER_LIST *packet, layers->l3Offset = ETH_HEADER_LEN_DIX + offset; } - flow->l2.keyLen = OVS_WIN_TUNNEL_KEY_SIZE + OVS_L2_KEY_SIZE - flow->l2.offset; + flow->l2.keyLen = OVS_WIN_TUNNEL_KEY_SIZE + OVS_L2_KEY_SIZE + - flow->l2.offset; /* Network layer. */ if (flow->l2.dlType == htons(ETH_TYPE_IPV4)) { struct IPHdr ip_storage; @@ -2016,9 +2185,9 @@ OvsExtractFlow(const NET_BUFFER_LIST *packet, } else if (flow->l2.dlType == htons(ETH_TYPE_IPV6)) { NDIS_STATUS status; flow->l2.keyLen += OVS_IPV6_KEY_SIZE; - status = OvsParseIPv6(packet, flow, layers); + status = OvsParseIPv6(packet, &flow->ipv6Key, layers); if (status != NDIS_STATUS_SUCCESS) { - memset(&flow->ipv6Key, 0, sizeof (Ipv6Key)); + RtlZeroMemory(&flow->ipv6Key, sizeof (Ipv6Key)); return status; } layers->isIPv6 = 1; @@ -2033,7 +2202,7 @@ OvsExtractFlow(const NET_BUFFER_LIST *packet, } else if (flow->ipv6Key.nwProto == SOCKET_IPPROTO_SCTP) { OvsParseSctp(packet, &flow->ipv6Key.l4, layers); } else if (flow->ipv6Key.nwProto == SOCKET_IPPROTO_ICMPV6) { - OvsParseIcmpV6(packet, flow, layers); + OvsParseIcmpV6(packet, &flow->ipv6Key, &flow->icmp6Key, layers); flow->l2.keyLen += (OVS_ICMPV6_KEY_SIZE - OVS_IPV6_KEY_SIZE); } } else if (flow->l2.dlType == htons(ETH_TYPE_ARP)) { @@ -2055,10 +2224,10 @@ OvsExtractFlow(const NET_BUFFER_LIST *packet, } if (arpKey->nwProto == ARPOP_REQUEST || arpKey->nwProto == ARPOP_REPLY) { - memcpy(&arpKey->nwSrc, arp->arp_spa, 4); - memcpy(&arpKey->nwDst, arp->arp_tpa, 4); - memcpy(arpKey->arpSha, arp->arp_sha, ETH_ADDR_LENGTH); - memcpy(arpKey->arpTha, arp->arp_tha, ETH_ADDR_LENGTH); + RtlCopyMemory(&arpKey->nwSrc, arp->arp_spa, 4); + 
RtlCopyMemory(&arpKey->nwDst, arp->arp_tpa, 4); + RtlCopyMemory(arpKey->arpSha, arp->arp_sha, ETH_ADDR_LENGTH); + RtlCopyMemory(arpKey->arpTha, arp->arp_tha, ETH_ADDR_LENGTH); } } } else if (OvsEthertypeIsMpls(flow->l2.dlType)) { diff --git a/datapath-windows/ovsext/Flow.h b/datapath-windows/ovsext/Flow.h index fb3fb5984..d39db453c 100644 --- a/datapath-windows/ovsext/Flow.h +++ b/datapath-windows/ovsext/Flow.h @@ -53,6 +53,8 @@ NDIS_STATUS OvsAllocateFlowTable(OVS_DATAPATH *datapath, NDIS_STATUS OvsGetFlowMetadata(OvsFlowKey *key, PNL_ATTR *keyAttrs); +NDIS_STATUS OvsExtractLayers(const NET_BUFFER_LIST *packet, + POVS_PACKET_HDR_INFO layers); NDIS_STATUS OvsExtractFlow(const NET_BUFFER_LIST *pkt, UINT32 inPort, OvsFlowKey *flow, POVS_PACKET_HDR_INFO layers, OvsIPv4TunnelKey *tunKey); diff --git a/datapath-windows/ovsext/PacketParser.c b/datapath-windows/ovsext/PacketParser.c index 93df3424d..c4a04d072 100644 --- a/datapath-windows/ovsext/PacketParser.c +++ b/datapath-windows/ovsext/PacketParser.c @@ -84,14 +84,13 @@ OvsGetPacketBytes(const NET_BUFFER_LIST *nbl, NDIS_STATUS OvsParseIPv6(const NET_BUFFER_LIST *packet, - OvsFlowKey *key, + Ipv6Key *ipv6Key, POVS_PACKET_HDR_INFO layers) { UINT16 ofs = layers->l3Offset; IPv6Hdr ipv6HdrStorage; const IPv6Hdr *nh; UINT32 nextHdr; - Ipv6Key *flow= &key->ipv6Key; nh = OvsGetPacketBytes(packet, sizeof *nh, ofs, &ipv6HdrStorage); if (!nh) { @@ -99,15 +98,15 @@ OvsParseIPv6(const NET_BUFFER_LIST *packet, } nextHdr = nh->nexthdr; - memcpy(&flow->ipv6Src, nh->saddr.s6_addr, 16); - memcpy(&flow->ipv6Dst, nh->daddr.s6_addr, 16); + RtlCopyMemory(&ipv6Key->ipv6Src, nh->saddr.s6_addr, 16); + RtlCopyMemory(&ipv6Key->ipv6Dst, nh->daddr.s6_addr, 16); - flow->nwTos = ((nh->flow_lbl[0] & 0xF0) >> 4) | (nh->priority << 4); - flow->ipv6Label = + ipv6Key->nwTos = ((nh->flow_lbl[0] & 0xF0) >> 4) | (nh->priority << 4); + ipv6Key->ipv6Label = ((nh->flow_lbl[0] & 0x0F) << 16) | (nh->flow_lbl[1] << 8) | nh->flow_lbl[2]; - flow->nwTtl = 
nh->hop_limit; - flow->nwProto = SOCKET_IPPROTO_NONE; - flow->nwFrag = OVS_FRAG_TYPE_NONE; + ipv6Key->nwTtl = nh->hop_limit; + ipv6Key->nwProto = SOCKET_IPPROTO_NONE; + ipv6Key->nwFrag = OVS_FRAG_TYPE_NONE; // Parse extended headers and compute L4 offset ofs += sizeof(IPv6Hdr); @@ -160,9 +159,9 @@ OvsParseIPv6(const NET_BUFFER_LIST *packet, /* We only process the first fragment. */ if (fragHdr->offlg != htons(0)) { if ((fragHdr->offlg & IP6F_OFF_HOST_ORDER_MASK) == htons(0)) { - flow->nwFrag = OVS_FRAG_TYPE_FIRST; + ipv6Key->nwFrag = OVS_FRAG_TYPE_FIRST; } else { - flow->nwFrag = OVS_FRAG_TYPE_LATER; + ipv6Key->nwFrag = OVS_FRAG_TYPE_LATER; nextHdr = SOCKET_IPPROTO_FRAGMENT; break; } @@ -170,7 +169,7 @@ OvsParseIPv6(const NET_BUFFER_LIST *packet, } } - flow->nwProto = (UINT8)nextHdr; + ipv6Key->nwProto = (UINT8)nextHdr; layers->l4Offset = ofs; return NDIS_STATUS_SUCCESS; } @@ -183,10 +182,14 @@ OvsParseTcp(const NET_BUFFER_LIST *packet, TCPHdr tcpStorage; const TCPHdr *tcp = OvsGetTcp(packet, layers->l4Offset, &tcpStorage); if (tcp) { - flow->tpSrc = tcp->source; - flow->tpDst = tcp->dest; - layers->isTcp = 1; - layers->l7Offset = layers->l4Offset + 4 * tcp->doff; + if (flow) { + flow->tpSrc = tcp->source; + flow->tpDst = tcp->dest; + } + if (layers) { + layers->isTcp = 1; + layers->l7Offset = layers->l4Offset + 4 * tcp->doff; + } } } @@ -198,10 +201,14 @@ OvsParseSctp(const NET_BUFFER_LIST *packet, SCTPHdr sctpStorage; const SCTPHdr *sctp = OvsGetSctp(packet, layers->l4Offset, &sctpStorage); if (sctp) { - flow->tpSrc = sctp->source; - flow->tpDst = sctp->dest; - layers->isSctp = 1; - layers->l7Offset = layers->l4Offset + sizeof *sctp; + if (flow) { + flow->tpSrc = sctp->source; + flow->tpDst = sctp->dest; + } + if (layers) { + layers->isSctp = 1; + layers->l7Offset = layers->l4Offset + sizeof *sctp; + } } } @@ -213,29 +220,33 @@ OvsParseUdp(const NET_BUFFER_LIST *packet, UDPHdr udpStorage; const UDPHdr *udp = OvsGetUdp(packet, layers->l4Offset, &udpStorage); if 
(udp) { - flow->tpSrc = udp->source; - flow->tpDst = udp->dest; - layers->isUdp = 1; - if (udp->check == 0) { - layers->udpCsumZero = 1; + if (flow) { + flow->tpSrc = udp->source; + flow->tpDst = udp->dest; + } + if (layers) { + layers->isUdp = 1; + if (udp->check == 0) { + layers->udpCsumZero = 1; + } + layers->l7Offset = layers->l4Offset + sizeof *udp; } - layers->l7Offset = layers->l4Offset + sizeof *udp; } } NDIS_STATUS OvsParseIcmpV6(const NET_BUFFER_LIST *packet, - OvsFlowKey *key, - POVS_PACKET_HDR_INFO layers) + Ipv6Key *ipv6Key, + Icmp6Key *icmp6Key, + POVS_PACKET_HDR_INFO layers) { UINT16 ofs = layers->l4Offset; ICMPHdr icmpStorage; const ICMPHdr *icmp; - Icmp6Key *flow = &key->icmp6Key; - memset(&flow->ndTarget, 0, sizeof(flow->ndTarget)); - memset(flow->arpSha, 0, sizeof(flow->arpSha)); - memset(flow->arpTha, 0, sizeof(flow->arpTha)); + memset(&icmp6Key->ndTarget, 0, sizeof(icmp6Key->ndTarget)); + memset(icmp6Key->arpSha, 0, sizeof(icmp6Key->arpSha)); + memset(icmp6Key->arpTha, 0, sizeof(icmp6Key->arpTha)); icmp = OvsGetIcmp(packet, ofs, &icmpStorage); if (!icmp) { @@ -247,8 +258,10 @@ OvsParseIcmpV6(const NET_BUFFER_LIST *packet, * The ICMPv6 type and code fields use the 16-bit transport port * fields, so we need to store them in 16-bit network byte order. */ - key->ipv6Key.l4.tpSrc = htons(icmp->type); - key->ipv6Key.l4.tpDst = htons(icmp->code); + if (ipv6Key) { + ipv6Key->l4.tpSrc = htons(icmp->type); + ipv6Key->l4.tpDst = htons(icmp->code); + } if (icmp->code == 0 && (icmp->type == ND_NEIGHBOR_SOLICIT || @@ -261,7 +274,7 @@ OvsParseIcmpV6(const NET_BUFFER_LIST *packet, if (!ndTarget) { return NDIS_STATUS_FAILURE; } - flow->ndTarget = *ndTarget; + icmp6Key->ndTarget = *ndTarget; while ((UINT32)(ofs + 8) <= OvsPacketLenNBL(packet)) { /* @@ -288,14 +301,14 @@ OvsParseIcmpV6(const NET_BUFFER_LIST *packet, * layer option is specified twice. 
*/ if (ndOpt->type == ND_OPT_SOURCE_LINKADDR && optLen == 8) { - if (Eth_IsNullAddr(flow->arpSha)) { - memcpy(flow->arpSha, ndOpt + 1, ETH_ADDR_LENGTH); + if (Eth_IsNullAddr(icmp6Key->arpSha)) { + memcpy(icmp6Key->arpSha, ndOpt + 1, ETH_ADDR_LENGTH); } else { goto invalid; } } else if (ndOpt->type == ND_OPT_TARGET_LINKADDR && optLen == 8) { - if (Eth_IsNullAddr(flow->arpTha)) { - memcpy(flow->arpTha, ndOpt + 1, ETH_ADDR_LENGTH); + if (Eth_IsNullAddr(icmp6Key->arpTha)) { + memcpy(icmp6Key->arpTha, ndOpt + 1, ETH_ADDR_LENGTH); } else { goto invalid; } @@ -309,9 +322,9 @@ OvsParseIcmpV6(const NET_BUFFER_LIST *packet, return NDIS_STATUS_SUCCESS; invalid: - memset(&flow->ndTarget, 0, sizeof(flow->ndTarget)); - memset(flow->arpSha, 0, sizeof(flow->arpSha)); - memset(flow->arpTha, 0, sizeof(flow->arpTha)); + RtlZeroMemory(&icmp6Key->ndTarget, sizeof(icmp6Key->ndTarget)); + RtlZeroMemory(icmp6Key->arpSha, sizeof(icmp6Key->arpSha)); + RtlZeroMemory(icmp6Key->arpTha, sizeof(icmp6Key->arpTha)); return NDIS_STATUS_FAILURE; } diff --git a/datapath-windows/ovsext/PacketParser.h b/datapath-windows/ovsext/PacketParser.h index 47d227f59..f1d7f283d 100644 --- a/datapath-windows/ovsext/PacketParser.h +++ b/datapath-windows/ovsext/PacketParser.h @@ -22,7 +22,7 @@ const VOID* OvsGetPacketBytes(const NET_BUFFER_LIST *_pNB, UINT32 len, UINT32 SrcOffset, VOID *storage); -NDIS_STATUS OvsParseIPv6(const NET_BUFFER_LIST *packet, OvsFlowKey *key, +NDIS_STATUS OvsParseIPv6(const NET_BUFFER_LIST *packet, Ipv6Key *key, POVS_PACKET_HDR_INFO layers); VOID OvsParseTcp(const NET_BUFFER_LIST *packet, L4Key *flow, POVS_PACKET_HDR_INFO layers); @@ -30,8 +30,10 @@ VOID OvsParseUdp(const NET_BUFFER_LIST *packet, L4Key *flow, POVS_PACKET_HDR_INFO layers); VOID OvsParseSctp(const NET_BUFFER_LIST *packet, L4Key *flow, POVS_PACKET_HDR_INFO layers); -NDIS_STATUS OvsParseIcmpV6(const NET_BUFFER_LIST *packet, OvsFlowKey *key, - POVS_PACKET_HDR_INFO layers); +NDIS_STATUS OvsParseIcmpV6(const NET_BUFFER_LIST 
*packet, + Ipv6Key *ipv6Key, + Icmp6Key *flow, + POVS_PACKET_HDR_INFO layers); static __inline ULONG OvsPacketLenNBL(const NET_BUFFER_LIST *_pNB) diff --git a/datapath-windows/ovsext/Stt.c b/datapath-windows/ovsext/Stt.c index dd7bf9279..c93db7548 100644 --- a/datapath-windows/ovsext/Stt.c +++ b/datapath-windows/ovsext/Stt.c @@ -194,7 +194,7 @@ OvsDoEncapStt(POVS_VPORT_ENTRY vport, if (layers->isIPv4) { IPHdr *ip = (IPHdr *)(bufferStart + layers->l3Offset); if (!ip->tot_len) { - ip->tot_len = htons(innerFrameLen - sizeof(EthHdr)); + ip->tot_len = htons(innerFrameLen - layers->l3Offset); } if (!ip->check) { ip->check = IPChecksum((UINT8 *)ip, ip->ihl * 4, 0); @@ -231,8 +231,8 @@ OvsDoEncapStt(POVS_VPORT_ENTRY vport, * memory. */ curMdl = NET_BUFFER_CURRENT_MDL(curNb); - ASSERT((int) (MmGetMdlByteCount(curMdl) - NET_BUFFER_CURRENT_MDL_OFFSET(curNb)) - >= (int) headRoom); + ASSERT((int) (MmGetMdlByteCount(curMdl) - + NET_BUFFER_CURRENT_MDL_OFFSET(curNb)) >= (int) headRoom); buf = (PUINT8) MmGetSystemAddressForMdlSafe(curMdl, LowPagePriority); if (!buf) { @@ -288,12 +288,12 @@ OvsDoEncapStt(POVS_VPORT_ENTRY vport, /* Calculate pseudo header chksum */ tcpChksumLen = sizeof(TCPHdr) + STT_HDR_LEN + innerFrameLen; ASSERT(tcpChksumLen < 65535); - outerTcpHdr->check = IPPseudoChecksum(&fwdInfo->srcIpAddr,(uint32 *) &tunKey->dst, - IPPROTO_TCP, (uint16) tcpChksumLen); sttHdr->version = 0; /* Set STT Header */ sttHdr->flags = 0; + sttHdr->mss = 0; + sttHdr->l4Offset = 0; if (innerPartialChecksum) { sttHdr->flags |= STT_CSUM_PARTIAL; if (layers->isIPv4) { @@ -327,8 +327,22 @@ OvsDoEncapStt(POVS_VPORT_ENTRY vport, NET_BUFFER_LIST_INFO(curNbl, TcpIpChecksumNetBufferListInfo) = csumInfo.Value; - UINT32 encapMss = OvsGetExternalMtu(switchContext) - sizeof(IPHdr) - sizeof(TCPHdr); + UINT32 encapMss = OvsGetExternalMtu(switchContext) + - sizeof(IPHdr) + - sizeof(TCPHdr); if (ipTotalLen > encapMss) { + /* For Windows LSO, the TCP pseudo checksum must contain Source IP + * Address, 
Destination IP Address, and Protocol; the length of the + * payload is excluded because the underlying miniport driver and NIC + * generate TCP segments from the large packet that is passed down by + * the TCP/IP transport, the transport does not know the size of the + * TCP payload for each TCP segment and therefore cannot include the + * TCP Length in the pseudo-header. + */ + outerTcpHdr->check = IPPseudoChecksum(&fwdInfo->srcIpAddr, + (uint32 *) &tunKey->dst, + IPPROTO_TCP, (uint16) 0); + lsoInfo.Value = 0; lsoInfo.LsoV2Transmit.TcpHeaderOffset = tcpHeaderOffset; lsoInfo.LsoV2Transmit.MSS = encapMss; @@ -336,6 +350,11 @@ OvsDoEncapStt(POVS_VPORT_ENTRY vport, lsoInfo.LsoV2Transmit.IPVersion = NDIS_TCP_LARGE_SEND_OFFLOAD_IPv4; NET_BUFFER_LIST_INFO(curNbl, TcpLargeSendNetBufferListInfo) = lsoInfo.Value; + } else { + outerTcpHdr->check = IPPseudoChecksum(&fwdInfo->srcIpAddr, + (uint32 *) &tunKey->dst, + IPPROTO_TCP, + (uint16) tcpChksumLen); } return STATUS_SUCCESS; @@ -655,7 +674,8 @@ handle_error: if (lastPacket) { /* Retrieve the original STT header */ NdisMoveMemory(newSttHdr, &pktFragEntry->sttHdr, sizeof (SttHdr)); - targetPNbl = OvsAllocateNBLFromBuffer(switchContext, pktFragEntry->packetBuf, + targetPNbl = OvsAllocateNBLFromBuffer(switchContext, + pktFragEntry->packetBuf, innerPacketLen); /* Delete this entry and free up the memory/ */ @@ -668,16 +688,32 @@ handle_error: return lastPacket ? targetPNbl : NULL; } -VOID -OvsDecapSetOffloads(PNET_BUFFER_LIST curNbl, SttHdr *sttHdr) + +/* +*---------------------------------------------------------------------------- +* OvsDecapSetOffloads +* Processes received STT header and sets TcpIpChecksumNetBufferListInfo +* accordingly. +* For TCP packets with total length bigger than destination MSS it +* populates TcpLargeSendNetBufferListInfo. +* +* Returns NDIS_STATUS_SUCCESS normally. +* Fails only if packet data is invalid. +* (e.g. if OvsExtractLayers() returns an error). 
+*---------------------------------------------------------------------------- +*/ +NDIS_STATUS +OvsDecapSetOffloads(PNET_BUFFER_LIST *curNbl, SttHdr *sttHdr) { if ((sttHdr->flags & STT_CSUM_VERIFIED) || !(sttHdr->flags & STT_CSUM_PARTIAL)) { - return; + return NDIS_STATUS_SUCCESS; } - UINT8 protoType; + NDIS_STATUS status; NDIS_TCP_IP_CHECKSUM_NET_BUFFER_LIST_INFO csumInfo; + UINT8 protoType; + csumInfo.Value = 0; csumInfo.Transmit.IpHeaderChecksum = 0; csumInfo.Transmit.TcpHeaderOffset = sttHdr->l4Offset; @@ -703,25 +739,66 @@ OvsDecapSetOffloads(PNET_BUFFER_LIST curNbl, SttHdr *sttHdr) csumInfo.Transmit.IsIPv6 = 1; csumInfo.Transmit.UdpChecksum = 1; } - NET_BUFFER_LIST_INFO(curNbl, + NET_BUFFER_LIST_INFO(*curNbl, TcpIpChecksumNetBufferListInfo) = csumInfo.Value; - if (sttHdr->mss) { + if (sttHdr->mss && (sttHdr->flags & STT_PROTO_TCP)) { NDIS_TCP_LARGE_SEND_OFFLOAD_NET_BUFFER_LIST_INFO lsoInfo; + PMDL curMdl = NULL; + PNET_BUFFER curNb; + PUINT8 buf = NULL; + OVS_PACKET_HDR_INFO layers; + + status = OvsExtractLayers(*curNbl, &layers); + if (status != NDIS_STATUS_SUCCESS) { + return status; + } + + curNb = NET_BUFFER_LIST_FIRST_NB(*curNbl); + curMdl = NET_BUFFER_CURRENT_MDL(curNb); + + buf = (PUINT8)MmGetSystemAddressForMdlSafe(curMdl, + LowPagePriority); + buf += NET_BUFFER_CURRENT_MDL_OFFSET(curNb); + + // apply pseudo checksum on extracted packet + if (sttHdr->flags & STT_PROTO_IPV4) { + IPHdr *ipHdr; + TCPHdr *tcpHdr; + + ipHdr = (IPHdr *)(buf + layers.l3Offset); + tcpHdr = (TCPHdr *)(buf + layers.l4Offset); + + tcpHdr->check = IPPseudoChecksum(&ipHdr->saddr, + (uint32 *)&ipHdr->daddr, + IPPROTO_TCP, 0); + } else { + IPv6Hdr *ipHdr; + TCPHdr *tcpHdr; + + ipHdr = (IPv6Hdr *)(buf + layers.l3Offset); + tcpHdr = (TCPHdr *)(buf + layers.l4Offset); + + tcpHdr->check = IPv6PseudoChecksum((UINT32*)&ipHdr->saddr, + (UINT32*)&ipHdr->daddr, + IPPROTO_TCP, 0); + } + + // setup LSO lsoInfo.Value = 0; lsoInfo.LsoV2Transmit.TcpHeaderOffset = sttHdr->l4Offset; - 
lsoInfo.LsoV2Transmit.MSS = ETH_DEFAULT_MTU - - sizeof(IPHdr) - - sizeof(TCPHdr); + lsoInfo.LsoV2Transmit.MSS = ntohs(sttHdr->mss); lsoInfo.LsoV2Transmit.Type = NDIS_TCP_LARGE_SEND_OFFLOAD_V2_TYPE; if (sttHdr->flags & STT_PROTO_IPV4) { lsoInfo.LsoV2Transmit.IPVersion = NDIS_TCP_LARGE_SEND_OFFLOAD_IPv4; } else { lsoInfo.LsoV2Transmit.IPVersion = NDIS_TCP_LARGE_SEND_OFFLOAD_IPv6; } - NET_BUFFER_LIST_INFO(curNbl, + NET_BUFFER_LIST_INFO(*curNbl, TcpLargeSendNetBufferListInfo) = lsoInfo.Value; } + + return NDIS_STATUS_SUCCESS; } /* @@ -736,15 +813,14 @@ OvsDecapStt(POVS_SWITCH_CONTEXT switchContext, OvsIPv4TunnelKey *tunKey, PNET_BUFFER_LIST *newNbl) { - NDIS_STATUS status = NDIS_STATUS_FAILURE; - PNET_BUFFER curNb, newNb; + NDIS_STATUS status; + PNET_BUFFER curNb; IPHdr *ipHdr; char *ipBuf[sizeof(IPHdr)]; SttHdr stt; SttHdr *sttHdr; char *sttBuf[STT_HDR_LEN]; UINT32 advanceCnt, hdrLen; - BOOLEAN isLsoPacket = FALSE; curNb = NET_BUFFER_LIST_FIRST_NB(curNbl); ASSERT(NET_BUFFER_NEXT_NB(curNb) == NULL); @@ -767,7 +843,7 @@ OvsDecapStt(POVS_SWITCH_CONTEXT switchContext, TCPHdr *tcp = (TCPHdr *)((PCHAR)ipHdr + ipHdr->ihl * 4); /* Skip IP & TCP headers */ - hdrLen = sizeof(IPHdr) + sizeof(TCPHdr), + hdrLen = (ipHdr->ihl * 4) + (tcp->doff * 4); NdisAdvanceNetBufferDataStart(curNb, hdrLen, FALSE, NULL); advanceCnt += hdrLen; @@ -775,7 +851,7 @@ OvsDecapStt(POVS_SWITCH_CONTEXT switchContext, UINT32 totalLen = (seq >> STT_SEQ_LEN_SHIFT); UINT16 payloadLen = (UINT16)ntohs(ipHdr->tot_len) - (ipHdr->ihl * 4) - - (sizeof * tcp); + - (tcp->doff * 4); /* Check if incoming packet requires reassembly */ if (totalLen != payloadLen) { @@ -788,7 +864,6 @@ OvsDecapStt(POVS_SWITCH_CONTEXT switchContext, } *newNbl = pNbl; - isLsoPacket = TRUE; } else { /* STT Header */ sttHdr = NdisGetDataBuffer(curNb, sizeof *sttHdr, @@ -812,7 +887,6 @@ OvsDecapStt(POVS_SWITCH_CONTEXT switchContext, OvsCompleteNBL(switchContext, *newNbl, TRUE); return NDIS_STATUS_FAILURE; } - newNb = 
NET_BUFFER_LIST_FIRST_NB(*newNbl); ASSERT(sttHdr); @@ -826,7 +900,7 @@ OvsDecapStt(POVS_SWITCH_CONTEXT switchContext, tunKey->pad = 0; /* Set Checksum and LSO offload flags */ - OvsDecapSetOffloads(*newNbl, sttHdr); + OvsDecapSetOffloads(newNbl, sttHdr); return NDIS_STATUS_SUCCESS; } diff --git a/datapath-windows/ovsext/User.c b/datapath-windows/ovsext/User.c index 92a71e171..c7ac28456 100644 --- a/datapath-windows/ovsext/User.c +++ b/datapath-windows/ovsext/User.c @@ -768,7 +768,8 @@ OvsCreateAndAddPackets(PVOID userData, NDIS_TCP_LARGE_SEND_OFFLOAD_NET_BUFFER_LIST_INFO tsoInfo; UINT32 packetLength; - tsoInfo.Value = NET_BUFFER_LIST_INFO(nbl, TcpLargeSendNetBufferListInfo); + tsoInfo.Value = NET_BUFFER_LIST_INFO(nbl, + TcpLargeSendNetBufferListInfo); nb = NET_BUFFER_LIST_FIRST_NB(nbl); packetLength = NET_BUFFER_DATA_LENGTH(nb); @@ -870,7 +871,8 @@ OvsCompletePacketHeader(UINT8 *packet, (UINT32 *)&ipHdr->DestinationAddress, IPPROTO_TCP, hdrInfoOut->l4PayLoad); } else { - PIPV6_HEADER ipv6Hdr = (PIPV6_HEADER)(packet + hdrInfoIn->l3Offset); + PIPV6_HEADER ipv6Hdr = (PIPV6_HEADER)(packet + + hdrInfoIn->l3Offset); hdrInfoOut->l4PayLoad = (UINT16)(ntohs(ipv6Hdr->PayloadLength) + hdrInfoIn->l3Offset + sizeof(IPV6_HEADER)- @@ -884,9 +886,9 @@ OvsCompletePacketHeader(UINT8 *packet, hdrInfoOut->tcpCsumNeeded = 1; ovsUserStats.recalTcpCsum++; } else if (!isRecv) { - if (csumInfo.Transmit.TcpChecksum) { + if (hdrInfoIn->isTcp && csumInfo.Transmit.TcpChecksum) { hdrInfoOut->tcpCsumNeeded = 1; - } else if (csumInfo.Transmit.UdpChecksum) { + } else if (hdrInfoIn->isUdp && csumInfo.Transmit.UdpChecksum) { hdrInfoOut->udpCsumNeeded = 1; } if (hdrInfoOut->tcpCsumNeeded || hdrInfoOut->udpCsumNeeded) { @@ -896,7 +898,8 @@ OvsCompletePacketHeader(UINT8 *packet, hdrInfoOut->tcpCsumNeeded ? 
IPPROTO_TCP : IPPROTO_UDP; #endif if (hdrInfoIn->isIPv4) { - PIPV4_HEADER ipHdr = (PIPV4_HEADER)(packet + hdrInfoIn->l3Offset); + PIPV4_HEADER ipHdr = (PIPV4_HEADER)(packet + + hdrInfoIn->l3Offset); hdrInfoOut->l4PayLoad = (UINT16)(ntohs(ipHdr->TotalLength) - (ipHdr->HeaderLength << 2)); #ifdef DBG @@ -1004,8 +1007,8 @@ OvsCreateQueueNlPacket(PVOID userData, csumInfo.Value = NET_BUFFER_LIST_INFO(nbl, TcpIpChecksumNetBufferListInfo); if (isRecv && (csumInfo.Receive.TcpChecksumFailed || - (csumInfo.Receive.UdpChecksumFailed && !hdrInfo->udpCsumZero) || - csumInfo.Receive.IpChecksumFailed)) { + (csumInfo.Receive.UdpChecksumFailed && !hdrInfo->udpCsumZero) || + csumInfo.Receive.IpChecksumFailed)) { OVS_LOG_INFO("Packet dropped due to checksum failure."); ovsUserStats.dropDuetoChecksum++; return NULL; -- 2.39.2