2 * Copyright (c) 2014, 2016 VMware, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
26 #include "PacketParser.h"
32 #pragma warning( push )
33 #pragma warning( disable:4127 )
39 #define OVS_DBG_MOD OVS_DBG_VXLAN
/*
 * Helper macro to check if a VXLAN ID is valid: it must be non-zero and fit
 * in 24 bits (at most 0xffffff).
 * NOTE: 'vxlanID' is evaluated twice; do not pass an expression with side
 * effects.
 */
#define VXLAN_ID_IS_VALID(vxlanID) (0 < (vxlanID) && (vxlanID) <= 0xffffff)

/*
 * Conversions between the 64-bit tunnel ID and the VNI. The VNI is kept in
 * the bits of the tunnel ID above bit 40. Both expansions are fully
 * parenthesized so they behave as a single primary expression wherever they
 * are used.
 */
#define VXLAN_TUNNELID_TO_VNI(_tID) ((UINT32)(((UINT64)(_tID)) >> 40))
#define VXLAN_VNI_TO_TUNNELID(_vni) (((UINT64)(_vni)) << 40)

/* NOTE(review): presumably the IPv4 "don't fragment" bit as it appears in a
 * network-byte-order frag_off field -- confirm against the IP header use. */
#define IP_DF_NBO 0x0040

/* TTL values and the default VXLAN instance ID. */
#define VXLAN_DEFAULT_TTL 64
#define VXLAN_MULTICAST_TTL 64
#define VXLAN_DEFAULT_INSTANCE_ID 1
50 /* Move to a header file */
51 extern POVS_SWITCH_CONTEXT gOvsSwitchContext
;
54 *----------------------------------------------------------------------------
55 * This function verifies if the VXLAN tunnel already exists, in order to
56 * avoid sending a duplicate request to the WFP base filtering engine.
57 *----------------------------------------------------------------------------
60 OvsIsTunnelFilterCreated(POVS_SWITCH_CONTEXT switchContext
,
63 for (UINT hash
= 0; hash
< OVS_MAX_VPORT_ARRAY_SIZE
; hash
++) {
64 PLIST_ENTRY head
, link
, next
;
66 head
= &(switchContext
->portNoHashArray
[hash
& OVS_VPORT_MASK
]);
67 LIST_FORALL_SAFE(head
, link
, next
) {
68 POVS_VPORT_ENTRY vport
= NULL
;
69 POVS_VXLAN_VPORT vxlanPort
= NULL
;
70 vport
= CONTAINING_RECORD(link
, OVS_VPORT_ENTRY
, portNoLink
);
71 vxlanPort
= (POVS_VXLAN_VPORT
)vport
->priv
;
73 if ((udpPortDest
== vxlanPort
->dstPort
)) {
74 /* The VXLAN tunnel was already created. */
85 *----------------------------------------------------------------------------
86 * This function allocates and initializes the OVS_VXLAN_VPORT. The function
87 * also creates a WFP tunnel filter for the necessary destination port. The
88 * tunnel filter create request is passed to the tunnel filter threads that
89 * will complete the request at a later time when IRQL is lowered to
92 * udpDestPort: VXLAN is carried as the payload of a UDP frame. If the destination
93 * port of a UDP frame is udpDestPort, the frame is treated as VXLAN.
94 *----------------------------------------------------------------------------
97 OvsInitVxlanTunnel(PIRP irp
,
98 POVS_VPORT_ENTRY vport
,
100 PFNTunnelVportPendingOp callback
,
103 NTSTATUS status
= STATUS_SUCCESS
;
104 POVS_VXLAN_VPORT vxlanPort
= NULL
;
106 vxlanPort
= OvsAllocateMemoryWithTag(sizeof (*vxlanPort
),
108 if (vxlanPort
== NULL
) {
109 return STATUS_INSUFFICIENT_RESOURCES
;
112 RtlZeroMemory(vxlanPort
, sizeof(*vxlanPort
));
113 vxlanPort
->dstPort
= udpDestPort
;
114 vport
->priv
= (PVOID
)vxlanPort
;
116 if (!OvsIsTunnelFilterCreated(gOvsSwitchContext
, udpDestPort
)) {
117 status
= OvsTunnelFilterCreate(irp
,
119 &vxlanPort
->filterID
,
123 status
= STATUS_OBJECT_NAME_EXISTS
;
130 *----------------------------------------------------------------------------
131 * This function releases the OVS_VXLAN_VPORT. The function also deletes the
132 * WFP tunnel filter previously created. The tunnel filter delete request is
133 * passed to the tunnel filter threads that will complete the request at a
134 * later time when IRQL is lowered to PASSIVE_LEVEL.
135 *----------------------------------------------------------------------------
138 OvsCleanupVxlanTunnel(PIRP irp
,
139 POVS_VPORT_ENTRY vport
,
140 PFNTunnelVportPendingOp callback
,
143 NTSTATUS status
= STATUS_SUCCESS
;
144 POVS_VXLAN_VPORT vxlanPort
= NULL
;
146 if (vport
->ovsType
!= OVS_VPORT_TYPE_VXLAN
||
147 vport
->priv
== NULL
) {
148 return STATUS_SUCCESS
;
151 vxlanPort
= (POVS_VXLAN_VPORT
)vport
->priv
;
153 if (vxlanPort
->filterID
!= 0) {
154 status
= OvsTunnelFilterDelete(irp
,
159 OvsFreeMemoryWithTag(vport
->priv
, OVS_VXLAN_POOL_TAG
);
168 *----------------------------------------------------------------------------
170 * Encapsulates the packet.
171 *----------------------------------------------------------------------------
173 static __inline NDIS_STATUS
174 OvsDoEncapVxlan(POVS_VPORT_ENTRY vport
,
175 PNET_BUFFER_LIST curNbl
,
176 OvsIPv4TunnelKey
*tunKey
,
177 POVS_FWD_INFO fwdInfo
,
178 POVS_PACKET_HDR_INFO layers
,
179 POVS_SWITCH_CONTEXT switchContext
,
180 PNET_BUFFER_LIST
*newNbl
)
190 POVS_VXLAN_VPORT vportVxlan
;
191 UINT32 headRoom
= OvsGetVxlanTunHdrSize();
194 NDIS_TCP_IP_CHECKSUM_NET_BUFFER_LIST_INFO csumInfo
;
196 curNb
= NET_BUFFER_LIST_FIRST_NB(curNbl
);
197 packetLength
= NET_BUFFER_DATA_LENGTH(curNb
);
200 mss
= OVSGetTcpMSS(curNbl
);
202 OVS_LOG_TRACE("MSS %u packet len %u", mss
,
205 OVS_LOG_TRACE("l4Offset %d", layers
->l4Offset
);
206 *newNbl
= OvsTcpSegmentNBL(switchContext
, curNbl
, layers
,
207 mss
, headRoom
, FALSE
);
208 if (*newNbl
== NULL
) {
209 OVS_LOG_ERROR("Unable to segment NBL");
210 return NDIS_STATUS_FAILURE
;
212 /* Clear out LSO flags after this point */
213 NET_BUFFER_LIST_INFO(*newNbl
, TcpLargeSendNetBufferListInfo
) = 0;
217 vportVxlan
= (POVS_VXLAN_VPORT
) GetOvsVportPriv(vport
);
220 /* If we didn't split the packet above, make a copy now */
221 if (*newNbl
== NULL
) {
222 *newNbl
= OvsPartialCopyNBL(switchContext
, curNbl
, 0, headRoom
,
224 if (*newNbl
== NULL
) {
225 OVS_LOG_ERROR("Unable to copy NBL");
226 return NDIS_STATUS_FAILURE
;
228 csumInfo
.Value
= NET_BUFFER_LIST_INFO(curNbl
,
229 TcpIpChecksumNetBufferListInfo
);
230 status
= OvsApplySWChecksumOnNB(layers
, *newNbl
, &csumInfo
);
232 if (status
!= NDIS_STATUS_SUCCESS
) {
238 for (curNb
= NET_BUFFER_LIST_FIRST_NB(curNbl
); curNb
!= NULL
;
239 curNb
= curNb
->Next
) {
240 status
= NdisRetreatNetBufferDataStart(curNb
, headRoom
, 0, NULL
);
241 if (status
!= NDIS_STATUS_SUCCESS
) {
245 curMdl
= NET_BUFFER_CURRENT_MDL(curNb
);
246 bufferStart
= (PUINT8
)OvsGetMdlWithLowPriority(curMdl
);
248 status
= NDIS_STATUS_RESOURCES
;
252 bufferStart
+= NET_BUFFER_CURRENT_MDL_OFFSET(curNb
);
253 if (NET_BUFFER_NEXT_NB(curNb
)) {
254 OVS_LOG_TRACE("nb length %u next %u",
255 NET_BUFFER_DATA_LENGTH(curNb
),
256 NET_BUFFER_DATA_LENGTH(curNb
->Next
));
260 ethHdr
= (EthHdr
*)bufferStart
;
261 NdisMoveMemory(ethHdr
->Destination
, fwdInfo
->dstMacAddr
,
262 sizeof ethHdr
->Destination
);
263 NdisMoveMemory(ethHdr
->Source
, fwdInfo
->srcMacAddr
,
264 sizeof ethHdr
->Source
);
265 ethHdr
->Type
= htons(ETH_TYPE_IPV4
);
268 ipHdr
= (IPHdr
*)((PCHAR
)ethHdr
+ sizeof *ethHdr
);
270 ipHdr
->ihl
= sizeof *ipHdr
/ 4;
271 ipHdr
->version
= IPPROTO_IPV4
;
272 ipHdr
->tos
= tunKey
->tos
;
273 ipHdr
->tot_len
= htons(NET_BUFFER_DATA_LENGTH(curNb
) - sizeof *ethHdr
);
274 ipHdr
->id
= (uint16
)atomic_add64(&vportVxlan
->ipId
,
275 NET_BUFFER_DATA_LENGTH(curNb
));
276 ipHdr
->frag_off
= (tunKey
->flags
& OVS_TNL_F_DONT_FRAGMENT
) ?
278 ipHdr
->ttl
= tunKey
->ttl
? tunKey
->ttl
: VXLAN_DEFAULT_TTL
;
279 ipHdr
->protocol
= IPPROTO_UDP
;
280 ASSERT(tunKey
->dst
== fwdInfo
->dstIpAddr
);
281 ASSERT(tunKey
->src
== fwdInfo
->srcIpAddr
|| tunKey
->src
== 0);
282 ipHdr
->saddr
= fwdInfo
->srcIpAddr
;
283 ipHdr
->daddr
= fwdInfo
->dstIpAddr
;
288 udpHdr
= (UDPHdr
*)((PCHAR
)ipHdr
+ sizeof *ipHdr
);
289 udpHdr
->source
= htons(tunKey
->flow_hash
| MAXINT16
);
290 udpHdr
->dest
= tunKey
->dst_port
? tunKey
->dst_port
:
291 htons(vportVxlan
->dstPort
);
292 udpHdr
->len
= htons(NET_BUFFER_DATA_LENGTH(curNb
) - headRoom
+
293 sizeof *udpHdr
+ sizeof *vxlanHdr
);
295 if (tunKey
->flags
& OVS_TNL_F_CSUM
) {
296 udpHdr
->check
= IPPseudoChecksum(&ipHdr
->saddr
, &ipHdr
->daddr
,
297 IPPROTO_UDP
, ntohs(udpHdr
->len
));
303 vxlanHdr
= (VXLANHdr
*)((PCHAR
)udpHdr
+ sizeof *udpHdr
);
304 vxlanHdr
->flags1
= 0;
305 vxlanHdr
->locallyReplicate
= 0;
306 vxlanHdr
->flags2
= 0;
307 vxlanHdr
->reserved1
= 0;
308 vxlanHdr
->vxlanID
= VXLAN_TUNNELID_TO_VNI(tunKey
->tunnelId
);
309 vxlanHdr
->instanceID
= 1;
310 vxlanHdr
->reserved2
= 0;
314 csumInfo
.Transmit
.IpHeaderChecksum
= 1;
315 csumInfo
.Transmit
.IsIPv4
= 1;
316 if (tunKey
->flags
& OVS_TNL_F_CSUM
) {
317 csumInfo
.Transmit
.UdpChecksum
= 1;
319 NET_BUFFER_LIST_INFO(curNbl
,
320 TcpIpChecksumNetBufferListInfo
) = csumInfo
.Value
;
323 return STATUS_SUCCESS
;
326 OvsCompleteNBL(switchContext
, *newNbl
, TRUE
);
333 *----------------------------------------------------------------------------
335 * Encapsulates the packet if L2/L3 for destination resolves. Otherwise,
336 * enqueues a callback that does encapsulation after resolution.
337 *----------------------------------------------------------------------------
340 OvsEncapVxlan(POVS_VPORT_ENTRY vport
,
341 PNET_BUFFER_LIST curNbl
,
342 OvsIPv4TunnelKey
*tunKey
,
343 POVS_SWITCH_CONTEXT switchContext
,
344 POVS_PACKET_HDR_INFO layers
,
345 PNET_BUFFER_LIST
*newNbl
,
346 POVS_FWD_INFO switchFwdInfo
)
349 OVS_FWD_INFO fwdInfo
;
351 status
= OvsLookupIPFwdInfo(tunKey
->src
, tunKey
->dst
, &fwdInfo
);
352 if (status
!= STATUS_SUCCESS
) {
353 OvsFwdIPHelperRequest(NULL
, 0, tunKey
, NULL
, NULL
, NULL
);
355 * XXX: Don't know if the completionList will make any sense when
356 * accessed in the callback. Make sure the caveats are known.
358 * XXX: This code will work once we are able to grab locks in the
361 return NDIS_STATUS_FAILURE
;
364 RtlCopyMemory(switchFwdInfo
->value
, fwdInfo
.value
, sizeof fwdInfo
.value
);
366 return OvsDoEncapVxlan(vport
, curNbl
, tunKey
, &fwdInfo
, layers
,
367 switchContext
, newNbl
);
372 *----------------------------------------------------------------------------
374 * Decapsulates the tunnel header in 'curNbl' and stores the tunnel fields in 'tunKey'.
375 *----------------------------------------------------------------------------
378 OvsDecapVxlan(POVS_SWITCH_CONTEXT switchContext
,
379 PNET_BUFFER_LIST curNbl
,
380 OvsIPv4TunnelKey
*tunKey
,
381 PNET_BUFFER_LIST
*newNbl
)
389 UINT32 tunnelSize
, packetLength
;
392 OVS_PACKET_HDR_INFO layers
= { 0 };
394 status
= OvsExtractLayers(curNbl
, &layers
);
395 if (status
!= NDIS_STATUS_SUCCESS
) {
399 /* Check the length of the UDP payload */
400 curNb
= NET_BUFFER_LIST_FIRST_NB(curNbl
);
401 packetLength
= NET_BUFFER_DATA_LENGTH(curNb
);
402 tunnelSize
= OvsGetVxlanTunHdrSizeFromLayers(&layers
);
403 if (packetLength
< tunnelSize
) {
404 return NDIS_STATUS_INVALID_LENGTH
;
408 * Create a copy of the NBL so that we have all the headers in one MDL.
410 *newNbl
= OvsPartialCopyNBL(switchContext
, curNbl
,
412 TRUE
/*copy NBL info */);
414 if (*newNbl
== NULL
) {
415 return NDIS_STATUS_RESOURCES
;
418 /* XXX: Handle VLAN header. */
420 curNb
= NET_BUFFER_LIST_FIRST_NB(curNbl
);
421 curMdl
= NET_BUFFER_CURRENT_MDL(curNb
);
422 bufferStart
= (PUINT8
)OvsGetMdlWithLowPriority(curMdl
)
423 + NET_BUFFER_CURRENT_MDL_OFFSET(curNb
);
425 status
= NDIS_STATUS_RESOURCES
;
429 ethHdr
= (EthHdr
*)bufferStart
;
430 /* XXX: Handle IP options. */
431 ipHdr
= (IPHdr
*)(bufferStart
+ layers
.l3Offset
);
432 tunKey
->src
= ipHdr
->saddr
;
433 tunKey
->dst
= ipHdr
->daddr
;
434 tunKey
->tos
= ipHdr
->tos
;
435 tunKey
->ttl
= ipHdr
->ttl
;
437 udpHdr
= (UDPHdr
*)(bufferStart
+ layers
.l4Offset
);
439 /* Validate if NIC has indicated checksum failure. */
440 status
= OvsValidateUDPChecksum(curNbl
, udpHdr
->check
== 0);
441 if (status
!= NDIS_STATUS_SUCCESS
) {
445 /* Calculate and verify UDP checksum if NIC didn't do it. */
446 if (udpHdr
->check
!= 0) {
447 tunKey
->flags
|= OVS_TNL_F_CSUM
;
448 status
= OvsCalculateUDPChecksum(curNbl
, curNb
, ipHdr
, udpHdr
,
449 packetLength
, &layers
);
450 if (status
!= NDIS_STATUS_SUCCESS
) {
455 vxlanHdr
= (VXLANHdr
*)((PCHAR
)udpHdr
+ sizeof *udpHdr
);
456 if (vxlanHdr
->instanceID
) {
457 tunKey
->flags
|= OVS_TNL_F_KEY
;
458 tunKey
->tunnelId
= VXLAN_VNI_TO_TUNNELID(vxlanHdr
->vxlanID
);
460 tunKey
->flags
&= ~OVS_TNL_F_KEY
;
461 tunKey
->tunnelId
= 0;
464 /* Clear out the receive flag for the inner packet. */
465 NET_BUFFER_LIST_INFO(curNbl
, TcpIpChecksumNetBufferListInfo
) = 0;
466 NdisAdvanceNetBufferDataStart(curNb
, tunnelSize
, FALSE
, NULL
);
467 return NDIS_STATUS_SUCCESS
;
470 OvsCompleteNBL(switchContext
, *newNbl
, TRUE
);
477 OvsSlowPathDecapVxlan(const PNET_BUFFER_LIST packet
,
478 OvsIPv4TunnelKey
*tunnelKey
)
480 NDIS_STATUS status
= NDIS_STATUS_FAILURE
;
483 VXLANHdr
*VxlanHeader
;
484 VXLANHdr VxlanHeaderBuffer
;
485 struct IPHdr ip_storage
;
486 const struct IPHdr
*nh
;
487 OVS_PACKET_HDR_INFO layers
;
492 nh
= OvsGetIp(packet
, layers
.l3Offset
, &ip_storage
);
494 layers
.l4Offset
= layers
.l3Offset
+ nh
->ihl
* 4;
496 status
= NDIS_STATUS_INVALID_PACKET
;
500 /* make sure it's a VXLAN packet */
501 udp
= OvsGetUdp(packet
, layers
.l4Offset
, &udpStorage
);
503 layers
.l7Offset
= layers
.l4Offset
+ sizeof *udp
;
504 if (udp
->check
!= 0) {
505 tunnelKey
->flags
|= OVS_TNL_F_CSUM
;
511 VxlanHeader
= (VXLANHdr
*)OvsGetPacketBytes(packet
,
512 sizeof(*VxlanHeader
),
517 tunnelKey
->src
= nh
->saddr
;
518 tunnelKey
->dst
= nh
->daddr
;
519 tunnelKey
->ttl
= nh
->ttl
;
520 tunnelKey
->tos
= nh
->tos
;
521 if (VxlanHeader
->instanceID
) {
522 tunnelKey
->flags
|= OVS_TNL_F_KEY
;
523 tunnelKey
->tunnelId
= VXLAN_VNI_TO_TUNNELID(VxlanHeader
->vxlanID
);
525 tunnelKey
->flags
&= ~OVS_TNL_F_KEY
;
526 tunnelKey
->tunnelId
= 0;
531 status
= NDIS_STATUS_SUCCESS
;
538 #pragma warning( pop )