2 * Copyright (c) 2014 VMware, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
28 #include "PacketParser.h"
30 #pragma warning( push )
31 #pragma warning( disable:4127 )
37 #define OVS_DBG_MOD OVS_DBG_VXLAN
/*
 * Helper macro to check if a VXLAN ID (VNI) is valid: it must be non-zero
 * and fit in 24 bits. NOTE: the argument is evaluated twice, so do not pass
 * an expression with side effects.
 */
#define VXLAN_ID_IS_VALID(vxlanID) (0 < (vxlanID) && (vxlanID) <= 0xffffff)

/*
 * Conversions between the 64-bit tunnel ID and the VNI. The VNI is kept in
 * the top 24 bits of the tunnel ID, hence the shift by 40. Both expansions
 * are fully parenthesized so they compose safely with surrounding operators
 * (e.g. sizeof, postfix operators).
 */
#define VXLAN_TUNNELID_TO_VNI(_tID) ((UINT32)(((UINT64)(_tID)) >> 40))
#define VXLAN_VNI_TO_TUNNELID(_vni) (((UINT64)(_vni)) << 40)

/*
 * Value written as-is into the outer IP header's frag_off field. It is the
 * IPv4 "Don't Fragment" flag (0x4000) with its two bytes swapped, i.e. the
 * network-byte-order representation on a little-endian host.
 */
#define IP_DF_NBO 0x0040

/* TTL used for the outer IP header when the tunnel key carries no TTL. */
#define VXLAN_DEFAULT_TTL 64
/* TTL for multicast tunnel traffic (presumably; no user is visible here). */
#define VXLAN_MULTICAST_TTL 64
/* Default for the VXLAN header's instance-ID bit (presumably; TODO confirm). */
#define VXLAN_DEFAULT_INSTANCE_ID 1
49 /* XXX: Move this declaration to a header file. */
50 extern POVS_SWITCH_CONTEXT gOvsSwitchContext
;
53 *----------------------------------------------------------------------------
54 * This function verifies if the VXLAN tunnel already exists, in order to
55 * avoid sending a duplicate request to the WFP base filtering engine.
56 *----------------------------------------------------------------------------
59 OvsIsTunnelFilterCreated(POVS_SWITCH_CONTEXT switchContext
,
62 for (UINT hash
= 0; hash
< OVS_MAX_VPORT_ARRAY_SIZE
; hash
++) {
63 PLIST_ENTRY head
, link
, next
;
65 head
= &(switchContext
->portNoHashArray
[hash
& OVS_VPORT_MASK
]);
66 LIST_FORALL_SAFE(head
, link
, next
) {
67 POVS_VPORT_ENTRY vport
= NULL
;
68 POVS_VXLAN_VPORT vxlanPort
= NULL
;
69 vport
= CONTAINING_RECORD(link
, OVS_VPORT_ENTRY
, portNoLink
);
70 vxlanPort
= (POVS_VXLAN_VPORT
)vport
->priv
;
72 if ((udpPortDest
== vxlanPort
->dstPort
)) {
73 /* The VXLAN tunnel was already created. */
84 *----------------------------------------------------------------------------
85 * This function allocates and initializes the OVS_VXLAN_VPORT. The function
86 * also creates a WFP tunnel filter for the necessary destination port. The
87 * tunnel filter create request is passed to the tunnel filter threads that
88 * will complete the request at a later time when IRQL is lowered to
91 * udpDestPort: VXLAN is carried as the payload of a UDP frame. If the destination
92 * port of a UDP frame is udpDestPort, we treat the frame as VXLAN.
93 *----------------------------------------------------------------------------
96 OvsInitVxlanTunnel(PIRP irp
,
97 POVS_VPORT_ENTRY vport
,
99 PFNTunnelVportPendingOp callback
,
102 NTSTATUS status
= STATUS_SUCCESS
;
103 POVS_VXLAN_VPORT vxlanPort
= NULL
;
105 vxlanPort
= OvsAllocateMemoryWithTag(sizeof (*vxlanPort
),
107 if (vxlanPort
== NULL
) {
108 return STATUS_INSUFFICIENT_RESOURCES
;
111 RtlZeroMemory(vxlanPort
, sizeof(*vxlanPort
));
112 vxlanPort
->dstPort
= udpDestPort
;
113 vport
->priv
= (PVOID
)vxlanPort
;
115 if (!OvsIsTunnelFilterCreated(gOvsSwitchContext
, udpDestPort
)) {
116 status
= OvsTunelFilterCreate(irp
,
118 &vxlanPort
->filterID
,
122 status
= STATUS_OBJECT_NAME_EXISTS
;
129 *----------------------------------------------------------------------------
130 * This function releases the OVS_VXLAN_VPORT. The function also deletes the
131 * WFP tunnel filter previously created. The tunnel filter delete request is
132 * passed to the tunnel filter threads that will complete the request at a
133 * later time when IRQL is lowered to PASSIVE_LEVEL.
134 *----------------------------------------------------------------------------
137 OvsCleanupVxlanTunnel(PIRP irp
,
138 POVS_VPORT_ENTRY vport
,
139 PFNTunnelVportPendingOp callback
,
142 NTSTATUS status
= STATUS_SUCCESS
;
143 POVS_VXLAN_VPORT vxlanPort
= NULL
;
145 if (vport
->ovsType
!= OVS_VPORT_TYPE_VXLAN
||
146 vport
->priv
== NULL
) {
147 return STATUS_SUCCESS
;
150 vxlanPort
= (POVS_VXLAN_VPORT
)vport
->priv
;
152 if (vxlanPort
->filterID
!= 0) {
153 status
= OvsTunelFilterDelete(irp
,
159 OvsFreeMemoryWithTag(vport
->priv
, OVS_VXLAN_POOL_TAG
);
167 *----------------------------------------------------------------------------
169 * Encapsulates the packet.
170 *----------------------------------------------------------------------------
172 static __inline NDIS_STATUS
173 OvsDoEncapVxlan(PNET_BUFFER_LIST curNbl
,
174 OvsIPv4TunnelKey
*tunKey
,
175 POVS_FWD_INFO fwdInfo
,
176 POVS_PACKET_HDR_INFO layers
,
177 POVS_SWITCH_CONTEXT switchContext
,
178 PNET_BUFFER_LIST
*newNbl
)
188 UINT32 headRoom
= OvsGetVxlanTunHdrSize();
192 * XXX: the assumption currently is that the NBL is owned by OVS, and
193 * headroom has already been allocated as part of allocating the NBL and
196 curNb
= NET_BUFFER_LIST_FIRST_NB(curNbl
);
197 packetLength
= NET_BUFFER_DATA_LENGTH(curNb
);
199 NDIS_TCP_LARGE_SEND_OFFLOAD_NET_BUFFER_LIST_INFO tsoInfo
;
201 tsoInfo
.Value
= NET_BUFFER_LIST_INFO(curNbl
,
202 TcpLargeSendNetBufferListInfo
);
203 OVS_LOG_TRACE("MSS %u packet len %u", tsoInfo
.LsoV1Transmit
.MSS
, packetLength
);
204 if (tsoInfo
.LsoV1Transmit
.MSS
) {
205 OVS_LOG_TRACE("l4Offset %d", layers
->l4Offset
);
206 *newNbl
= OvsTcpSegmentNBL(switchContext
, curNbl
, layers
,
207 tsoInfo
.LsoV1Transmit
.MSS
, headRoom
);
208 if (*newNbl
== NULL
) {
209 OVS_LOG_ERROR("Unable to segment NBL");
210 return NDIS_STATUS_FAILURE
;
214 /* If we didn't split the packet above, make a copy now */
215 if (*newNbl
== NULL
) {
216 *newNbl
= OvsPartialCopyNBL(switchContext
, curNbl
, 0, headRoom
,
218 if (*newNbl
== NULL
) {
219 OVS_LOG_ERROR("Unable to copy NBL");
220 return NDIS_STATUS_FAILURE
;
225 for (curNb
= NET_BUFFER_LIST_FIRST_NB(curNbl
); curNb
!= NULL
;
226 curNb
= curNb
->Next
) {
227 status
= NdisRetreatNetBufferDataStart(curNb
, headRoom
, 0, NULL
);
228 if (status
!= NDIS_STATUS_SUCCESS
) {
232 curMdl
= NET_BUFFER_CURRENT_MDL(curNb
);
233 bufferStart
= (PUINT8
)MmGetSystemAddressForMdlSafe(curMdl
, LowPagePriority
);
235 status
= NDIS_STATUS_RESOURCES
;
239 bufferStart
+= NET_BUFFER_CURRENT_MDL_OFFSET(curNb
);
240 if (NET_BUFFER_NEXT_NB(curNb
)) {
241 OVS_LOG_TRACE("nb length %u next %u", NET_BUFFER_DATA_LENGTH(curNb
),
242 NET_BUFFER_DATA_LENGTH(curNb
->Next
));
246 ethHdr
= (EthHdr
*)bufferStart
;
247 ASSERT(((PCHAR
)&fwdInfo
->dstMacAddr
+ sizeof fwdInfo
->dstMacAddr
) ==
248 (PCHAR
)&fwdInfo
->srcMacAddr
);
249 NdisMoveMemory(ethHdr
->Destination
, fwdInfo
->dstMacAddr
,
250 sizeof ethHdr
->Destination
+ sizeof ethHdr
->Source
);
251 ethHdr
->Type
= htons(ETH_TYPE_IPV4
);
253 // XXX: question: there are fields in the OvsIPv4TunnelKey for ttl and such,
254 // should we use those values instead? or will they end up being
257 ipHdr
= (IPHdr
*)((PCHAR
)ethHdr
+ sizeof *ethHdr
);
259 ipHdr
->ihl
= sizeof *ipHdr
/ 4;
260 ipHdr
->version
= IPV4
;
262 ipHdr
->tot_len
= htons(NET_BUFFER_DATA_LENGTH(curNb
) - sizeof *ethHdr
);
264 ipHdr
->frag_off
= IP_DF_NBO
;
265 ipHdr
->ttl
= tunKey
->ttl
? tunKey
->ttl
: VXLAN_DEFAULT_TTL
;
266 ipHdr
->protocol
= IPPROTO_UDP
;
267 ASSERT(tunKey
->dst
== fwdInfo
->dstIpAddr
);
268 ASSERT(tunKey
->src
== fwdInfo
->srcIpAddr
|| tunKey
->src
== 0);
269 ipHdr
->saddr
= fwdInfo
->srcIpAddr
;
270 ipHdr
->daddr
= fwdInfo
->dstIpAddr
;
272 ipHdr
->check
= IPChecksum((UINT8
*)ipHdr
, sizeof *ipHdr
, 0);
275 udpHdr
= (UDPHdr
*)((PCHAR
)ipHdr
+ sizeof *ipHdr
);
276 udpHdr
->source
= htons(tunKey
->flow_hash
| 32768);
277 udpHdr
->dest
= htons(tunKey
->dst_port
);
278 udpHdr
->len
= htons(NET_BUFFER_DATA_LENGTH(curNb
) - headRoom
+
279 sizeof *udpHdr
+ sizeof *vxlanHdr
);
283 vxlanHdr
= (VXLANHdr
*)((PCHAR
)udpHdr
+ sizeof *udpHdr
);
284 vxlanHdr
->flags1
= 0;
285 vxlanHdr
->locallyReplicate
= 0;
286 vxlanHdr
->flags2
= 0;
287 vxlanHdr
->reserved1
= 0;
288 if (tunKey
->flags
| OVS_TNL_F_KEY
) {
289 vxlanHdr
->vxlanID
= VXLAN_TUNNELID_TO_VNI(tunKey
->tunnelId
);
290 vxlanHdr
->instanceID
= 1;
292 vxlanHdr
->reserved2
= 0;
294 return STATUS_SUCCESS
;
297 OvsCompleteNBL(switchContext
, *newNbl
, TRUE
);
304 *----------------------------------------------------------------------------
306 * Encapsulates the packet if L2/L3 for destination resolves. Otherwise,
307 * enqueues a callback that does encapsulation after resolution.
308 *----------------------------------------------------------------------------
311 OvsEncapVxlan(PNET_BUFFER_LIST curNbl
,
312 OvsIPv4TunnelKey
*tunKey
,
313 POVS_SWITCH_CONTEXT switchContext
,
314 POVS_PACKET_HDR_INFO layers
,
315 PNET_BUFFER_LIST
*newNbl
)
318 OVS_FWD_INFO fwdInfo
;
320 status
= OvsLookupIPFwdInfo(tunKey
->dst
, &fwdInfo
);
321 if (status
!= STATUS_SUCCESS
) {
322 OvsFwdIPHelperRequest(NULL
, 0, tunKey
, NULL
, NULL
, NULL
);
323 // return NDIS_STATUS_PENDING;
325 * XXX: Don't know if the completionList will make any sense when
326 * accessed in the callback. Make sure the caveats are known.
328 * XXX: This code will work once we are able to grab locks in the
331 return NDIS_STATUS_FAILURE
;
334 return OvsDoEncapVxlan(curNbl
, tunKey
, &fwdInfo
, layers
,
335 switchContext
, newNbl
);
340 *----------------------------------------------------------------------------
341 * OvsIpHlprCbVxlan --
342 * Callback function for IP helper.
343 * XXX: not used currently
344 *----------------------------------------------------------------------------
347 OvsIpHlprCbVxlan(PNET_BUFFER_LIST curNbl
,
349 OvsIPv4TunnelKey
*tunKey
,
353 POVS_FWD_INFO fwdInfo
)
355 OVS_PACKET_HDR_INFO layers
;
358 UNREFERENCED_PARAMETER(inPort
);
360 status
= OvsExtractFlow(curNbl
, inPort
, &key
, &layers
, NULL
);
361 if (result
== STATUS_SUCCESS
) {
362 status
= OvsDoEncapVxlan(curNbl
, tunKey
, fwdInfo
, &layers
,
363 (POVS_SWITCH_CONTEXT
)cbData1
, NULL
);
365 status
= NDIS_STATUS_FAILURE
;
368 if (status
!= NDIS_STATUS_SUCCESS
) {
369 // XXX: Free up the NBL;
373 OvsLookupFlowOutput((POVS_SWITCH_CONTEXT
)cbData1
, cbData2
, curNbl
);
377 *----------------------------------------------------------------------------
378 * OvsCalculateUDPChecksum
379 * Calculate UDP checksum
380 *----------------------------------------------------------------------------
382 static __inline NDIS_STATUS
383 OvsCalculateUDPChecksum(PNET_BUFFER_LIST curNbl
,
389 NDIS_TCP_IP_CHECKSUM_NET_BUFFER_LIST_INFO csumInfo
;
392 csumInfo
.Value
= NET_BUFFER_LIST_INFO(curNbl
, TcpIpChecksumNetBufferListInfo
);
394 /* Next check if UDP checksum has been calculated. */
395 if (!csumInfo
.Receive
.UdpChecksumSucceeded
) {
398 checkSum
= udpHdr
->check
;
400 l4Payload
= packetLength
- sizeof(EthHdr
) - ipHdr
->ihl
* 4;
403 IPPseudoChecksum((UINT32
*)&ipHdr
->saddr
,
404 (UINT32
*)&ipHdr
->daddr
,
405 IPPROTO_UDP
, (UINT16
)l4Payload
);
406 udpHdr
->check
= CalculateChecksumNB(curNb
, (UINT16
)l4Payload
,
407 sizeof(EthHdr
) + ipHdr
->ihl
* 4);
408 if (checkSum
!= udpHdr
->check
) {
409 OVS_LOG_TRACE("UDP checksum incorrect.");
410 return NDIS_STATUS_INVALID_PACKET
;
414 csumInfo
.Receive
.UdpChecksumSucceeded
= 1;
415 NET_BUFFER_LIST_INFO(curNbl
, TcpIpChecksumNetBufferListInfo
) = csumInfo
.Value
;
416 return NDIS_STATUS_SUCCESS
;
420 *----------------------------------------------------------------------------
422 * Decapsulates the tunnel header in 'curNbl' and puts it into 'tunKey'.
423 *----------------------------------------------------------------------------
426 OvsDecapVxlan(POVS_SWITCH_CONTEXT switchContext
,
427 PNET_BUFFER_LIST curNbl
,
428 OvsIPv4TunnelKey
*tunKey
,
429 PNET_BUFFER_LIST
*newNbl
)
437 UINT32 tunnelSize
= 0, packetLength
= 0;
441 /* Check the length of the UDP payload */
442 curNb
= NET_BUFFER_LIST_FIRST_NB(curNbl
);
443 packetLength
= NET_BUFFER_DATA_LENGTH(curNb
);
444 tunnelSize
= OvsGetVxlanTunHdrSize();
445 if (packetLength
<= tunnelSize
) {
446 return NDIS_STATUS_INVALID_LENGTH
;
450 * Create a copy of the NBL so that we have all the headers in one MDL.
452 *newNbl
= OvsPartialCopyNBL(switchContext
, curNbl
,
453 tunnelSize
+ OVS_DEFAULT_COPY_SIZE
, 0,
454 TRUE
/*copy NBL info */);
456 if (*newNbl
== NULL
) {
457 return NDIS_STATUS_RESOURCES
;
460 /* XXX: Handle VLAN header. */
462 curNb
= NET_BUFFER_LIST_FIRST_NB(curNbl
);
463 curMdl
= NET_BUFFER_CURRENT_MDL(curNb
);
464 bufferStart
= (PUINT8
)MmGetSystemAddressForMdlSafe(curMdl
, LowPagePriority
) +
465 NET_BUFFER_CURRENT_MDL_OFFSET(curNb
);
467 status
= NDIS_STATUS_RESOURCES
;
471 ethHdr
= (EthHdr
*)bufferStart
;
472 /* XXX: Handle IP options. */
473 ipHdr
= (IPHdr
*)((PCHAR
)ethHdr
+ sizeof *ethHdr
);
474 tunKey
->src
= ipHdr
->saddr
;
475 tunKey
->dst
= ipHdr
->daddr
;
476 tunKey
->tos
= ipHdr
->tos
;
477 tunKey
->ttl
= ipHdr
->ttl
;
479 udpHdr
= (UDPHdr
*)((PCHAR
)ipHdr
+ sizeof *ipHdr
);
481 /* Validate if NIC has indicated checksum failure. */
482 status
= OvsValidateUDPChecksum(curNbl
, udpHdr
->check
== 0);
483 if (status
!= NDIS_STATUS_SUCCESS
) {
487 /* Calculate and verify UDP checksum if NIC didn't do it. */
488 if (udpHdr
->check
!= 0) {
489 status
= OvsCalculateUDPChecksum(curNbl
, curNb
, ipHdr
, udpHdr
, packetLength
);
490 if (status
!= NDIS_STATUS_SUCCESS
) {
495 vxlanHdr
= (VXLANHdr
*)((PCHAR
)udpHdr
+ sizeof *udpHdr
);
496 if (vxlanHdr
->instanceID
) {
497 tunKey
->flags
= OVS_TNL_F_KEY
;
498 tunKey
->tunnelId
= VXLAN_VNI_TO_TUNNELID(vxlanHdr
->vxlanID
);
501 tunKey
->tunnelId
= 0;
504 /* Clear out the receive flag for the inner packet. */
505 NET_BUFFER_LIST_INFO(curNbl
, TcpIpChecksumNetBufferListInfo
) = 0;
506 NdisAdvanceNetBufferDataStart(curNb
, tunnelSize
, FALSE
, NULL
);
507 return NDIS_STATUS_SUCCESS
;
510 OvsCompleteNBL(switchContext
, *newNbl
, TRUE
);
517 OvsSlowPathDecapVxlan(const PNET_BUFFER_LIST packet
,
518 OvsIPv4TunnelKey
*tunnelKey
)
520 NDIS_STATUS status
= NDIS_STATUS_FAILURE
;
523 VXLANHdr
*VxlanHeader
;
524 VXLANHdr VxlanHeaderBuffer
;
525 struct IPHdr ip_storage
;
526 const struct IPHdr
*nh
;
527 OVS_PACKET_HDR_INFO layers
;
532 nh
= OvsGetIp(packet
, layers
.l3Offset
, &ip_storage
);
534 layers
.l4Offset
= layers
.l3Offset
+ nh
->ihl
* 4;
539 /* make sure it's a VXLAN packet */
540 udp
= OvsGetUdp(packet
, layers
.l4Offset
, &udpStorage
);
542 layers
.l7Offset
= layers
.l4Offset
+ sizeof *udp
;
547 VxlanHeader
= (VXLANHdr
*)OvsGetPacketBytes(packet
,
548 sizeof(*VxlanHeader
),
553 tunnelKey
->src
= nh
->saddr
;
554 tunnelKey
->dst
= nh
->daddr
;
555 tunnelKey
->ttl
= nh
->ttl
;
556 tunnelKey
->tos
= nh
->tos
;
557 if (VxlanHeader
->instanceID
) {
558 tunnelKey
->flags
= OVS_TNL_F_KEY
;
559 tunnelKey
->tunnelId
= VXLAN_VNI_TO_TUNNELID(VxlanHeader
->vxlanID
);
561 tunnelKey
->flags
= 0;
562 tunnelKey
->tunnelId
= 0;
567 status
= NDIS_STATUS_SUCCESS
;
574 #pragma warning( pop )