]> git.proxmox.com Git - ovs.git/blob - datapath-windows/ovsext/Vxlan.c
datapath-windows: Stateless TCP Tunnelling protocol - Initial implementation
[ovs.git] / datapath-windows / ovsext / Vxlan.c
1 /*
2 * Copyright (c) 2014 VMware, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "precomp.h"
18 #include "NetProto.h"
19 #include "Switch.h"
20 #include "Vport.h"
21 #include "Flow.h"
22 #include "Vxlan.h"
23 #include "IpHelper.h"
24 #include "Checksum.h"
25 #include "User.h"
26 #include "PacketIO.h"
27 #include "Flow.h"
28 #include "PacketParser.h"
29
30 #pragma warning( push )
31 #pragma warning( disable:4127 )
32
33
34 #ifdef OVS_DBG_MOD
35 #undef OVS_DBG_MOD
36 #endif
37 #define OVS_DBG_MOD OVS_DBG_VXLAN
38 #include "Debug.h"
39
/*
 * VXLAN helper macros and constants.
 *
 * VXLAN_ID_IS_VALID      - non-zero when the id lies in [1, 0xffffff]. The
 *                          argument is evaluated twice.
 * VXLAN_TUNNELID_TO_VNI  - 64-bit tunnel id -> VNI: the bits from bit 40
 *                          upwards, truncated to UINT32.
 * VXLAN_VNI_TO_TUNNELID  - inverse mapping: VNI shifted up by 40 bits.
 */
#define VXLAN_ID_IS_VALID(_id) (((_id) > 0) && ((_id) <= 0xffffff))
#define VXLAN_TUNNELID_TO_VNI(_tunnelId) \
    ((UINT32)(((UINT64)(_tunnelId)) >> 40))
#define VXLAN_VNI_TO_TUNNELID(_vxlanVni) (((UINT64)(_vxlanVni)) << 40)

/* Value stored as-is into the outer IP header's frag_off field. */
#define IP_DF_NBO 0x0040
/* TTL used for the outer IP header when the tunnel key carries none. */
#define VXLAN_DEFAULT_TTL 64
#define VXLAN_MULTICAST_TTL 64
#define VXLAN_DEFAULT_INSTANCE_ID 1
48
49 /* Move to a header file */
50 extern POVS_SWITCH_CONTEXT gOvsSwitchContext;
51
52 /*
53 *----------------------------------------------------------------------------
54 * This function verifies if the VXLAN tunnel already exists, in order to
55 * avoid sending a duplicate request to the WFP base filtering engine.
56 *----------------------------------------------------------------------------
57 */
58 static BOOLEAN
59 OvsIsTunnelFilterCreated(POVS_SWITCH_CONTEXT switchContext,
60 UINT16 udpPortDest)
61 {
62 for (UINT hash = 0; hash < OVS_MAX_VPORT_ARRAY_SIZE; hash++) {
63 PLIST_ENTRY head, link, next;
64
65 head = &(switchContext->portNoHashArray[hash & OVS_VPORT_MASK]);
66 LIST_FORALL_SAFE(head, link, next) {
67 POVS_VPORT_ENTRY vport = NULL;
68 POVS_VXLAN_VPORT vxlanPort = NULL;
69 vport = CONTAINING_RECORD(link, OVS_VPORT_ENTRY, portNoLink);
70 vxlanPort = (POVS_VXLAN_VPORT)vport->priv;
71 if (vxlanPort) {
72 if ((udpPortDest == vxlanPort->dstPort)) {
73 /* The VXLAN tunnel was already created. */
74 return TRUE;
75 }
76 }
77 }
78 }
79
80 return FALSE;
81 }
82
83 /*
84 *----------------------------------------------------------------------------
85 * This function allocates and initializes the OVS_VXLAN_VPORT. The function
86 * also creates a WFP tunnel filter for the necessary destination port. The
87 * tunnel filter create request is passed to the tunnel filter threads that
88 * will complete the request at a later time when IRQL is lowered to
89 * PASSIVE_LEVEL.
90 *
91 * udpDestPort: the vxlan is set as payload to a udp frame. If the destination
92 * port of an udp frame is udpDestPort, we understand it to be vxlan.
93 *----------------------------------------------------------------------------
94 */
95 NTSTATUS
96 OvsInitVxlanTunnel(PIRP irp,
97 POVS_VPORT_ENTRY vport,
98 UINT16 udpDestPort,
99 PFNTunnelVportPendingOp callback,
100 PVOID tunnelContext)
101 {
102 NTSTATUS status = STATUS_SUCCESS;
103 POVS_VXLAN_VPORT vxlanPort = NULL;
104
105 vxlanPort = OvsAllocateMemoryWithTag(sizeof (*vxlanPort),
106 OVS_VXLAN_POOL_TAG);
107 if (vxlanPort == NULL) {
108 return STATUS_INSUFFICIENT_RESOURCES;
109 }
110
111 RtlZeroMemory(vxlanPort, sizeof(*vxlanPort));
112 vxlanPort->dstPort = udpDestPort;
113 vport->priv = (PVOID)vxlanPort;
114
115 if (!OvsIsTunnelFilterCreated(gOvsSwitchContext, udpDestPort)) {
116 status = OvsTunelFilterCreate(irp,
117 udpDestPort,
118 &vxlanPort->filterID,
119 callback,
120 tunnelContext);
121 } else {
122 status = STATUS_OBJECT_NAME_EXISTS;
123 }
124
125 return status;
126 }
127
128 /*
129 *----------------------------------------------------------------------------
130 * This function releases the OVS_VXLAN_VPORT. The function also deletes the
131 * WFP tunnel filter previously created. The tunnel filter delete request is
132 * passed to the tunnel filter threads that will complete the request at a
133 * later time when IRQL is lowered to PASSIVE_LEVEL.
134 *----------------------------------------------------------------------------
135 */
136 NTSTATUS
137 OvsCleanupVxlanTunnel(PIRP irp,
138 POVS_VPORT_ENTRY vport,
139 PFNTunnelVportPendingOp callback,
140 PVOID tunnelContext)
141 {
142 NTSTATUS status = STATUS_SUCCESS;
143 POVS_VXLAN_VPORT vxlanPort = NULL;
144
145 if (vport->ovsType != OVS_VPORT_TYPE_VXLAN ||
146 vport->priv == NULL) {
147 return STATUS_SUCCESS;
148 }
149
150 vxlanPort = (POVS_VXLAN_VPORT)vport->priv;
151
152 if (vxlanPort->filterID != 0) {
153 status = OvsTunelFilterDelete(irp,
154 vxlanPort->filterID,
155 callback,
156 tunnelContext);
157 }
158
159 OvsFreeMemoryWithTag(vport->priv, OVS_VXLAN_POOL_TAG);
160 vport->priv = NULL;
161
162 return status;
163 }
164
165
166 /*
167 *----------------------------------------------------------------------------
168 * OvsDoEncapVxlan
169 * Encapsulates the packet.
170 *----------------------------------------------------------------------------
171 */
172 static __inline NDIS_STATUS
173 OvsDoEncapVxlan(PNET_BUFFER_LIST curNbl,
174 OvsIPv4TunnelKey *tunKey,
175 POVS_FWD_INFO fwdInfo,
176 POVS_PACKET_HDR_INFO layers,
177 POVS_SWITCH_CONTEXT switchContext,
178 PNET_BUFFER_LIST *newNbl)
179 {
180 NDIS_STATUS status;
181 PNET_BUFFER curNb;
182 PMDL curMdl;
183 PUINT8 bufferStart;
184 EthHdr *ethHdr;
185 IPHdr *ipHdr;
186 UDPHdr *udpHdr;
187 VXLANHdr *vxlanHdr;
188 UINT32 headRoom = OvsGetVxlanTunHdrSize();
189 UINT32 packetLength;
190
191 /*
192 * XXX: the assumption currently is that the NBL is owned by OVS, and
193 * headroom has already been allocated as part of allocating the NBL and
194 * MDL.
195 */
196 curNb = NET_BUFFER_LIST_FIRST_NB(curNbl);
197 packetLength = NET_BUFFER_DATA_LENGTH(curNb);
198 if (layers->isTcp) {
199 NDIS_TCP_LARGE_SEND_OFFLOAD_NET_BUFFER_LIST_INFO tsoInfo;
200
201 tsoInfo.Value = NET_BUFFER_LIST_INFO(curNbl,
202 TcpLargeSendNetBufferListInfo);
203 OVS_LOG_TRACE("MSS %u packet len %u", tsoInfo.LsoV1Transmit.MSS, packetLength);
204 if (tsoInfo.LsoV1Transmit.MSS) {
205 OVS_LOG_TRACE("l4Offset %d", layers->l4Offset);
206 *newNbl = OvsTcpSegmentNBL(switchContext, curNbl, layers,
207 tsoInfo.LsoV1Transmit.MSS, headRoom);
208 if (*newNbl == NULL) {
209 OVS_LOG_ERROR("Unable to segment NBL");
210 return NDIS_STATUS_FAILURE;
211 }
212 }
213 }
214 /* If we didn't split the packet above, make a copy now */
215 if (*newNbl == NULL) {
216 *newNbl = OvsPartialCopyNBL(switchContext, curNbl, 0, headRoom,
217 FALSE /*NBL info*/);
218 if (*newNbl == NULL) {
219 OVS_LOG_ERROR("Unable to copy NBL");
220 return NDIS_STATUS_FAILURE;
221 }
222 }
223
224 curNbl = *newNbl;
225 for (curNb = NET_BUFFER_LIST_FIRST_NB(curNbl); curNb != NULL;
226 curNb = curNb->Next) {
227 status = NdisRetreatNetBufferDataStart(curNb, headRoom, 0, NULL);
228 if (status != NDIS_STATUS_SUCCESS) {
229 goto ret_error;
230 }
231
232 curMdl = NET_BUFFER_CURRENT_MDL(curNb);
233 bufferStart = (PUINT8)MmGetSystemAddressForMdlSafe(curMdl, LowPagePriority);
234 if (!bufferStart) {
235 status = NDIS_STATUS_RESOURCES;
236 goto ret_error;
237 }
238
239 bufferStart += NET_BUFFER_CURRENT_MDL_OFFSET(curNb);
240 if (NET_BUFFER_NEXT_NB(curNb)) {
241 OVS_LOG_TRACE("nb length %u next %u", NET_BUFFER_DATA_LENGTH(curNb),
242 NET_BUFFER_DATA_LENGTH(curNb->Next));
243 }
244
245 /* L2 header */
246 ethHdr = (EthHdr *)bufferStart;
247 ASSERT(((PCHAR)&fwdInfo->dstMacAddr + sizeof fwdInfo->dstMacAddr) ==
248 (PCHAR)&fwdInfo->srcMacAddr);
249 NdisMoveMemory(ethHdr->Destination, fwdInfo->dstMacAddr,
250 sizeof ethHdr->Destination + sizeof ethHdr->Source);
251 ethHdr->Type = htons(ETH_TYPE_IPV4);
252
253 // XXX: question: there are fields in the OvsIPv4TunnelKey for ttl and such,
254 // should we use those values instead? or will they end up being
255 // uninitialized;
256 /* IP header */
257 ipHdr = (IPHdr *)((PCHAR)ethHdr + sizeof *ethHdr);
258
259 ipHdr->ihl = sizeof *ipHdr / 4;
260 ipHdr->version = IPV4;
261 ipHdr->tos = 0;
262 ipHdr->tot_len = htons(NET_BUFFER_DATA_LENGTH(curNb) - sizeof *ethHdr);
263 ipHdr->id = 0;
264 ipHdr->frag_off = IP_DF_NBO;
265 ipHdr->ttl = tunKey->ttl ? tunKey->ttl : VXLAN_DEFAULT_TTL;
266 ipHdr->protocol = IPPROTO_UDP;
267 ASSERT(tunKey->dst == fwdInfo->dstIpAddr);
268 ASSERT(tunKey->src == fwdInfo->srcIpAddr || tunKey->src == 0);
269 ipHdr->saddr = fwdInfo->srcIpAddr;
270 ipHdr->daddr = fwdInfo->dstIpAddr;
271 ipHdr->check = 0;
272 ipHdr->check = IPChecksum((UINT8 *)ipHdr, sizeof *ipHdr, 0);
273
274 /* UDP header */
275 udpHdr = (UDPHdr *)((PCHAR)ipHdr + sizeof *ipHdr);
276 udpHdr->source = htons(tunKey->flow_hash | 32768);
277 udpHdr->dest = htons(tunKey->dst_port);
278 udpHdr->len = htons(NET_BUFFER_DATA_LENGTH(curNb) - headRoom +
279 sizeof *udpHdr + sizeof *vxlanHdr);
280 udpHdr->check = 0;
281
282 /* VXLAN header */
283 vxlanHdr = (VXLANHdr *)((PCHAR)udpHdr + sizeof *udpHdr);
284 vxlanHdr->flags1 = 0;
285 vxlanHdr->locallyReplicate = 0;
286 vxlanHdr->flags2 = 0;
287 vxlanHdr->reserved1 = 0;
288 if (tunKey->flags | OVS_TNL_F_KEY) {
289 vxlanHdr->vxlanID = VXLAN_TUNNELID_TO_VNI(tunKey->tunnelId);
290 vxlanHdr->instanceID = 1;
291 }
292 vxlanHdr->reserved2 = 0;
293 }
294 return STATUS_SUCCESS;
295
296 ret_error:
297 OvsCompleteNBL(switchContext, *newNbl, TRUE);
298 *newNbl = NULL;
299 return status;
300 }
301
302
303 /*
304 *----------------------------------------------------------------------------
305 * OvsEncapVxlan --
306 * Encapsulates the packet if L2/L3 for destination resolves. Otherwise,
307 * enqueues a callback that does encapsulatation after resolution.
308 *----------------------------------------------------------------------------
309 */
310 NDIS_STATUS
311 OvsEncapVxlan(PNET_BUFFER_LIST curNbl,
312 OvsIPv4TunnelKey *tunKey,
313 POVS_SWITCH_CONTEXT switchContext,
314 POVS_PACKET_HDR_INFO layers,
315 PNET_BUFFER_LIST *newNbl)
316 {
317 NTSTATUS status;
318 OVS_FWD_INFO fwdInfo;
319
320 status = OvsLookupIPFwdInfo(tunKey->dst, &fwdInfo);
321 if (status != STATUS_SUCCESS) {
322 OvsFwdIPHelperRequest(NULL, 0, tunKey, NULL, NULL, NULL);
323 // return NDIS_STATUS_PENDING;
324 /*
325 * XXX: Don't know if the completionList will make any sense when
326 * accessed in the callback. Make sure the caveats are known.
327 *
328 * XXX: This code will work once we are able to grab locks in the
329 * callback.
330 */
331 return NDIS_STATUS_FAILURE;
332 }
333
334 return OvsDoEncapVxlan(curNbl, tunKey, &fwdInfo, layers,
335 switchContext, newNbl);
336 }
337
338
339 /*
340 *----------------------------------------------------------------------------
341 * OvsIpHlprCbVxlan --
342 * Callback function for IP helper.
343 * XXX: not used currently
344 *----------------------------------------------------------------------------
345 */
346 static VOID
347 OvsIpHlprCbVxlan(PNET_BUFFER_LIST curNbl,
348 UINT32 inPort,
349 OvsIPv4TunnelKey *tunKey,
350 PVOID cbData1,
351 PVOID cbData2,
352 NTSTATUS result,
353 POVS_FWD_INFO fwdInfo)
354 {
355 OVS_PACKET_HDR_INFO layers;
356 OvsFlowKey key;
357 NDIS_STATUS status;
358 UNREFERENCED_PARAMETER(inPort);
359
360 status = OvsExtractFlow(curNbl, inPort, &key, &layers, NULL);
361 if (result == STATUS_SUCCESS) {
362 status = OvsDoEncapVxlan(curNbl, tunKey, fwdInfo, &layers,
363 (POVS_SWITCH_CONTEXT)cbData1, NULL);
364 } else {
365 status = NDIS_STATUS_FAILURE;
366 }
367
368 if (status != NDIS_STATUS_SUCCESS) {
369 // XXX: Free up the NBL;
370 return;
371 }
372
373 OvsLookupFlowOutput((POVS_SWITCH_CONTEXT)cbData1, cbData2, curNbl);
374 }
375
376 /*
377 *----------------------------------------------------------------------------
378 * OvsCalculateUDPChecksum
379 * Calculate UDP checksum
380 *----------------------------------------------------------------------------
381 */
382 static __inline NDIS_STATUS
383 OvsCalculateUDPChecksum(PNET_BUFFER_LIST curNbl,
384 PNET_BUFFER curNb,
385 IPHdr *ipHdr,
386 UDPHdr *udpHdr,
387 UINT32 packetLength)
388 {
389 NDIS_TCP_IP_CHECKSUM_NET_BUFFER_LIST_INFO csumInfo;
390 UINT16 checkSum;
391
392 csumInfo.Value = NET_BUFFER_LIST_INFO(curNbl, TcpIpChecksumNetBufferListInfo);
393
394 /* Next check if UDP checksum has been calculated. */
395 if (!csumInfo.Receive.UdpChecksumSucceeded) {
396 UINT32 l4Payload;
397
398 checkSum = udpHdr->check;
399
400 l4Payload = packetLength - sizeof(EthHdr) - ipHdr->ihl * 4;
401 udpHdr->check = 0;
402 udpHdr->check =
403 IPPseudoChecksum((UINT32 *)&ipHdr->saddr,
404 (UINT32 *)&ipHdr->daddr,
405 IPPROTO_UDP, (UINT16)l4Payload);
406 udpHdr->check = CalculateChecksumNB(curNb, (UINT16)l4Payload,
407 sizeof(EthHdr) + ipHdr->ihl * 4);
408 if (checkSum != udpHdr->check) {
409 OVS_LOG_TRACE("UDP checksum incorrect.");
410 return NDIS_STATUS_INVALID_PACKET;
411 }
412 }
413
414 csumInfo.Receive.UdpChecksumSucceeded = 1;
415 NET_BUFFER_LIST_INFO(curNbl, TcpIpChecksumNetBufferListInfo) = csumInfo.Value;
416 return NDIS_STATUS_SUCCESS;
417 }
418
419 /*
420 *----------------------------------------------------------------------------
421 * OvsDecapVxlan
422 * Decapsulates to tunnel header in 'curNbl' and puts into 'tunKey'.
423 *----------------------------------------------------------------------------
424 */
425 NDIS_STATUS
426 OvsDecapVxlan(POVS_SWITCH_CONTEXT switchContext,
427 PNET_BUFFER_LIST curNbl,
428 OvsIPv4TunnelKey *tunKey,
429 PNET_BUFFER_LIST *newNbl)
430 {
431 PNET_BUFFER curNb;
432 PMDL curMdl;
433 EthHdr *ethHdr;
434 IPHdr *ipHdr;
435 UDPHdr *udpHdr;
436 VXLANHdr *vxlanHdr;
437 UINT32 tunnelSize = 0, packetLength = 0;
438 PUINT8 bufferStart;
439 NDIS_STATUS status;
440
441 /* Check the the length of the UDP payload */
442 curNb = NET_BUFFER_LIST_FIRST_NB(curNbl);
443 packetLength = NET_BUFFER_DATA_LENGTH(curNb);
444 tunnelSize = OvsGetVxlanTunHdrSize();
445 if (packetLength <= tunnelSize) {
446 return NDIS_STATUS_INVALID_LENGTH;
447 }
448
449 /*
450 * Create a copy of the NBL so that we have all the headers in one MDL.
451 */
452 *newNbl = OvsPartialCopyNBL(switchContext, curNbl,
453 tunnelSize + OVS_DEFAULT_COPY_SIZE, 0,
454 TRUE /*copy NBL info */);
455
456 if (*newNbl == NULL) {
457 return NDIS_STATUS_RESOURCES;
458 }
459
460 /* XXX: Handle VLAN header. */
461 curNbl = *newNbl;
462 curNb = NET_BUFFER_LIST_FIRST_NB(curNbl);
463 curMdl = NET_BUFFER_CURRENT_MDL(curNb);
464 bufferStart = (PUINT8)MmGetSystemAddressForMdlSafe(curMdl, LowPagePriority) +
465 NET_BUFFER_CURRENT_MDL_OFFSET(curNb);
466 if (!bufferStart) {
467 status = NDIS_STATUS_RESOURCES;
468 goto dropNbl;
469 }
470
471 ethHdr = (EthHdr *)bufferStart;
472 /* XXX: Handle IP options. */
473 ipHdr = (IPHdr *)((PCHAR)ethHdr + sizeof *ethHdr);
474 tunKey->src = ipHdr->saddr;
475 tunKey->dst = ipHdr->daddr;
476 tunKey->tos = ipHdr->tos;
477 tunKey->ttl = ipHdr->ttl;
478 tunKey->pad = 0;
479 udpHdr = (UDPHdr *)((PCHAR)ipHdr + sizeof *ipHdr);
480
481 /* Validate if NIC has indicated checksum failure. */
482 status = OvsValidateUDPChecksum(curNbl, udpHdr->check == 0);
483 if (status != NDIS_STATUS_SUCCESS) {
484 goto dropNbl;
485 }
486
487 /* Calculate and verify UDP checksum if NIC didn't do it. */
488 if (udpHdr->check != 0) {
489 status = OvsCalculateUDPChecksum(curNbl, curNb, ipHdr, udpHdr, packetLength);
490 if (status != NDIS_STATUS_SUCCESS) {
491 goto dropNbl;
492 }
493 }
494
495 vxlanHdr = (VXLANHdr *)((PCHAR)udpHdr + sizeof *udpHdr);
496 if (vxlanHdr->instanceID) {
497 tunKey->flags = OVS_TNL_F_KEY;
498 tunKey->tunnelId = VXLAN_VNI_TO_TUNNELID(vxlanHdr->vxlanID);
499 } else {
500 tunKey->flags = 0;
501 tunKey->tunnelId = 0;
502 }
503
504 /* Clear out the receive flag for the inner packet. */
505 NET_BUFFER_LIST_INFO(curNbl, TcpIpChecksumNetBufferListInfo) = 0;
506 NdisAdvanceNetBufferDataStart(curNb, tunnelSize, FALSE, NULL);
507 return NDIS_STATUS_SUCCESS;
508
509 dropNbl:
510 OvsCompleteNBL(switchContext, *newNbl, TRUE);
511 *newNbl = NULL;
512 return status;
513 }
514
515
516 NDIS_STATUS
517 OvsSlowPathDecapVxlan(const PNET_BUFFER_LIST packet,
518 OvsIPv4TunnelKey *tunnelKey)
519 {
520 NDIS_STATUS status = NDIS_STATUS_FAILURE;
521 UDPHdr udpStorage;
522 const UDPHdr *udp;
523 VXLANHdr *VxlanHeader;
524 VXLANHdr VxlanHeaderBuffer;
525 struct IPHdr ip_storage;
526 const struct IPHdr *nh;
527 OVS_PACKET_HDR_INFO layers;
528
529 layers.value = 0;
530
531 do {
532 nh = OvsGetIp(packet, layers.l3Offset, &ip_storage);
533 if (nh) {
534 layers.l4Offset = layers.l3Offset + nh->ihl * 4;
535 } else {
536 break;
537 }
538
539 /* make sure it's a VXLAN packet */
540 udp = OvsGetUdp(packet, layers.l4Offset, &udpStorage);
541 if (udp) {
542 layers.l7Offset = layers.l4Offset + sizeof *udp;
543 } else {
544 break;
545 }
546
547 VxlanHeader = (VXLANHdr *)OvsGetPacketBytes(packet,
548 sizeof(*VxlanHeader),
549 layers.l7Offset,
550 &VxlanHeaderBuffer);
551
552 if (VxlanHeader) {
553 tunnelKey->src = nh->saddr;
554 tunnelKey->dst = nh->daddr;
555 tunnelKey->ttl = nh->ttl;
556 tunnelKey->tos = nh->tos;
557 if (VxlanHeader->instanceID) {
558 tunnelKey->flags = OVS_TNL_F_KEY;
559 tunnelKey->tunnelId = VXLAN_VNI_TO_TUNNELID(VxlanHeader->vxlanID);
560 } else {
561 tunnelKey->flags = 0;
562 tunnelKey->tunnelId = 0;
563 }
564 } else {
565 break;
566 }
567 status = NDIS_STATUS_SUCCESS;
568
569 } while(FALSE);
570
571 return status;
572 }
573
574 #pragma warning( pop )