]> git.proxmox.com Git - ovs.git/blob - datapath-windows/ovsext/Vxlan.c
bump version to 2.15.0+ds1-2+deb11u3.1
[ovs.git] / datapath-windows / ovsext / Vxlan.c
1 /*
2 * Copyright (c) 2014, 2016 VMware, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "precomp.h"
18
19 #include "Atomic.h"
20 #include "Debug.h"
21 #include "Flow.h"
22 #include "IpHelper.h"
23 #include "NetProto.h"
24 #include "Offload.h"
25 #include "PacketIO.h"
26 #include "PacketParser.h"
27 #include "Switch.h"
28 #include "User.h"
29 #include "Vport.h"
30 #include "Vxlan.h"
31
32 #pragma warning( push )
33 #pragma warning( disable:4127 )
34
35
36 #ifdef OVS_DBG_MOD
37 #undef OVS_DBG_MOD
38 #endif
39 #define OVS_DBG_MOD OVS_DBG_VXLAN
40
/*
 * VXLAN helper macros and constants.
 *
 * Every function-like macro below is fully parenthesized so that it can be
 * used safely as an operand of any operator (including sizeof).
 */

/*
 * True when 'vxlanID' lies in the range 1..0xffffff (24 bits, non-zero).
 * Note: 'vxlanID' is evaluated twice; do not pass expressions with side
 * effects.
 */
#define VXLAN_ID_IS_VALID(vxlanID) (0 < (vxlanID) && (vxlanID) <= 0xffffff)

/* Extracts the VNI from a 64-bit tunnel id by dropping its low 40 bits. */
#define VXLAN_TUNNELID_TO_VNI(_tID)   ((UINT32)(((UINT64)(_tID)) >> 40))

/* Builds a 64-bit tunnel id by shifting the VNI left by 40 bits. */
#define VXLAN_VNI_TO_TUNNELID(_vni)   (((UINT64)(_vni)) << 40)

/* Value stored in the IPv4 frag_off field when "don't fragment" is wanted. */
#define IP_DF_NBO 0x0040

/* TTL used for the outer IPv4 header when the tunnel key does not supply one. */
#define VXLAN_DEFAULT_TTL 64
#define VXLAN_MULTICAST_TTL 64

/* Default value for the instanceID field of the VXLAN header. */
#define VXLAN_DEFAULT_INSTANCE_ID 1
49
/*
 * Global switch context; declared here, not defined in this file.
 * TODO: move this declaration to a shared header file instead of repeating
 * it in each source file that needs it.
 */
extern POVS_SWITCH_CONTEXT gOvsSwitchContext;
52
53 /*
54 *----------------------------------------------------------------------------
55 * This function verifies if the VXLAN tunnel already exists, in order to
56 * avoid sending a duplicate request to the WFP base filtering engine.
57 *----------------------------------------------------------------------------
58 */
59 static BOOLEAN
60 OvsIsTunnelFilterCreated(POVS_SWITCH_CONTEXT switchContext,
61 UINT16 udpPortDest)
62 {
63 for (UINT hash = 0; hash < OVS_MAX_VPORT_ARRAY_SIZE; hash++) {
64 PLIST_ENTRY head, link, next;
65
66 head = &(switchContext->portNoHashArray[hash & OVS_VPORT_MASK]);
67 LIST_FORALL_SAFE(head, link, next) {
68 POVS_VPORT_ENTRY vport = NULL;
69 POVS_VXLAN_VPORT vxlanPort = NULL;
70 vport = CONTAINING_RECORD(link, OVS_VPORT_ENTRY, portNoLink);
71 vxlanPort = (POVS_VXLAN_VPORT)vport->priv;
72 if (vxlanPort) {
73 if ((udpPortDest == vxlanPort->dstPort)) {
74 /* The VXLAN tunnel was already created. */
75 return TRUE;
76 }
77 }
78 }
79 }
80
81 return FALSE;
82 }
83
84 /*
85 *----------------------------------------------------------------------------
86 * This function allocates and initializes the OVS_VXLAN_VPORT. The function
87 * also creates a WFP tunnel filter for the necessary destination port. The
88 * tunnel filter create request is passed to the tunnel filter threads that
89 * will complete the request at a later time when IRQL is lowered to
90 * PASSIVE_LEVEL.
91 *
92 * udpDestPort: the vxlan is set as payload to a udp frame. If the destination
93 * port of an udp frame is udpDestPort, we understand it to be vxlan.
94 *----------------------------------------------------------------------------
95 */
96 NTSTATUS
97 OvsInitVxlanTunnel(PIRP irp,
98 POVS_VPORT_ENTRY vport,
99 UINT16 udpDestPort,
100 PFNTunnelVportPendingOp callback,
101 PVOID tunnelContext)
102 {
103 NTSTATUS status = STATUS_SUCCESS;
104 POVS_VXLAN_VPORT vxlanPort = NULL;
105
106 vxlanPort = OvsAllocateMemoryWithTag(sizeof (*vxlanPort),
107 OVS_VXLAN_POOL_TAG);
108 if (vxlanPort == NULL) {
109 return STATUS_INSUFFICIENT_RESOURCES;
110 }
111
112 RtlZeroMemory(vxlanPort, sizeof(*vxlanPort));
113 vxlanPort->dstPort = udpDestPort;
114 vport->priv = (PVOID)vxlanPort;
115
116 if (!OvsIsTunnelFilterCreated(gOvsSwitchContext, udpDestPort)) {
117 status = OvsTunnelFilterCreate(irp,
118 udpDestPort,
119 &vxlanPort->filterID,
120 callback,
121 tunnelContext);
122 } else {
123 status = STATUS_OBJECT_NAME_EXISTS;
124 }
125
126 return status;
127 }
128
129 /*
130 *----------------------------------------------------------------------------
131 * This function releases the OVS_VXLAN_VPORT. The function also deletes the
132 * WFP tunnel filter previously created. The tunnel filter delete request is
133 * passed to the tunnel filter threads that will complete the request at a
134 * later time when IRQL is lowered to PASSIVE_LEVEL.
135 *----------------------------------------------------------------------------
136 */
137 NTSTATUS
138 OvsCleanupVxlanTunnel(PIRP irp,
139 POVS_VPORT_ENTRY vport,
140 PFNTunnelVportPendingOp callback,
141 PVOID tunnelContext)
142 {
143 NTSTATUS status = STATUS_SUCCESS;
144 POVS_VXLAN_VPORT vxlanPort = NULL;
145
146 if (vport->ovsType != OVS_VPORT_TYPE_VXLAN ||
147 vport->priv == NULL) {
148 return STATUS_SUCCESS;
149 }
150
151 vxlanPort = (POVS_VXLAN_VPORT)vport->priv;
152
153 if (vxlanPort->filterID != 0) {
154 status = OvsTunnelFilterDelete(irp,
155 vxlanPort->filterID,
156 callback,
157 tunnelContext);
158 } else {
159 OvsFreeMemoryWithTag(vport->priv, OVS_VXLAN_POOL_TAG);
160 vport->priv = NULL;
161 }
162
163 return status;
164 }
165
166
167 /*
168 *----------------------------------------------------------------------------
169 * OvsDoEncapVxlan
170 * Encapsulates the packet.
171 *----------------------------------------------------------------------------
172 */
173 static __inline NDIS_STATUS
174 OvsDoEncapVxlan(POVS_VPORT_ENTRY vport,
175 PNET_BUFFER_LIST curNbl,
176 OvsIPv4TunnelKey *tunKey,
177 POVS_FWD_INFO fwdInfo,
178 POVS_PACKET_HDR_INFO layers,
179 POVS_SWITCH_CONTEXT switchContext,
180 PNET_BUFFER_LIST *newNbl)
181 {
182 NDIS_STATUS status;
183 PNET_BUFFER curNb;
184 PMDL curMdl;
185 PUINT8 bufferStart;
186 EthHdr *ethHdr;
187 IPHdr *ipHdr;
188 UDPHdr *udpHdr;
189 VXLANHdr *vxlanHdr;
190 POVS_VXLAN_VPORT vportVxlan;
191 UINT32 headRoom = OvsGetVxlanTunHdrSize();
192 UINT32 packetLength;
193 ULONG mss = 0;
194 NDIS_TCP_IP_CHECKSUM_NET_BUFFER_LIST_INFO csumInfo;
195
196 curNb = NET_BUFFER_LIST_FIRST_NB(curNbl);
197 packetLength = NET_BUFFER_DATA_LENGTH(curNb);
198
199 if (layers->isTcp) {
200 mss = OVSGetTcpMSS(curNbl);
201
202 OVS_LOG_TRACE("MSS %u packet len %u", mss,
203 packetLength);
204 if (mss) {
205 OVS_LOG_TRACE("l4Offset %d", layers->l4Offset);
206 *newNbl = OvsTcpSegmentNBL(switchContext, curNbl, layers,
207 mss, headRoom, FALSE);
208 if (*newNbl == NULL) {
209 OVS_LOG_ERROR("Unable to segment NBL");
210 return NDIS_STATUS_FAILURE;
211 }
212 /* Clear out LSO flags after this point */
213 NET_BUFFER_LIST_INFO(*newNbl, TcpLargeSendNetBufferListInfo) = 0;
214 }
215 }
216
217 vportVxlan = (POVS_VXLAN_VPORT) GetOvsVportPriv(vport);
218 ASSERT(vportVxlan);
219
220 /* If we didn't split the packet above, make a copy now */
221 if (*newNbl == NULL) {
222 *newNbl = OvsPartialCopyNBL(switchContext, curNbl, 0, headRoom,
223 FALSE /*NBL info*/);
224 if (*newNbl == NULL) {
225 OVS_LOG_ERROR("Unable to copy NBL");
226 return NDIS_STATUS_FAILURE;
227 }
228 csumInfo.Value = NET_BUFFER_LIST_INFO(curNbl,
229 TcpIpChecksumNetBufferListInfo);
230 status = OvsApplySWChecksumOnNB(layers, *newNbl, &csumInfo);
231
232 if (status != NDIS_STATUS_SUCCESS) {
233 goto ret_error;
234 }
235 }
236
237 curNbl = *newNbl;
238 for (curNb = NET_BUFFER_LIST_FIRST_NB(curNbl); curNb != NULL;
239 curNb = curNb->Next) {
240 status = NdisRetreatNetBufferDataStart(curNb, headRoom, 0, NULL);
241 if (status != NDIS_STATUS_SUCCESS) {
242 goto ret_error;
243 }
244
245 curMdl = NET_BUFFER_CURRENT_MDL(curNb);
246 bufferStart = (PUINT8)OvsGetMdlWithLowPriority(curMdl);
247 if (!bufferStart) {
248 status = NDIS_STATUS_RESOURCES;
249 goto ret_error;
250 }
251
252 bufferStart += NET_BUFFER_CURRENT_MDL_OFFSET(curNb);
253 if (NET_BUFFER_NEXT_NB(curNb)) {
254 OVS_LOG_TRACE("nb length %u next %u",
255 NET_BUFFER_DATA_LENGTH(curNb),
256 NET_BUFFER_DATA_LENGTH(curNb->Next));
257 }
258
259 /* L2 header */
260 ethHdr = (EthHdr *)bufferStart;
261 NdisMoveMemory(ethHdr->Destination, fwdInfo->dstMacAddr,
262 sizeof ethHdr->Destination);
263 NdisMoveMemory(ethHdr->Source, fwdInfo->srcMacAddr,
264 sizeof ethHdr->Source);
265 ethHdr->Type = htons(ETH_TYPE_IPV4);
266
267 /* IP header */
268 ipHdr = (IPHdr *)((PCHAR)ethHdr + sizeof *ethHdr);
269
270 ipHdr->ihl = sizeof *ipHdr / 4;
271 ipHdr->version = IPPROTO_IPV4;
272 ipHdr->tos = tunKey->tos;
273 ipHdr->tot_len = htons(NET_BUFFER_DATA_LENGTH(curNb) - sizeof *ethHdr);
274 ipHdr->id = (uint16)atomic_add64(&vportVxlan->ipId,
275 NET_BUFFER_DATA_LENGTH(curNb));
276 ipHdr->frag_off = (tunKey->flags & OVS_TNL_F_DONT_FRAGMENT) ?
277 IP_DF_NBO : 0;
278 ipHdr->ttl = tunKey->ttl ? tunKey->ttl : VXLAN_DEFAULT_TTL;
279 ipHdr->protocol = IPPROTO_UDP;
280 ASSERT(tunKey->dst == fwdInfo->dstIpAddr);
281 ASSERT(tunKey->src == fwdInfo->srcIpAddr || tunKey->src == 0);
282 ipHdr->saddr = fwdInfo->srcIpAddr;
283 ipHdr->daddr = fwdInfo->dstIpAddr;
284
285 ipHdr->check = 0;
286
287 /* UDP header */
288 udpHdr = (UDPHdr *)((PCHAR)ipHdr + sizeof *ipHdr);
289 udpHdr->source = htons(tunKey->flow_hash | MAXINT16);
290 udpHdr->dest = tunKey->dst_port ? tunKey->dst_port :
291 htons(vportVxlan->dstPort);
292 udpHdr->len = htons(NET_BUFFER_DATA_LENGTH(curNb) - headRoom +
293 sizeof *udpHdr + sizeof *vxlanHdr);
294
295 if (tunKey->flags & OVS_TNL_F_CSUM) {
296 udpHdr->check = IPPseudoChecksum(&ipHdr->saddr, &ipHdr->daddr,
297 IPPROTO_UDP, ntohs(udpHdr->len));
298 } else {
299 udpHdr->check = 0;
300 }
301
302 /* VXLAN header */
303 vxlanHdr = (VXLANHdr *)((PCHAR)udpHdr + sizeof *udpHdr);
304 vxlanHdr->flags1 = 0;
305 vxlanHdr->locallyReplicate = 0;
306 vxlanHdr->flags2 = 0;
307 vxlanHdr->reserved1 = 0;
308 vxlanHdr->vxlanID = VXLAN_TUNNELID_TO_VNI(tunKey->tunnelId);
309 vxlanHdr->instanceID = 1;
310 vxlanHdr->reserved2 = 0;
311 }
312
313 csumInfo.Value = 0;
314 csumInfo.Transmit.IpHeaderChecksum = 1;
315 csumInfo.Transmit.IsIPv4 = 1;
316 if (tunKey->flags & OVS_TNL_F_CSUM) {
317 csumInfo.Transmit.UdpChecksum = 1;
318 }
319 NET_BUFFER_LIST_INFO(curNbl,
320 TcpIpChecksumNetBufferListInfo) = csumInfo.Value;
321
322
323 return STATUS_SUCCESS;
324
325 ret_error:
326 OvsCompleteNBL(switchContext, *newNbl, TRUE);
327 *newNbl = NULL;
328 return status;
329 }
330
331
332 /*
333 *----------------------------------------------------------------------------
334 * OvsEncapVxlan --
335 * Encapsulates the packet if L2/L3 for destination resolves. Otherwise,
336 * enqueues a callback that does encapsulation after resolution.
337 *----------------------------------------------------------------------------
338 */
339 NDIS_STATUS
340 OvsEncapVxlan(POVS_VPORT_ENTRY vport,
341 PNET_BUFFER_LIST curNbl,
342 OvsIPv4TunnelKey *tunKey,
343 POVS_SWITCH_CONTEXT switchContext,
344 POVS_PACKET_HDR_INFO layers,
345 PNET_BUFFER_LIST *newNbl,
346 POVS_FWD_INFO switchFwdInfo)
347 {
348 NTSTATUS status;
349 OVS_FWD_INFO fwdInfo;
350
351 status = OvsLookupIPFwdInfo(tunKey->src, tunKey->dst, &fwdInfo);
352 if (status != STATUS_SUCCESS) {
353 OvsFwdIPHelperRequest(NULL, 0, tunKey, NULL, NULL, NULL);
354 /*
355 * XXX: Don't know if the completionList will make any sense when
356 * accessed in the callback. Make sure the caveats are known.
357 *
358 * XXX: This code will work once we are able to grab locks in the
359 * callback.
360 */
361 return NDIS_STATUS_FAILURE;
362 }
363
364 RtlCopyMemory(switchFwdInfo->value, fwdInfo.value, sizeof fwdInfo.value);
365
366 return OvsDoEncapVxlan(vport, curNbl, tunKey, &fwdInfo, layers,
367 switchContext, newNbl);
368 }
369
370
371 /*
372 *----------------------------------------------------------------------------
373 * OvsDecapVxlan
374 * Decapsulates to tunnel header in 'curNbl' and puts into 'tunKey'.
375 *----------------------------------------------------------------------------
376 */
377 NDIS_STATUS
378 OvsDecapVxlan(POVS_SWITCH_CONTEXT switchContext,
379 PNET_BUFFER_LIST curNbl,
380 OvsIPv4TunnelKey *tunKey,
381 PNET_BUFFER_LIST *newNbl)
382 {
383 PNET_BUFFER curNb;
384 PMDL curMdl;
385 EthHdr *ethHdr;
386 IPHdr *ipHdr;
387 UDPHdr *udpHdr;
388 VXLANHdr *vxlanHdr;
389 UINT32 tunnelSize, packetLength;
390 PUINT8 bufferStart;
391 NDIS_STATUS status;
392 OVS_PACKET_HDR_INFO layers = { 0 };
393
394 status = OvsExtractLayers(curNbl, &layers);
395 if (status != NDIS_STATUS_SUCCESS) {
396 return status;
397 }
398
399 /* Check the length of the UDP payload */
400 curNb = NET_BUFFER_LIST_FIRST_NB(curNbl);
401 packetLength = NET_BUFFER_DATA_LENGTH(curNb);
402 tunnelSize = OvsGetVxlanTunHdrSizeFromLayers(&layers);
403 if (packetLength < tunnelSize) {
404 return NDIS_STATUS_INVALID_LENGTH;
405 }
406
407 /*
408 * Create a copy of the NBL so that we have all the headers in one MDL.
409 */
410 *newNbl = OvsPartialCopyNBL(switchContext, curNbl,
411 tunnelSize, 0,
412 TRUE /*copy NBL info */);
413
414 if (*newNbl == NULL) {
415 return NDIS_STATUS_RESOURCES;
416 }
417
418 /* XXX: Handle VLAN header. */
419 curNbl = *newNbl;
420 curNb = NET_BUFFER_LIST_FIRST_NB(curNbl);
421 curMdl = NET_BUFFER_CURRENT_MDL(curNb);
422 bufferStart = (PUINT8)OvsGetMdlWithLowPriority(curMdl)
423 + NET_BUFFER_CURRENT_MDL_OFFSET(curNb);
424 if (!bufferStart) {
425 status = NDIS_STATUS_RESOURCES;
426 goto dropNbl;
427 }
428
429 ethHdr = (EthHdr *)bufferStart;
430 /* XXX: Handle IP options. */
431 ipHdr = (IPHdr *)(bufferStart + layers.l3Offset);
432 tunKey->src = ipHdr->saddr;
433 tunKey->dst = ipHdr->daddr;
434 tunKey->tos = ipHdr->tos;
435 tunKey->ttl = ipHdr->ttl;
436 tunKey->pad = 0;
437 udpHdr = (UDPHdr *)(bufferStart + layers.l4Offset);
438
439 /* Validate if NIC has indicated checksum failure. */
440 status = OvsValidateUDPChecksum(curNbl, udpHdr->check == 0);
441 if (status != NDIS_STATUS_SUCCESS) {
442 goto dropNbl;
443 }
444
445 /* Calculate and verify UDP checksum if NIC didn't do it. */
446 if (udpHdr->check != 0) {
447 tunKey->flags |= OVS_TNL_F_CSUM;
448 status = OvsCalculateUDPChecksum(curNbl, curNb, ipHdr, udpHdr,
449 packetLength, &layers);
450 if (status != NDIS_STATUS_SUCCESS) {
451 goto dropNbl;
452 }
453 }
454
455 vxlanHdr = (VXLANHdr *)((PCHAR)udpHdr + sizeof *udpHdr);
456 if (vxlanHdr->instanceID) {
457 tunKey->flags |= OVS_TNL_F_KEY;
458 tunKey->tunnelId = VXLAN_VNI_TO_TUNNELID(vxlanHdr->vxlanID);
459 } else {
460 tunKey->flags &= ~OVS_TNL_F_KEY;
461 tunKey->tunnelId = 0;
462 }
463
464 /* Clear out the receive flag for the inner packet. */
465 NET_BUFFER_LIST_INFO(curNbl, TcpIpChecksumNetBufferListInfo) = 0;
466 NdisAdvanceNetBufferDataStart(curNb, tunnelSize, FALSE, NULL);
467 return NDIS_STATUS_SUCCESS;
468
469 dropNbl:
470 OvsCompleteNBL(switchContext, *newNbl, TRUE);
471 *newNbl = NULL;
472 return status;
473 }
474
475
476 NDIS_STATUS
477 OvsSlowPathDecapVxlan(const PNET_BUFFER_LIST packet,
478 OvsIPv4TunnelKey *tunnelKey)
479 {
480 NDIS_STATUS status = NDIS_STATUS_FAILURE;
481 UDPHdr udpStorage;
482 const UDPHdr *udp;
483 VXLANHdr *VxlanHeader;
484 VXLANHdr VxlanHeaderBuffer;
485 struct IPHdr ip_storage;
486 const struct IPHdr *nh;
487 OVS_PACKET_HDR_INFO layers;
488
489 layers.value = 0;
490
491 do {
492 nh = OvsGetIp(packet, layers.l3Offset, &ip_storage);
493 if (nh) {
494 layers.l4Offset = layers.l3Offset + nh->ihl * 4;
495 } else {
496 status = NDIS_STATUS_INVALID_PACKET;
497 break;
498 }
499
500 /* make sure it's a VXLAN packet */
501 udp = OvsGetUdp(packet, layers.l4Offset, &udpStorage);
502 if (udp) {
503 layers.l7Offset = layers.l4Offset + sizeof *udp;
504 if (udp->check != 0) {
505 tunnelKey->flags |= OVS_TNL_F_CSUM;
506 }
507 } else {
508 break;
509 }
510
511 VxlanHeader = (VXLANHdr *)OvsGetPacketBytes(packet,
512 sizeof(*VxlanHeader),
513 layers.l7Offset,
514 &VxlanHeaderBuffer);
515
516 if (VxlanHeader) {
517 tunnelKey->src = nh->saddr;
518 tunnelKey->dst = nh->daddr;
519 tunnelKey->ttl = nh->ttl;
520 tunnelKey->tos = nh->tos;
521 if (VxlanHeader->instanceID) {
522 tunnelKey->flags |= OVS_TNL_F_KEY;
523 tunnelKey->tunnelId = VXLAN_VNI_TO_TUNNELID(VxlanHeader->vxlanID);
524 } else {
525 tunnelKey->flags &= ~OVS_TNL_F_KEY;
526 tunnelKey->tunnelId = 0;
527 }
528 } else {
529 break;
530 }
531 status = NDIS_STATUS_SUCCESS;
532
533 } while(FALSE);
534
535 return status;
536 }
537
538 #pragma warning( pop )