/*
 * Copyright (c) 2017 VMware, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
17 #include "Conntrack.h"
19 #include "IpFragment.h"
22 #include "PacketParser.h"
#define OVS_DBG_MOD OVS_DBG_IPFRAG

/* Smallest IPv4 total length (header + payload, in bytes) accepted for a
 * fragment that still has the "more fragments" flag set. */
#define MIN_FRAGMENT_SIZE 400
/* Upper bound, in bytes, on the size of a reassembled IPv4 datagram. */
#define MAX_IPDATAGRAM_SIZE 65535
/* Based on MIN_FRAGMENT_SIZE:
 * ceil(MAX_IPDATAGRAM_SIZE / MIN_FRAGMENT_SIZE) = 164. */
#define MAX_FRAGMENTS 164
33 /* Function declarations */
34 static KSTART_ROUTINE OvsIpFragmentEntryCleaner
;
35 static VOID
OvsIpFragmentEntryDelete(POVS_IPFRAG_ENTRY entry
, BOOLEAN checkExpiry
);
37 /* Global and static variables */
38 static OVS_IPFRAG_THREAD_CTX ipFragThreadCtx
;
39 static PNDIS_RW_LOCK_EX ovsIpFragmentHashLockObj
;
40 static UINT64 ipTotalEntries
;
41 static PLIST_ENTRY OvsIpFragTable
;
44 OvsInitIpFragment(POVS_SWITCH_CONTEXT context
)
48 HANDLE threadHandle
= NULL
;
50 /* Init the sync-lock */
51 ovsIpFragmentHashLockObj
= NdisAllocateRWLock(context
->NdisFilterHandle
);
52 if (ovsIpFragmentHashLockObj
== NULL
) {
53 return STATUS_INSUFFICIENT_RESOURCES
;
56 /* Init the Hash Buffer */
57 OvsIpFragTable
= OvsAllocateMemoryWithTag(sizeof(LIST_ENTRY
)
58 * IP_FRAG_HASH_TABLE_SIZE
,
60 if (OvsIpFragTable
== NULL
) {
61 NdisFreeRWLock(ovsIpFragmentHashLockObj
);
62 ovsIpFragmentHashLockObj
= NULL
;
63 return STATUS_INSUFFICIENT_RESOURCES
;
66 for (int i
= 0; i
< IP_FRAG_HASH_TABLE_SIZE
; i
++) {
67 InitializeListHead(&OvsIpFragTable
[i
]);
70 /* Init Cleaner Thread */
71 KeInitializeEvent(&ipFragThreadCtx
.event
, NotificationEvent
, FALSE
);
72 status
= PsCreateSystemThread(&threadHandle
, SYNCHRONIZE
, NULL
, NULL
,
73 NULL
, OvsIpFragmentEntryCleaner
,
76 if (status
!= STATUS_SUCCESS
) {
77 OvsFreeMemoryWithTag(OvsIpFragTable
, OVS_IPFRAG_POOL_TAG
);
78 OvsIpFragTable
= NULL
;
79 NdisFreeRWLock(ovsIpFragmentHashLockObj
);
80 ovsIpFragmentHashLockObj
= NULL
;
84 ObReferenceObjectByHandle(threadHandle
, SYNCHRONIZE
, NULL
, KernelMode
,
85 &ipFragThreadCtx
.threadObject
, NULL
);
86 ZwClose(threadHandle
);
88 return STATUS_SUCCESS
;
91 static __inline UINT32
92 OvsGetIPFragmentHash(POVS_IPFRAG_KEY fragKey
)
95 arr
[0] = (UINT32
)fragKey
->protocol
;
96 arr
[1] = (UINT32
)fragKey
->id
;
97 arr
[2] = (UINT32
)fragKey
->sAddr
;
98 arr
[3] = (UINT32
)fragKey
->dAddr
;
99 arr
[4] = (UINT32
)((fragKey
->tunnelId
& 0xFFFFFFFF00000000LL
) >> 32);
100 arr
[5] = (UINT32
)(fragKey
->tunnelId
& 0xFFFFFFFFLL
);
101 return OvsJhashWords(arr
, 6, OVS_HASH_BASIS
);
104 static __inline POVS_IPFRAG_ENTRY
105 OvsLookupIPFrag(POVS_IPFRAG_KEY fragKey
, UINT32 hash
)
107 POVS_IPFRAG_ENTRY entry
;
109 LOCK_STATE_EX lockState
;
111 NdisAcquireRWLockRead(ovsIpFragmentHashLockObj
, &lockState
, 0);
112 LIST_FORALL(&OvsIpFragTable
[hash
& IP_FRAG_HASH_TABLE_MASK
], link
) {
113 entry
= CONTAINING_RECORD(link
, OVS_IPFRAG_ENTRY
, link
);
114 NdisAcquireSpinLock(&(entry
->lockObj
));
115 if (entry
->fragKey
.dAddr
== fragKey
->dAddr
&&
116 entry
->fragKey
.sAddr
== fragKey
->sAddr
&&
117 entry
->fragKey
.id
== fragKey
->id
&&
118 entry
->fragKey
.protocol
== fragKey
->protocol
&&
119 entry
->fragKey
.tunnelId
== fragKey
->tunnelId
) {
120 NdisReleaseSpinLock(&(entry
->lockObj
));
121 NdisReleaseRWLock(ovsIpFragmentHashLockObj
, &lockState
);
124 NdisReleaseSpinLock(&(entry
->lockObj
));
126 NdisReleaseRWLock(ovsIpFragmentHashLockObj
, &lockState
);
/*
 *----------------------------------------------------------------------------
 * OvsIpv4Reassemble
 *     Reassemble the IPv4 fragments and return newNbl on success.
 *     Should be called after acquiring the lockObj for the entry.
 *----------------------------------------------------------------------------
 */
/*
 * Builds one flat buffer containing the Ethernet header and IPv4 header of
 * *curNbl followed by the payload of every fragment stored in 'entry', then
 * wraps that buffer in a new NBL returned through *newNbl.
 *
 *   switchContext  - passed to OvsAllocateNBLFromBuffer() / OvsCompleteNBL().
 *   curNbl         - NBL of the fragment being processed; asserted to hold a
 *                    single NET_BUFFER.
 *   completionList - receives *curNbl when its buffer context has
 *                    OVS_BUFFER_NEED_COMPLETE set.
 *   sourcePort     - passed to OvsAddPktCompletionList().
 *   entry          - reassembly entry supplying totalLen, head and mru.
 *   newNbl         - out: the reassembled NBL.
 *
 * Status values visible below: NDIS_STATUS_SUCCESS (initial value),
 * NDIS_STATUS_INVALID_PACKET, NDIS_STATUS_INVALID_LENGTH,
 * NDIS_STATUS_RESOURCES and NDIS_STATUS_INVALID_DATA.
 *
 * NOTE(review): this listing is a lossy extraction. The return type, several
 * locals (curNb, eth, packetBuf, packetLen), the NULL tests, the fragment
 * loop header, goto/label lines, braces and some trailing call arguments are
 * not visible, and 'ðBuf' is presumably a mis-encoded '&ethBuf'. Restore
 * from the upstream file before compiling.
 */
138 OvsIpv4Reassemble(POVS_SWITCH_CONTEXT switchContext
,
139 PNET_BUFFER_LIST
*curNbl
,
140 OvsCompletionList
*completionList
,
141 NDIS_SWITCH_PORT_ID sourcePort
,
142 POVS_IPFRAG_ENTRY entry
,
143 PNET_BUFFER_LIST
*newNbl
)
145 NDIS_STATUS status
= NDIS_STATUS_SUCCESS
;
146 NDIS_STRING filterReason
;
147 POVS_BUFFER_CONTEXT ctx
;
150 IPHdr
*ipHdr
, *newIpHdr
;
/* Scratch storage handed to NdisGetDataBuffer() below.
 * NOTE(review): declared as an array of CHAR pointers, not CHARs -- larger
 * than needed; confirm the intended type. */
151 CHAR
*ethBuf
[sizeof(EthHdr
)];
153 UINT16 ipHdrLen
, packetHeader
;
154 POVS_FRAGMENT_LIST head
= NULL
;
157 curNb
= NET_BUFFER_LIST_FIRST_NB(*curNbl
);
158 ASSERT(NET_BUFFER_NEXT_NB(curNb
) == NULL
);
/* Get a pointer to ETH_HEADER_LENGTH bytes at the start of the packet. */
160 eth
= (EthHdr
*)NdisGetDataBuffer(curNb
, ETH_HEADER_LENGTH
,
161 (PVOID
)ðBuf
, 1, 0);
163 return NDIS_STATUS_INVALID_PACKET
;
/* The IPv4 header is read directly after the Ethernet header. */
165 ipHdr
= (IPHdr
*)((PCHAR
)eth
+ ETH_HEADER_LENGTH
);
167 return NDIS_STATUS_INVALID_PACKET
;
/* ihl counts 32-bit words. */
169 ipHdrLen
= ipHdr
->ihl
* 4;
/* Refuse to build a datagram larger than MAX_IPDATAGRAM_SIZE. */
170 if (ipHdrLen
+ entry
->totalLen
> MAX_IPDATAGRAM_SIZE
) {
171 return NDIS_STATUS_INVALID_LENGTH
;
173 packetLen
= ETH_HEADER_LENGTH
+ ipHdrLen
+ entry
->totalLen
;
174 packetBuf
= (CHAR
*)OvsAllocateMemoryWithTag(packetLen
,
175 OVS_IPFRAG_POOL_TAG
);
176 if (packetBuf
== NULL
) {
177 OVS_LOG_ERROR("Insufficient resources, failed to allocate packetBuf");
178 return NDIS_STATUS_RESOURCES
;
181 /* copy Ethernet header */
182 NdisMoveMemory(packetBuf
, eth
, ETH_HEADER_LENGTH
);
183 /* copy ipv4 header to packet buff */
184 NdisMoveMemory(packetBuf
+ ETH_HEADER_LENGTH
, ipHdr
, ipHdrLen
);
186 /* update new ip header */
/* The rebuilt datagram is no longer a fragment: frag_off is cleared, tot_len
 * covers everything after the Ethernet header, and the checksum is
 * recomputed over the copied header. */
187 newIpHdr
= (IPHdr
*)(packetBuf
+ ETH_HEADER_LENGTH
);
188 newIpHdr
->frag_off
= 0;
189 newIpHdr
->tot_len
= htons(packetLen
- ETH_HEADER_LENGTH
);
191 newIpHdr
->check
= IPChecksum((UINT8
*)packetBuf
+ ETH_HEADER_LENGTH
,
/* Byte offset in packetBuf at which fragment payloads start. */
193 packetHeader
= ETH_HEADER_LENGTH
+ ipHdrLen
;
/* Copy each stored fragment to its offset in the flat buffer.
 * NOTE(review): the visible bound check covers only the start offset, not
 * offset + head->len -- confirm the end of the copy is validated elsewhere. */
196 if ((UINT32
)(packetHeader
+ head
->offset
) > packetLen
) {
197 status
= NDIS_STATUS_INVALID_DATA
;
200 NdisMoveMemory(packetBuf
+ packetHeader
+ head
->offset
,
201 head
->pbuff
, head
->len
);
204 /* Create new nbl from the flat buffer */
205 *newNbl
= OvsAllocateNBLFromBuffer(switchContext
, packetBuf
, packetLen
);
206 if (*newNbl
== NULL
) {
207 OVS_LOG_ERROR("Insufficient resources, failed to allocate newNbl");
208 status
= NDIS_STATUS_RESOURCES
;
212 /* Complete the fragment NBL */
/* An NBL flagged OVS_BUFFER_NEED_COMPLETE is queued on the completion list;
 * OvsCompleteNBL() is presumably the alternative branch (the 'else' is not
 * visible here). */
213 ctx
= (POVS_BUFFER_CONTEXT
)NET_BUFFER_LIST_CONTEXT_DATA_START(*curNbl
);
214 if (ctx
->flags
& OVS_BUFFER_NEED_COMPLETE
) {
215 RtlInitUnicodeString(&filterReason
, L
"Complete last fragment");
216 OvsAddPktCompletionList(completionList
, TRUE
, sourcePort
, *curNbl
, 1,
219 OvsCompleteNBL(switchContext
, *curNbl
, TRUE
);
221 /* Store mru in the ovs buffer context. */
222 ctx
= (POVS_BUFFER_CONTEXT
)NET_BUFFER_LIST_CONTEXT_DATA_START(*newNbl
);
223 ctx
->mru
= entry
->mru
;
/* Release the flat buffer and flag the entry for removal from the table.
 * Presumably this is the shared cleanup tail for the error paths above as
 * well; the label is not visible here. */
226 OvsFreeMemoryWithTag(packetBuf
, OVS_IPFRAG_POOL_TAG
);
227 entry
->markedForDelete
= TRUE
;
/*
 *----------------------------------------------------------------------------
 * OvsProcessIpv4Fragment
 *     Reassemble the fragments once all the fragments are received and
 *     return NDIS_STATUS_PENDING for the pending fragments.
 *     XXX - Instead of copying NBLs, keep the NBLs in limbo state.
 *----------------------------------------------------------------------------
 */
/*
 * Handles one IPv4 fragment carried by *curNbl. The fragment payload is
 * copied into a newly allocated OVS_FRAGMENT_LIST node, which either seeds a
 * new reassembly entry or is linked into an existing one found by
 * (protocol, id, sAddr, dAddr, tunnelId). When the entry's recvdLen equals
 * its totalLen, OvsIpv4Reassemble() is called.
 *
 *   switchContext, completionList, sourcePort, newNbl
 *                  - forwarded to OvsIpv4Reassemble().
 *   curNbl         - fragment NBL; asserted to hold a single NET_BUFFER.
 *
 * Status values visible below: NDIS_STATUS_PENDING (initial value and the
 * return after creating a new entry), NDIS_STATUS_INVALID_PACKET,
 * NDIS_STATUS_INVALID_LENGTH, NDIS_STATUS_RESOURCES, or the result of
 * OvsIpv4Reassemble().
 *
 * NOTE(review): this listing is a lossy extraction. The return type, the
 * 'tunnelId' parameter read below, several locals (curNb, eth, ipHdr, hash,
 * currentTime), most conditionals, braces, labels and some trailing call
 * arguments are not visible; 'ðBuf' and '¤tTime' are presumably mis-encoded
 * '&ethBuf' and '&currentTime'. Restore from the upstream file before
 * compiling.
 */
239 OvsProcessIpv4Fragment(POVS_SWITCH_CONTEXT switchContext
,
240 PNET_BUFFER_LIST
*curNbl
,
241 OvsCompletionList
*completionList
,
242 NDIS_SWITCH_PORT_ID sourcePort
,
244 PNET_BUFFER_LIST
*newNbl
)
246 NDIS_STATUS status
= NDIS_STATUS_PENDING
;
/* Scratch storage handed to NdisGetDataBuffer() below.
 * NOTE(review): array of CHAR pointers rather than CHARs -- confirm type. */
248 CHAR
*ethBuf
[sizeof(EthHdr
)];
249 UINT16 offset
, flags
;
250 UINT16 payloadLen
, ipHdrLen
;
255 OVS_IPFRAG_KEY fragKey
;
256 POVS_IPFRAG_ENTRY entry
;
257 POVS_FRAGMENT_LIST fragStorage
;
258 LOCK_STATE_EX htLockState
;
260 curNb
= NET_BUFFER_LIST_FIRST_NB(*curNbl
);
261 ASSERT(NET_BUFFER_NEXT_NB(curNb
) == NULL
);
263 eth
= (EthHdr
*)NdisGetDataBuffer(curNb
, ETH_HEADER_LENGTH
,
264 (PVOID
)ðBuf
, 1, 0);
266 return NDIS_STATUS_INVALID_PACKET
;
/* The IPv4 header is read directly after the Ethernet header. */
269 ipHdr
= (IPHdr
*)((PCHAR
)eth
+ ETH_HEADER_LENGTH
);
271 return NDIS_STATUS_INVALID_PACKET
;
/* ihl counts 32-bit words.
 * NOTE(review): no visible check that tot_len >= ipHdrLen; payloadLen is a
 * UINT16, so a shorter tot_len would wrap -- confirm validation upstream. */
273 ipHdrLen
= ipHdr
->ihl
* 4;
274 payloadLen
= ntohs(ipHdr
->tot_len
) - ipHdrLen
;
/* Fragment-offset bits of frag_off. */
275 offset
= ntohs(ipHdr
->frag_off
) & IP_OFFSET
;
/* Non-zero when the IP_MF bit is set in frag_off. */
277 flags
= ntohs(ipHdr
->frag_off
) & IP_MF
;
278 /* Only the last fragment can be of smaller size.*/
279 if (flags
&& ntohs(ipHdr
->tot_len
) < MIN_FRAGMENT_SIZE
) {
280 return NDIS_STATUS_INVALID_LENGTH
;
282 /*Copy fragment specific fields. */
283 fragKey
.protocol
= ipHdr
->protocol
;
284 fragKey
.id
= ipHdr
->id
;
285 fragKey
.sAddr
= ipHdr
->saddr
;
286 fragKey
.dAddr
= ipHdr
->daddr
;
287 fragKey
.tunnelId
= tunnelId
;
/* Clear the three padding bytes so the key copied into a new entry below is
 * fully initialised. */
289 NdisZeroMemory(&fragKey
.pad_1
, 3);
/* Allocate the list node, then the buffer that will hold this fragment's
 * payload; the node is freed again if the second allocation fails. */
292 fragStorage
= (POVS_FRAGMENT_LIST
)
293 OvsAllocateMemoryWithTag(sizeof(OVS_FRAGMENT_LIST
),
294 OVS_IPFRAG_POOL_TAG
);
295 if (fragStorage
== NULL
) {
296 OVS_LOG_ERROR("Insufficient resources, fail to allocate fragStorage");
297 return NDIS_STATUS_RESOURCES
;
300 fragStorage
->pbuff
= (CHAR
*)OvsAllocateMemoryWithTag(payloadLen
,
301 OVS_IPFRAG_POOL_TAG
);
302 if (fragStorage
->pbuff
== NULL
) {
303 OVS_LOG_ERROR("Insufficient resources, fail to allocate pbuff");
304 OvsFreeMemoryWithTag(fragStorage
, OVS_IPFRAG_POOL_TAG
);
305 return NDIS_STATUS_RESOURCES
;
308 /* Copy payload from nbl to fragment storage. */
309 if (OvsGetPacketBytes(*curNbl
, payloadLen
, ETH_HEADER_LENGTH
+ ipHdrLen
,
310 fragStorage
->pbuff
) == NULL
) {
311 status
= NDIS_STATUS_RESOURCES
;
312 goto payload_copy_error
;
314 fragStorage
->len
= payloadLen
;
315 fragStorage
->offset
= offset
;
316 fragStorage
->next
= NULL
;
/* Look for an existing reassembly entry with this key. */
317 hash
= OvsGetIPFragmentHash(&fragKey
);
318 entry
= OvsLookupIPFrag(&fragKey
, hash
);
/* Presumably taken only when the lookup found nothing (the test is not
 * visible here): create an entry seeded with this fragment. */
320 entry
= (POVS_IPFRAG_ENTRY
)
321 OvsAllocateMemoryWithTag(sizeof(OVS_IPFRAG_ENTRY
),
322 OVS_IPFRAG_POOL_TAG
);
324 status
= NDIS_STATUS_RESOURCES
;
325 goto payload_copy_error
;
327 /* Copy the fragment key. */
328 NdisZeroMemory(entry
, sizeof(OVS_IPFRAG_ENTRY
));
329 NdisMoveMemory(&(entry
->fragKey
), &fragKey
,
330 sizeof(OVS_IPFRAG_KEY
));
/* The MRU starts as the size of this frame: Ethernet + IP header + payload. */
332 entry
->mru
= ETH_HEADER_LENGTH
+ ipHdrLen
+ payloadLen
;
333 entry
->recvdLen
+= fragStorage
->len
;
334 entry
->head
= entry
->tail
= fragStorage
;
335 entry
->numFragments
= 1;
/* NOTE(review): the condition guarding this assignment is not visible here;
 * presumably it applies only to the final fragment. */
337 entry
->totalLen
= offset
+ payloadLen
;
/* The entry expires IPFRAG_ENTRY_TIMEOUT after the current system time. */
339 NdisGetCurrentSystemTime((LARGE_INTEGER
*)¤tTime
);
340 entry
->expiration
= currentTime
+ IPFRAG_ENTRY_TIMEOUT
;
342 /* Init the sync-lock. */
343 NdisAllocateSpinLock(&(entry
->lockObj
));
/* Insert the new entry into its bucket under the table write lock. */
344 NdisAcquireRWLockWrite(ovsIpFragmentHashLockObj
, &htLockState
, 0);
345 InsertHeadList(&OvsIpFragTable
[hash
& IP_FRAG_HASH_TABLE_MASK
],
349 NdisReleaseRWLock(ovsIpFragmentHashLockObj
, &htLockState
);
350 return NDIS_STATUS_PENDING
;
352 /* Acquire the entry lock. */
353 NdisAcquireSpinLock(&(entry
->lockObj
));
354 NdisGetCurrentSystemTime((LARGE_INTEGER
*)¤tTime
);
/* An expired entry, or one already holding MAX_FRAGMENTS fragments, is
 * flagged for deletion. */
355 if (currentTime
> entry
->expiration
|| entry
->numFragments
== MAX_FRAGMENTS
) {
356 /* Mark the entry for delete. */
357 entry
->markedForDelete
= TRUE
;
/* Cursors used to find where the new node goes in the fragment list. The
 * first test covers a fragment that starts after the current tail. */
360 POVS_FRAGMENT_LIST next
= entry
->head
;
361 POVS_FRAGMENT_LIST prev
= entry
->tail
;
362 if (prev
!= NULL
&& prev
->offset
< offset
) {
/* Otherwise walk from the head to the first node with a larger offset. */
367 for (next
= entry
->head
; next
!= NULL
; next
= next
->next
) {
368 if (next
->offset
> fragStorage
->offset
) {
374 /*Check for overlap. */
376 /* i bytes overlap. */
377 int i
= (prev
->offset
+ prev
->len
) - fragStorage
->offset
;
383 /* i bytes overlap. */
384 int i
= (fragStorage
->offset
+ fragStorage
->len
) - next
->offset
;
/* Add the fragment length only if the sum is larger than the current
 * recvdLen, i.e. the addition did not wrap. */
390 if (entry
->recvdLen
+ fragStorage
->len
> entry
->recvdLen
) {
391 entry
->recvdLen
+= fragStorage
->len
;
393 /* Overflow, ignore the fragment.*/
/* Link the node between prev and next, updating head/tail as needed. */
399 prev
->next
= fragStorage
;
400 fragStorage
->next
= next
;
402 fragStorage
->next
= next
;
403 entry
->head
= fragStorage
;
406 entry
->tail
= fragStorage
;
409 /* Update Maximum Received Unit */
410 entry
->mru
= entry
->mru
> (ETH_HEADER_LENGTH
+ ipHdrLen
+ payloadLen
) ?
411 entry
->mru
: (ETH_HEADER_LENGTH
+ ipHdrLen
+ payloadLen
);
412 entry
->numFragments
++;
/* NOTE(review): as in the new-entry path, the guarding condition is not
 * visible here. */
414 entry
->totalLen
= offset
+ payloadLen
;
/* All bytes of the datagram have arrived: reassemble. */
416 if (entry
->recvdLen
== entry
->totalLen
) {
417 status
= OvsIpv4Reassemble(switchContext
, curNbl
, completionList
,
418 sourcePort
, entry
, newNbl
);
420 NdisReleaseSpinLock(&(entry
->lockObj
));
/* Error tail: report an invalid packet, drop the entry lock, then free the
 * fragment node and its payload buffer. The labels separating these steps
 * (including 'payload_copy_error') are not visible here. */
424 status
= NDIS_STATUS_INVALID_PACKET
;
425 /* Release the entry lock. */
426 NdisReleaseSpinLock(&(entry
->lockObj
));
428 OvsFreeMemoryWithTag(fragStorage
->pbuff
, OVS_IPFRAG_POOL_TAG
);
429 OvsFreeMemoryWithTag(fragStorage
, OVS_IPFRAG_POOL_TAG
);
/*
 *----------------------------------------------------------------------------
 * OvsIpFragmentEntryCleaner
 *     Runs periodically and cleans up the IP fragment table.
 *     Interval is selected as twice the entry timeout.
 *----------------------------------------------------------------------------
 */
/*
 * Thread routine of the cleaner. 'data' is cast to POVS_IPFRAG_THREAD_CTX
 * and supplies the event the routine waits on between sweeps.
 *
 * Each sweep takes the table write lock, passes every entry of every bucket
 * to OvsIpFragmentEntryDelete() with checkExpiry = TRUE, releases the lock,
 * and waits on context->event with a timeout of the current system time
 * plus IPFRAG_CLEANUP_INTERVAL. The routine ends by calling
 * PsTerminateSystemThread(STATUS_SUCCESS).
 *
 * NOTE(review): this listing is a lossy extraction. The return type, the
 * loop header, the exit test (presumably on the context's exit flag), the
 * 'break' statements and the braces are not visible, and '¤tTime' is
 * presumably a mis-encoded '&currentTime'. Restore from upstream before
 * compiling.
 */
442 OvsIpFragmentEntryCleaner(PVOID data
)
445 POVS_IPFRAG_THREAD_CTX context
= (POVS_IPFRAG_THREAD_CTX
)data
;
446 PLIST_ENTRY link
, next
;
447 POVS_IPFRAG_ENTRY entry
;
448 LOCK_STATE_EX lockState
;
449 BOOLEAN success
= TRUE
;
452 if (ovsIpFragmentHashLockObj
== NULL
) {
453 /* Lock has been freed by 'OvsCleanupIpFragment()' */
456 NdisAcquireRWLockWrite(ovsIpFragmentHashLockObj
, &lockState
, 0);
/* Early release of the write lock; the condition that leads here is not
 * visible in this listing. */
458 NdisReleaseRWLock(ovsIpFragmentHashLockObj
, &lockState
);
462 /* Set the timeout for the thread and cleanup. */
463 UINT64 currentTime
, threadSleepTimeout
;
464 NdisGetCurrentSystemTime((LARGE_INTEGER
*)¤tTime
);
465 threadSleepTimeout
= currentTime
+ IPFRAG_CLEANUP_INTERVAL
;
/* The sweep stops early once ipTotalEntries reads zero. */
466 for (int i
= 0; i
< IP_FRAG_HASH_TABLE_SIZE
&& ipTotalEntries
; i
++) {
467 LIST_FORALL_SAFE(&OvsIpFragTable
[i
], link
, next
) {
468 entry
= CONTAINING_RECORD(link
, OVS_IPFRAG_ENTRY
, link
);
469 OvsIpFragmentEntryDelete(entry
, TRUE
);
473 NdisReleaseRWLock(ovsIpFragmentHashLockObj
, &lockState
);
/* Sleep until the event is signalled or the timeout computed above is
 * reached. */
474 KeWaitForSingleObject(&context
->event
, Executive
, KernelMode
,
475 FALSE
, (LARGE_INTEGER
*)&threadSleepTimeout
);
478 PsTerminateSystemThread(STATUS_SUCCESS
);
/*
 * Removes 'entry' from its hash bucket and frees it together with its
 * fragment nodes.
 *
 *   entry       - entry to delete; its spin lock is taken here.
 *   checkExpiry - when TRUE and the entry is not markedForDelete, the
 *                 entry's expiration is compared with the current system
 *                 time and the spin lock is released if it is still in the
 *                 future (presumably followed by an early return; that line
 *                 is not visible here).
 *
 * Both visible callers invoke this while holding the table write lock.
 *
 * NOTE(review): this listing is a lossy extraction. The return type, the
 * 'currentTime' local, the loop over the fragment list, at least one
 * statement between RemoveEntryList() and the lock release, and the braces
 * are not visible; '¤tTime' is presumably a mis-encoded '&currentTime'.
 * Restore from upstream before compiling.
 */
482 OvsIpFragmentEntryDelete(POVS_IPFRAG_ENTRY entry
, BOOLEAN checkExpiry
)
484 NdisAcquireSpinLock(&(entry
->lockObj
));
485 if (!entry
->markedForDelete
&& checkExpiry
) {
487 NdisGetCurrentSystemTime((LARGE_INTEGER
*)¤tTime
);
488 if (entry
->expiration
> currentTime
) {
489 NdisReleaseSpinLock(&(entry
->lockObj
));
/* Free each fragment node and its payload buffer; presumably 'temp' walks
 * the list starting at 'head' (the loop itself is not visible here). */
494 POVS_FRAGMENT_LIST head
= entry
->head
;
495 POVS_FRAGMENT_LIST temp
= NULL
;
499 OvsFreeMemoryWithTag(temp
->pbuff
, OVS_IPFRAG_POOL_TAG
);
500 OvsFreeMemoryWithTag(temp
, OVS_IPFRAG_POOL_TAG
);
/* Unlink from the bucket, then drop and free the entry's spin lock before
 * freeing the entry itself. */
502 RemoveEntryList(&entry
->link
);
504 NdisReleaseSpinLock(&(entry
->lockObj
));
505 NdisFreeSpinLock(&(entry
->lockObj
));
506 OvsFreeMemoryWithTag(entry
, OVS_IPFRAG_POOL_TAG
);
510 OvsCleanupIpFragment(VOID
)
512 PLIST_ENTRY link
, next
;
513 POVS_IPFRAG_ENTRY entry
;
514 LOCK_STATE_EX lockState
;
516 ipFragThreadCtx
.exit
= 1;
517 KeSetEvent(&ipFragThreadCtx
.event
, 0, FALSE
);
518 KeWaitForSingleObject(ipFragThreadCtx
.threadObject
, Executive
,
519 KernelMode
, FALSE
, NULL
);
520 ObDereferenceObject(ipFragThreadCtx
.threadObject
);
521 NdisAcquireRWLockWrite(ovsIpFragmentHashLockObj
, &lockState
, 0);
522 if (OvsIpFragTable
) {
523 for (int i
= 0; i
< IP_FRAG_HASH_TABLE_SIZE
&& ipTotalEntries
; i
++) {
524 LIST_FORALL_SAFE(&OvsIpFragTable
[i
], link
, next
) {
525 entry
= CONTAINING_RECORD(link
, OVS_IPFRAG_ENTRY
, link
);
526 OvsIpFragmentEntryDelete(entry
, FALSE
);
529 OvsFreeMemoryWithTag(OvsIpFragTable
, OVS_IPFRAG_POOL_TAG
);
530 OvsIpFragTable
= NULL
;
532 NdisReleaseRWLock(ovsIpFragmentHashLockObj
, &lockState
);
533 NdisFreeRWLock(ovsIpFragmentHashLockObj
);
534 ovsIpFragmentHashLockObj
= NULL
;